csv_reader read N lines at a time - python-3.x

I have to read a CSV file N lines at a time.
# Iterate the CSV one parsed row (a list of strings) at a time.
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
    # print is a function in Python 3; the statement form is a SyntaxError.
    print(row)
I know I can loop N times, build a list of lists, and process the rows that way.
But is there a simpler way of using csv_reader so that I read N lines at a time?

I don't think you'll be able to do that without a loop when using the csv package.
You should use pandas (pip install --user pandas) instead:
import pandas

# Load the whole file, then walk it in consecutive windows of `step` rows.
df = pandas.read_csv('myfile.csv')
step = 2  # Your 'N'
for i in range(0, len(df), step):
    # Slicing past the end is safe: the last window is simply shorter.
    print(df[i:i+step])

Pandas has a chunksize option to their read_csv() method and I would probably explore that option.
If I was going to do it myself by hand, I would probably do something like:
import csv
import itertools


def process_batch(rows):
    """Handle one batch of parsed CSV rows (here: just print it)."""
    print(rows)


def get_batch(reader, batch_size):
    """Return the next ``batch_size`` rows from ``reader`` as a list.

    The list is shorter than ``batch_size`` when the reader runs out, and
    empty once it is exhausted.  Rows are kept even when they are empty
    lists (``csv.reader`` produces ``[]`` for a blank line), so a blank
    line can neither vanish from a batch nor make a batch look like the
    end of the file.

    Raises:
        ValueError: if ``batch_size`` is negative (from ``itertools.islice``).
    """
    return list(itertools.islice(reader, batch_size))


if __name__ == "__main__":
    # newline="" is what the csv module expects so that it can handle
    # newlines embedded in quoted fields itself.
    with open("data.csv", "r", newline="") as file_in:
        reader = csv.reader(file_in)
        # An empty batch means the reader is exhausted.
        while batch := get_batch(reader, 5):
            process_batch(batch)

Related

How to convert the 50000 txt file into csv

I have many text files. I tried to convert them into a single CSV file, but it is taking a huge amount of time. I started the code at night and went to sleep; by morning it had processed only 4500 files and was still running.
Is there a faster way to convert the text files into CSV?
Here is my code:
import pandas as pd
import os
import glob
from tqdm import tqdm
# create empty dataframe
csvout = pd.DataFrame(columns =["ID","Delivery_person_ID" ,"Delivery_person_Age" ,"Delivery_person_Ratings","Restaurant_latitude","Restaurant_longitude","Delivery_location_latitude","Delivery_location_longitude","Order_Date","Time_Orderd","Time_Order_picked","Weather conditions","Road_traffic_density","Vehicle_condition","Type_of_order","Type_of_vehicle", "multiple_deliveries","Festival","City","Time_taken (min)"])
# get list of files
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))
for filename in tqdm(file_list):
# next file/record
mydict = {}
with open(filename) as datafile:
# read each line and split on " " space
for line in tqdm(datafile):
# Note: partition result in 3 string parts, "key", " ", "value"
# array slice third parameter [::2] means steps=+2
# so only take 1st and 3rd item
name, var = line.partition(" ")[::2]
mydict[name.strip()] = var.strip()
# put dictionary in dataframe
csvout = csvout.append(mydict, ignore_index=True)
# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
Here is my example text file.
ID 0xb379
Delivery_person_ID BANGRES18DEL02
Delivery_person_Age 34.000000
Delivery_person_Ratings 4.500000
Restaurant_latitude 12.913041
Restaurant_longitude 77.683237
Delivery_location_latitude 13.043041
Delivery_location_longitude 77.813237
Order_Date 25-03-2022
Time_Orderd 19:45
Time_Order_picked 19:50
Weather conditions Stormy
Road_traffic_density Jam
Vehicle_condition 2
Type_of_order Snack
Type_of_vehicle scooter
multiple_deliveries 1.000000
Festival No
City Metropolitian
Time_taken (min) 33.000000
CSV is a very simple data format for which you don't need any sophisticated tools to handle. Just text and separators.
In your hopefully simple case there is no need to use pandas and dictionaries.
The exception is when your data files are corrupt, i.e. missing some columns or containing additional columns that must be skipped. But even in this case you can handle such issues better within your own code: you have more control over it and can still get results within seconds.
Assuming your datafiles are not corrupt having all columns in the right order with no missing columns or having additional ones (so you can rely on their proper formatting), just try this code:
import glob
import os
from time import perf_counter as T

columns = [
    "ID", "Delivery_person_ID", "Delivery_person_Age",
    "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude",
    "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date",
    "Time_Orderd", "Time_Order_picked", "Weather conditions",
    "Road_traffic_density", "Vehicle_condition", "Type_of_order",
    "Type_of_vehicle", "multiple_deliveries", "Festival", "City",
    "Time_taken (min)",
]

# Longest names first, so a column name is never mistaken for a shorter one.
_columns_longest_first = sorted(columns, key=len, reverse=True)


def split_value(line):
    """Return the value part of one ``<column name> <value>`` line.

    Some column names contain a space ("Weather conditions",
    "Time_taken (min)"), so cutting at the first space would leave the tail
    of the name in the value.  The line is therefore matched against the
    known column names first; only a line with an unknown name falls back
    to splitting at the first space.
    """
    for column in _columns_longest_first:
        if line == column or line.startswith(column + " "):
            return line[len(column):].strip()
    return line.partition(" ")[-1].strip()


def record_to_csv_line(text):
    """Join the values of one record file (its full text) with ';'."""
    return ';'.join(split_value(line) for line in text.splitlines())


if __name__ == "__main__":
    sT = T()
    file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))
    csv_lines = []
    # The running number becomes the first, unnamed column of each row.
    for csv_line_counter, filename in enumerate(file_list):
        with open(filename) as datafile:
            csv_lines.append(
                str(csv_line_counter) + ';' + record_to_csv_line(datafile.read())
            )
    filesProcessed = len(file_list)
    with open("train.csv", "w") as csvfile:
        # Leading ';' leaves the header cell above the running number empty.
        csvfile.write(';'+';'.join(columns)+'\n')
        csvfile.write('\n'.join(csv_lines))
    eT = T()
    print(f'> {filesProcessed=}, {(eT-sT)=:8.6f}')
I guess you will get the result in a speed beyond your expectations (in seconds, not minutes or hours)
On my computer, estimating from processing time of 100 files the time required for 50.000 files will be about 3 seconds.
I could not replicate. I took the example data file and created 5000 copies of it. Then I ran your code using tqdm and without. The below shows without:
import time
import csv
import os
import glob
import pandas as pd
from tqdm import tqdm
csvout = pd.DataFrame(columns =["ID","Delivery_person_ID" ,"Delivery_person_Age" ,"Delivery_person_Ratings","Restaurant_latitude","Restaurant_longitude","Delivery_location_latitude","Delivery_location_longitude","Order_Date","Time_Orderd","Time_Order_picked","Weather conditions","Road_traffic_density","Vehicle_condition","Type_of_order","Type_of_vehicle", "multiple_deliveries","Festival","City","Time_taken (min)"])
file_list = glob.glob(os.path.join(os.getcwd(), "sample_files/", "*.txt"))
t1 = time.time()
for filename in file_list:
# next file/record
mydict = {}
with open(filename) as datafile:
# read each line and split on " " space
for line in datafile:
# Note: partition result in 3 string parts, "key", " ", "value"
# array slice third parameter [::2] means steps=+2
# so only take 1st and 3rd item
name, var = line.partition(" ")[::2]
mydict[name.strip()] = var.strip()
# put dictionary in dataframe
csvout = csvout.append(mydict, ignore_index=True)
# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
t2 = time.time()
print(t2-t1)
The times I got were:
tqdm 33 seconds
no tqdm 34 seconds
Then I ran using the csv module:
import csv
import time

columns = [
    "ID", "Delivery_person_ID", "Delivery_person_Age",
    "Delivery_person_Ratings", "Restaurant_latitude", "Restaurant_longitude",
    "Delivery_location_latitude", "Delivery_location_longitude", "Order_Date",
    "Time_Orderd", "Time_Order_picked", "Weather conditions",
    "Road_traffic_density", "Vehicle_condition", "Type_of_order",
    "Type_of_vehicle", "multiple_deliveries", "Festival", "City",
    "Time_taken (min)",
]

# Longest names first, so a column name is never mistaken for a shorter one.
_columns_longest_first = sorted(columns, key=len, reverse=True)


def parse_record(lines):
    """Build one ``{column: value}`` row from the lines of a record file.

    A fresh dict is returned for every record, so a value can never leak
    from one file into the next.  Blank lines are skipped.  Lines are matched
    against the known column names because some names contain a space
    ("Weather conditions", "Time_taken (min)"); splitting at the first space
    would produce the key "Weather", which ``csv.DictWriter`` rejects.
    A line with an unknown name is still split at the first space, and the
    writer then raises ``ValueError`` for it instead of dropping it silently.
    """
    record = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue
        for column in _columns_longest_first:
            if line == column or line.startswith(column + " "):
                record[column] = line[len(column):].strip()
                break
        else:
            name, var = line.partition(" ")[::2]
            record[name.strip()] = var.strip()
    return record


if __name__ == "__main__":
    # file_list is the list of input paths built earlier in the same script.
    t1 = time.time()
    # 'w' rather than 'a': a rerun must not add a second header and
    # duplicate rows to a file left over from the previous run.
    with open('output.csv', 'w', newline='') as csv_file:
        d_Writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=',')
        d_Writer.writeheader()
        for filename in file_list:
            with open(filename) as datafile:
                d_Writer.writerow(parse_record(datafile))
    t2 = time.time()
    print(t2-t1)
The time for this was:
csv 0.32231569290161133 seconds.
Try it like this.
import glob

# Append the raw text of every .txt file in the current directory to
# my_file.csv, each followed by a newline.
with open('my_file.csv', 'a') as csv_file:
    for path in glob.glob('./*.txt'):
        with open(path) as txt_file:
            csv_file.write(txt_file.read() + '\n')

How convert epoch time to string?

I have a 10000×3 data array that I need to save as a CSV file. Below is my pseudocode, but I don't know how to achieve it using only numpy (not pandas). Does anyone know how to do this?
import numpy as np
import time
arr = np.random.randn(10000, 3)
# That need arr[0,0] = time.time(), arr[i, 0] = time.time() + i
arr[:,0] = time.time() + 1
# And after that, I need to column 1 to datatime string(like "2021-02-12 12:12:12")
arr[:,0] need to apply `time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(item))` this function.
# After that, the data type is, column1:str, column2:float, column3:float and to save it as a csv file.
This can be done with import csv (pre-installed with python default packages). See this tutorial for examples: https://www.pythontutorial.net/python-basics/python-write-csv-file/
import csv

# `header` and `arr` are the column names and the data rows prepared earlier.
# newline='' leaves line endings to the csv writer; without it every row is
# followed by an extra blank line on Windows.
with open('countries.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # write the header
    writer.writerow(header)
    # write the data
    writer.writerows(arr)

Python: Read large CSV in chunk

Requirement: Read large CSV file (>1million rows) in chunk
Issue: Sometimes the generator yields the same set of rows twice even though the file has unique rows. But some runs it looks fine with no duplicates
Looks like I am missing something in the code, I am not able to figure out
Want to make sure it doesn't yield the same object over and over with different contents
Code:
def gen_chunks(self, reader, chunksize=100000):
    """Yield the rows of ``reader`` in lists of at most ``chunksize`` tuples.

    Every yielded chunk is a newly built list, so an earlier chunk is never
    modified by later iteration.  Only the last chunk can be shorter than
    ``chunksize``.  Nothing is yielded for an empty reader, so the caller
    never receives an empty chunk.

    Args:
        reader: any iterable of rows (for example a ``csv.reader``).
        chunksize: maximum number of rows per chunk; must be at least 1.

    Raises:
        ValueError: if ``chunksize`` is less than 1 (raised when iteration
            starts, because this is a generator).
    """
    from itertools import islice  # local import keeps the block self-contained

    if chunksize < 1:
        raise ValueError("chunksize must be a positive integer")
    rows = iter(reader)
    while True:
        chunk = [tuple(line) for line in islice(rows, chunksize)]
        if not chunk:
            return
        yield chunk
def execute(self, context):
    """Download the S3 object to a temporary file and bulk-insert it in chunks.

    ``context`` is accepted but not used in this method.  ``s3_client``,
    ``orcl`` and ``logger`` are not defined in this snippet and must be
    provided by the surrounding module.
    """
    with tempfile.NamedTemporaryFile() as download_target:
        local_path = download_target.name
        s3_client.download_file(self.s3_bucket, self.s3_key, local_path)
        # The temporary file is re-opened by name in text mode for csv.reader.
        with open(local_path, 'r') as text_handle:
            pipe_reader = csv.reader(text_handle, delimiter='|')
            for chunk in self.gen_chunks(pipe_reader):
                logger.info('starting in chunk process')
                orcl.bulk_insert_rows(
                    table=self.oracle_table,
                    rows=chunk,
                    target_fields=self.target_fields,
                    commit_every=10000,
                )
I don't know whether pandas is an option for you; if it is, this could be your answer.
I find pandas faster when working with millions of records in a csv,
here is some code that will help you
import pandas as pd

# With chunksize set, read_csv returns an iterator of DataFrames of up to
# 100000 rows each instead of one DataFrame for the whole file.
frames = pd.read_csv(f_source.name, delimiter="|", chunksize=100000)
for frame in frames:
    for record in frame.values:
        print(record)
pandas provides a lot of options with read_csv :
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

Can't run while loop through csv files using python

I'm having some trouble looping through the code below. Essentially, I am attempting to loop through a folder of files ("filename_x.csv") and remove all commas within the fields, and then I would like the output to be saved with the "out" suffix. The code works like a charm if I set the counter by hand, but it will not move on to the next count. I know this is very close to working, but I suspect the 'with' statements close the files, so it can't move to the next count or iteration (the file path is just an example). Please help!
import pandas as pd
n = 1
x = str(n)
while n < 9:
import csv
with open("C:\\Desktop\\server_"+x+".csv",'r', newline='') as infile, open("C:\\Desktop\\server_"+x+"_out.csv",'w', newline='') as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
for row in reader:
writer.writerow(item.replace(",", "") for item in row)
n += 1
The line:
x = str(n)
is in the wrong place it needs to be in the loop at the top.
You only set x to the initial value of n, and then it stays that way.
You do not actually even need it. Just put the str(n) directly in the string creation like this:
"C:\\Desktop\\server_" + str(n) + ".csv"
I would also recommend you use a different loop structure. The while format you are using does not make sense for the way you are using it. You should be using a for loop, like this:
import pandas as pd
import csv

# Copy server_1.csv .. server_8.csv to server_<n>_out.csv with every comma
# removed from inside the fields.
for n in range(1, 9):
    source_path = "C:\\Desktop\\server_" + str(n) + ".csv"
    target_path = "C:\\Desktop\\server_" + str(n) + "_out.csv"
    with open(source_path, 'r', newline='') as infile, open(target_path, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(infile):
            writer.writerow([item.replace(",", "") for item in row])
This fixes a few things. It simplifies the code, and it also corrects some other issues. You had the increment of n in the wrong place, which the for loop takes care of. You were also doing the import csv inside the loop. That would not break anything, because Python caches a module after its first import, but the statement is still re-executed on every pass through the outer loop, and imports conventionally belong at the top of the file.
Assuming you have files server_1.csv, server_2.csv ...
Two things:
you don't need to keep track of both x and n
you need to increment n in the right place. You are currently incrementing it once per row.
import pandas as pd
import csv  # <<< no need to repeat the import statement though it's harmless

n = 1
while n < 9:
    # Both paths share the same stem; only the ending differs.
    stem = "C:\\Desktop\\server_" + str(n)
    with open(stem + ".csv", 'r', newline='') as infile, open(stem + "_out.csv", 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(infile):
            writer.writerow([item.replace(",", "") for item in row])
    # finished with this file, increment n
    n += 1

How to read first 1000 entries in a csv file

I have a csv file containing 60,000 entries. I read them and store in a nested list like this:
entries = []
with open('mnist_train.csv', 'r') as f:
mycsv = csv.reader(f)
for row in mycsv:
entries.append(row)
Instead of reading all 60,000 how would I read only the first thousand entries?
I tried this without success:
entries = []
with open('mnist_train.csv', 'r') as f:
mycsv = csv.reader(f)
for row in mycsv[:1000]:
entries.append(row)
As you've discovered a csv.reader does not support slicing. You can use itertools.islice() to accomplish this with objects that are iterable. E.g.,
import csv
import itertools

# newline='' lets the csv module handle newlines inside quoted fields itself.
with open('mnist_train.csv', 'r', newline='') as f:
    mycsv = csv.reader(f)
    # islice stops after 1000 rows (or sooner if the file is shorter) and
    # never reads the rest of the file.
    entries = list(itertools.islice(mycsv, 1000))
You can use the pandas library-
import pandas as pd
data = pd.read_csv('path/to/your/file.csv',nrows=1000)
data_list = data.values.tolist() #creates a list of the first 1000 rows (excludes header)

Resources