Python 3.x: concatenate zipped CSV files into one non-zipped CSV file

Here is my Python 3 code:
import zipfile
import os
import time
from timeit import default_timer as timer
import re
import glob
import pandas as pd

# local variables
# pc version
# the_dir = r'c:\ImpExpData'
# linux version
the_dir = '/home/ralph/Documents/lulumcusb/ImpExpData/Exports/92-95'

def main():
    """
    this is the function that controls the processing
    """
    start_time = timer()
    for root, dirs, files in os.walk(the_dir):
        for file in files:
            if file.endswith(".zip"):
                print("working dir is ...", the_dir)
                zipPath = os.path.join(root, file)
                z = zipfile.ZipFile(zipPath, "r")
                for filename in z.namelist():
                    if filename.endswith(".csv"):
                        # print filename
                        if re.match(r'^Trade-Geo.*\.csv$', filename):
                            pass  # do something with geo file
                            # print " Geo data: " , filename
                        elif re.match(r'^Trade-Metadata.*\.csv$', filename):
                            pass  # do something with metadata file
                            # print "Metadata: ", filename
                        else:
                            try:
                                with zipfile.ZipFile(zipPath) as z:
                                    with z.open(filename) as f:
                                        # print("send to test def...", filename)
                                        # print(zipPath)
                                        frame = pd.DataFrame()
                                        # EmptyDataError: No columns to parse from file -- how to deal with this error
                                        train_df = pd.read_csv(f, index_col=None, header=0, skiprows=1, encoding="cp1252")
                                        # train_df = pd.read_csv(f, header=0, skiprows=1, delimiter=",", encoding="cp1252")
                                        list_ = []
                                        list_.append(train_df)
                                        # print(list_)
                                        frame = pd.concat(list_, ignore_index=True)
                                        frame.to_csv('/home/ralph/Documents/lulumcusb/ImpExpData/Exports/concat_test.csv', encoding='cp1252')  # works
                            except:  # catches EmptyDataError: No columns to parse from file
                                print("EmptyDataError....", filename, "...", zipPath)
    # GetSubDirList(the_dir)
    end_time = timer()
    print("Elapsed time was %g seconds" % (end_time - start_time))

if __name__ == '__main__':
    main()
It mostly works, except that it does not concatenate all the zipped CSV files into one. One of the files is empty, and all the CSV files share the same field structure; they vary only in the number of rows.
Here is what Spyder reports when I run it:
runfile('/home/ralph/Documents/lulumcusb/Sep15_cocncatCSV.py', wdir='/home/ralph/Documents/lulumcusb')
working dir is ... /home/ralph/Documents/lulumcusb/ImpExpData/Exports/92-95
EmptyDataError.... Trade-Exports-Chp-77.csv ... /home/ralph/Documents/lulumcusb/ImpExpData/Exports/92-95/Trade-Exports-Yr1992-1995.zip
/home/ralph/anaconda3/lib/python3.6/site-packages/spyder/utils/site/sitecustomize.py:688: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
execfile(filename, namespace)
Elapsed time was 104.857 seconds
The final CSV file is just the last zipped CSV file processed; the output file changes in size as the script processes each file.
There are 99 CSV files in the zip archive that I wish to concatenate into one non-zipped CSV file.
The field or column names are:
colmNames = ["hs_code", "uom", "country", "state", "prov", "value", "quatity", "year", "month"]
The CSV files are labelled chp01.csv, chp02.csv, and so on up to chp99.csv, with the "uom" (unit of measure) field being either empty, an integer, or a string, depending on the hs_code.
Question: how do I get the zipped CSV files concatenated into one large (estimated 100 MB uncompressed) CSV file?
Added details:
I am trying not to unzip the CSV files, because I would then have to go and delete them afterwards. I need to concatenate the files because I have additional processing to do. Extracting the zipped CSV files is a viable option; I was just hoping to avoid it.

Is there any reason you don't want to do this with your shell?
Assuming the order in which you concatenate is irrelevant:
cd "/home/ralph/Documents/lulumcusb/ImpExpData/Exports/92-95"
unzip "Trade-Exports-Yr1992-1995.zip" -d unzipped && cd unzipped
for f in Trade-Exports-Chp*.csv; do tail --lines=+2 "$f" >> concat.csv; done
This removes the first line (column names) from each csv file before appending to concat.csv.
If you just did:
tail --lines=+2 "Trade-Exports-Chp*.csv" > concat.csv
You'd end up with:
==> Trade-Exports-Chp-1.csv <==
...
==> Trade-Exports-Chp-10.csv <==
...
==> Trade-Exports-Chp-2.csv <==
...
etc.
If you care about the order, change Trade-Exports-Chp-1.csv .. Trade-Exports-Chp-9.csv to Trade-Exports-Chp-01.csv .. Trade-Exports-Chp-09.csv.
Although it's doable in Python, I don't think it's the right tool for the job in this case.
If you want to do the job in place without actually extracting the zip file:
for i in {1..99}; do
    unzip -p "Trade-Exports-Yr1992-1995.zip" "Trade-Exports-Chp-$i.csv" | tail --lines=+2 >> concat.csv
done
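If you would rather stay in Python after all, here is a minimal sketch of the same idea: read every chapter CSV straight out of the archive, collect the frames in a list, and concatenate and write once after the loop instead of overwriting the output on every pass. The paths and the cp1252 encoding are taken from the question; the read_csv options and the 'Chp' filename filter are assumptions you may need to adjust.
import zipfile
import pandas as pd

zip_path = '/home/ralph/Documents/lulumcusb/ImpExpData/Exports/92-95/Trade-Exports-Yr1992-1995.zip'
out_path = '/home/ralph/Documents/lulumcusb/ImpExpData/Exports/concat_test.csv'

frames = []
with zipfile.ZipFile(zip_path) as z:
    for name in z.namelist():
        # keep only the chapter files, skip the Geo and Metadata ones
        if name.endswith('.csv') and 'Chp' in name:
            with z.open(name) as f:
                try:
                    frames.append(pd.read_csv(f, header=0, encoding='cp1252'))
                except pd.errors.EmptyDataError:
                    # the archive reportedly contains one empty file; skip it
                    print('skipping empty file:', name)

# one concat and one write, after the loop, so nothing gets overwritten
pd.concat(frames, ignore_index=True).to_csv(out_path, index=False, encoding='cp1252')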

Related

How to convert 50000 txt files into a csv

I have many text files. I tried to convert the txt files into a single CSV file, but it is taking a huge amount of time. I started the script at night before sleeping; by morning it had processed only 4500 files and was still running.
Is there a faster way to convert the text files into a CSV?
Here is my code:
import pandas as pd
import os
import glob
from tqdm import tqdm

# create empty dataframe
csvout = pd.DataFrame(columns=["ID","Delivery_person_ID" ,"Delivery_person_Age" ,"Delivery_person_Ratings","Restaurant_latitude","Restaurant_longitude","Delivery_location_latitude","Delivery_location_longitude","Order_Date","Time_Orderd","Time_Order_picked","Weather conditions","Road_traffic_density","Vehicle_condition","Type_of_order","Type_of_vehicle", "multiple_deliveries","Festival","City","Time_taken (min)"])

# get list of files
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

for filename in tqdm(file_list):
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in tqdm(datafile):
            # Note: partition result in 3 string parts, "key", " ", "value"
            # array slice third parameter [::2] means steps=+2
            # so only take 1st and 3rd item
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)

# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
Here is my example text file.
ID 0xb379
Delivery_person_ID BANGRES18DEL02
Delivery_person_Age 34.000000
Delivery_person_Ratings 4.500000
Restaurant_latitude 12.913041
Restaurant_longitude 77.683237
Delivery_location_latitude 13.043041
Delivery_location_longitude 77.813237
Order_Date 25-03-2022
Time_Orderd 19:45
Time_Order_picked 19:50
Weather conditions Stormy
Road_traffic_density Jam
Vehicle_condition 2
Type_of_order Snack
Type_of_vehicle scooter
multiple_deliveries 1.000000
Festival No
City Metropolitian
Time_taken (min) 33.000000
CSV is a very simple data format that you don't need any sophisticated tools to handle: just text and separators.
In your (hopefully simple) case there is no need to use pandas or dictionaries.
The exception would be if your data files are corrupt, missing some columns or containing additional ones to skip. But even in that case you can handle such issues better within your own code, where you have more control, and still get results within seconds.
Assuming your data files are not corrupt and have all columns in the right order, with none missing and no extra ones (so you can rely on their proper formatting), just try this code:
from time import perf_counter as T
import glob, os

sT = T()
filesProcessed = 0
columns = ["ID","Delivery_person_ID" ,"Delivery_person_Age" ,"Delivery_person_Ratings","Restaurant_latitude","Restaurant_longitude","Delivery_location_latitude","Delivery_location_longitude","Order_Date","Time_Orderd","Time_Order_picked","Weather conditions","Road_traffic_density","Vehicle_condition","Type_of_order","Type_of_vehicle", "multiple_deliveries","Festival","City","Time_taken (min)"]
file_list = glob.glob(os.path.join(os.getcwd(), "train/", "*.txt"))

csv_lines = []
csv_line_counter = 0
for filename in file_list:
    filesProcessed += 1
    with open(filename) as datafile:
        csv_line = ""
        for line in datafile.read().splitlines():
            # print(line)
            var = line.partition(" ")[-1]
            csv_line += var.strip() + ';'
        csv_lines.append(str(csv_line_counter) + ';' + csv_line[:-1])
        csv_line_counter += 1

with open("train.csv", "w") as csvfile:
    csvfile.write(';' + ';'.join(columns) + '\n')
    csvfile.write('\n'.join(csv_lines))

eT = T()
print(f'> {filesProcessed=}, {(eT-sT)=:8.6f}')
I guess you will get the result at a speed beyond your expectations (seconds, not minutes or hours).
On my computer, extrapolating from the processing time for 100 files, the time required for 50,000 files should be about 3 seconds.
I could not replicate this. I took the example data file and created 5000 copies of it. Then I ran your code with tqdm and without. The code below is the version without:
import time
import csv
import os
import glob
import pandas as pd
from tqdm import tqdm

csvout = pd.DataFrame(columns=["ID","Delivery_person_ID" ,"Delivery_person_Age" ,"Delivery_person_Ratings","Restaurant_latitude","Restaurant_longitude","Delivery_location_latitude","Delivery_location_longitude","Order_Date","Time_Orderd","Time_Order_picked","Weather conditions","Road_traffic_density","Vehicle_condition","Type_of_order","Type_of_vehicle", "multiple_deliveries","Festival","City","Time_taken (min)"])
file_list = glob.glob(os.path.join(os.getcwd(), "sample_files/", "*.txt"))

t1 = time.time()
for filename in file_list:
    # next file/record
    mydict = {}
    with open(filename) as datafile:
        # read each line and split on " " space
        for line in datafile:
            # Note: partition result in 3 string parts, "key", " ", "value"
            # array slice third parameter [::2] means steps=+2
            # so only take 1st and 3rd item
            name, var = line.partition(" ")[::2]
            mydict[name.strip()] = var.strip()
    # put dictionary in dataframe
    csvout = csvout.append(mydict, ignore_index=True)
# write to csv
csvout.to_csv("train.csv", sep=";", index=False)
t2 = time.time()
print(t2 - t1)
The times I got were:
tqdm: 33 seconds
no tqdm: 34 seconds
Then I ran using the csv module:
t1 = time.time()
with open('output.csv', 'a', newline='') as csv_file:
    columns = ["ID","Delivery_person_ID" ,"Delivery_person_Age" ,"Delivery_person_Ratings","Restaurant_latitude","Restaurant_longitude","Delivery_location_latitude","Delivery_location_longitude","Order_Date","Time_Orderd","Time_Order_picked","Weather conditions","Road_traffic_density","Vehicle_condition","Type_of_order","Type_of_vehicle", "multiple_deliveries","Festival","City","Time_taken (min)"]
    mydict = {}
    d_Writer = csv.DictWriter(csv_file, fieldnames=columns, delimiter=',')
    d_Writer.writeheader()
    for filename in file_list:
        with open(filename) as datafile:
            for line in datafile:
                name, var = line.partition(" ")[::2]
                mydict[name.strip()] = var.strip()
        d_Writer.writerow(mydict)
t2 = time.time()
print(t2 - t1)
The time for this was:
csv 0.32231569290161133 seconds.
Try it like this.
import glob

with open('my_file.csv', 'a') as csv_file:
    for path in glob.glob('./*.txt'):
        with open(path) as txt_file:
            txt = txt_file.read() + '\n'
            csv_file.write(txt)

Attempt to populate list with multiple entries in a for loop only returns a single entry

I have a CSV file with URLs I'd like to extract data from, but my script currently only appends the last entry. This is the script:
import os
import glob
import time
from urllib.request import urlopen
import pandas as pd
import xml.etree.ElementTree as ET

count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb2.csv', recursive=True)  # searches all files in folder
print(files)

for file in files:
    if count == 0:
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        for row in csvfile['URL']:
            print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                    "zs": "http://www.loc.gov/zing/srw/",
                    "": "http://www.loc.gov/MARC21/slim",
                }
                datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
                datafield_attribute_filters = [  # which fields to extract
                    {
                        "tag": "100",   # author
                        "ind1": "1",
                        "ind2": " ",
                    }]
                # datafield_attribute_filters = []  # Decomment this line to clear filters (and process each datafield node)
                aut = []
                for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                    if datafield_attribute_filters:
                        skip_node = True
                        for attr_dict in datafield_attribute_filters:
                            for k, v in attr_dict.items():
                                if datafield_node.get(k) != v:
                                    break
                            else:
                                skip_node = False
                                break
                        if skip_node:
                            continue
                    for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                        aut.append(subfield_node.text)  # this gets the author name and title
        print(aut)
    count += 1
and this is the csv file:
URL
0 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783960382850&recordSchema=MARC21-xml
1 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783963622106&recordSchema=MARC21-xml
2 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D-&recordSchema=MARC21-xml
3 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783806241280&recordSchema=MARC21-xml
4 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783890296005&recordSchema=MARC21-xml
5 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110699111&recordSchema=MARC21-xml
6 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110698930&recordSchema=MARC21-xml
7 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110699104&recordSchema=MARC21-xml
8 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783963621093&recordSchema=MARC21-xml
9 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783451716034&recordSchema=MARC21-xml
10 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9788791953514&recordSchema=MARC21-xml
When I execute the script, the output is:
['Schmidt, Horst']
but I need the other results as well. How can I do that?
Any help is appreciated.
EDIT: link to the full CSV file on Pastebin; the filename is Reihe-21A51.csv_extract.csv_isbn-dnb2.csv
As @Tranbi pointed out, I had to move the aut = [] outside of the loop.
It's now:
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        aut = []
instead of:
aut = []
for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
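Put together, a rough sketch of the corrected structure looks like this (the URL fetching and MARC21 parsing inside the row loop stay exactly as in the question and are elided here):
import glob
import pandas as pd

files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb2.csv', recursive=True)

for file in files[:1]:                  # only the first file, as in the question
    csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
    aut = []                            # initialise once, before the URL loop
    for row in csvfile['URL']:
        # ... fetch the URL and walk the XML exactly as in the question,
        #     appending each matching subfield's text to aut ...
        pass
    print(aut)                          # now holds the results from every row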

Running a Python script for files in a folder

There are 15 text files in a folder and I am trying to extract certain parts of each file and output them to a new file.
I am able to process each file individually by just changing the file name and appending its output to the output file, but this means copying the same code 15 times and changing the file name each time.
import glob, os

lst = []
filelist = glob.glob('/C:/Users/bridaly/Documents/PythonTest/Python_Test_ENdata_3080_v20150914/input/*')

for file in filelist:
    if os.path.isfile(file):
        for line in filelist:
            line = line.strip()
            if not (
                line.startswith("APPEND") or line.startswith("_") or
                line.startswith("SAP") or line.startswith("~") or
                line.startswith("INCLUDE") or line.startswith("ABAP")
                or line.strip() == "" or line.startswith("Field") or
                line.startswith("Short")
            ):
                y = line.replace(' ', ' ')
                # print(y)
                z = y.replace('X', '')
                # print(z)
                w = "|".join(z.split())
                # print(w)
                x = w.split("|", 3)[:4]
                # print(x)
                x.insert(0, './input/01BKPF')
                # print(x)
                if len(x) >= 4:
                    t = [s.replace('|', ' ') for s in x]
                    # print(t)
                    print("|".join(t))
                    lst.append("|".join(t))

# Output Script
output_file = open('Output_Final.txt', 'w')
for l in lst:
    output_file.write(l)
    output_file.write('\n')
output_file.close()
"""
The output should contain what the code extracts, for each file, appended to the output file. I have gotten the correct output by copying the code 15 times, but I just want to write it once, as that is more efficient.
You can iterate over each file:
files = glob.glob('path')
for file in files:
    file_name = os.path.basename(file)
    print(file_name)
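As a rough sketch of how that loop fits together with the filtering from the question (the line-level processing is shortened to a placeholder here; note that the inner loop should read lines from the opened file, not from filelist):
import glob, os

lst = []
filelist = glob.glob('C:/Users/bridaly/Documents/PythonTest/Python_Test_ENdata_3080_v20150914/input/*')

for file in filelist:
    if os.path.isfile(file):
        with open(file) as fh:
            for line in fh:          # read the lines of this file, not the file list
                line = line.strip()
                # ... apply the same startswith() filtering and the
                #     replace/split/join processing from the question,
                #     then lst.append("|".join(t)) ...

# write everything once, after all files have been processed
with open('Output_Final.txt', 'w') as output_file:
    for l in lst:
        output_file.write(l + '\n')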

How to open and append nested zip archives into dataframe without extracting?

I am trying to open a large number of CSV files found inside several layers of zip files. Given the nature of this project, I am trying to open each one, read_csv it into a dataframe, append that data to an aggregate dataframe, and then continue through the loop.
Example: Folder Directory/First Zip/Second Zip/Third Zip/csv file.csv
My existing code can loop through the contents of the second and third zip files and get the name of each CSV file. I am aware that this code could probably be made simpler by importing glob, but I'm unfamiliar with it.
import os
import pandas as pd
import zipfile, re, io

directory = 'C:/Test/'
os.chdir(directory)
fname = "test" + ".zip"

with zipfile.ZipFile(fname, 'r') as zfile:
    # second level of zip files
    for zipname in zfile.namelist():
        if re.search(r'\.zip$', zipname) != None:
            zfiledata = io.BytesIO(zfile.read(zipname))
            # third level of zip files
            with zipfile.ZipFile(zfiledata) as zfile2:
                for zipname2 in zfile2.namelist():
                    # this zipfile contains xml and csv contents. This filters out the xmls
                    if zipname2.find("csv") > 0:
                        zfiledata2 = io.BytesIO(zfile2.read(zipname2))
                        with zipfile.ZipFile(zfiledata2) as zfile3:
                            fullpath = directory + fname + "/" + zipname + "/" + zipname2 + "/"
                            # csv file names are always the same as their zips. this cleans the string.
                            csvf = zipname2.replace('_csv.zip', ".csv")
                            filehandle = open(fullpath, 'rb')
                            # the above statement is erroring: FileNotFoundError: [Errno 2] No such file or directory
                            zfilehandle = zipfile.ZipFile(filehandle)
                            data = []
                            csvdata = StringIO.StringIO(zfilehandle.read(csvf))
                            df = pd.read_csv(csvdata)
                            data.append(df)
                            print(data.head())
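For what it's worth, fullpath here is a pseudo-path inside the archives rather than a real file on disk (it even ends with a slash), so open(fullpath, 'rb') cannot succeed. Since zfile2 already holds the second-level zip in memory, a sketch of reading the innermost CSV without touching the filesystem could look like this (reusing the names from the code above; this snippet is meant to slot into the inner loop, not to run on its own):
# inside the "for zipname2 in zfile2.namelist():" loop from the question
import io
import zipfile
import pandas as pd

data = []
zfiledata2 = io.BytesIO(zfile2.read(zipname2))      # third-level zip, kept in memory
with zipfile.ZipFile(zfiledata2) as zfile3:
    csvf = zipname2.replace('_csv.zip', '.csv')     # csv member name matches its zip
    with zfile3.open(csvf) as csv_member:
        df = pd.read_csv(csv_member)
        data.append(df)
        print(df.head())                            # head() is a DataFrame method, so call it on df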

Make new txt file with size info of output and input files

The first part of the code below works fine, but in the second part I am trying to create a new txt file with information about the files that are created in the first part. For example, this txt file should contain: INPUT FILE1 SIZE IS 42, OUTPUT FILE1 SIZE IS 324, then for the second file: INPUT FILE2 SIZE IS 62, OUTPUT FILE2 SIZE IS 543, and so on.
import pandas as pd
import glob
import os

files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    df1 = df.iloc[:, :4].agg(['sum', 'max', 'std'])
    df1.columns = range(1, len(df1.columns) + 1)
    s = df1.stack()
    L = ['{} of the {}. column is {}'.format(a, b, c) for (a, b), c in s.items()]
    output_file_name = "output_" + file
    pd.Series(L).to_csv(output_file_name, index=False)  # this part is good

for file in files:
    with open(file + "stats.txt", 'a+') as f:
        f.write(' input file size is {}'.format(os.path.getsize(file)))
        f.write('output file size is {}'.format(os.path.getsize(output_file_name)))
        f.close()
Use:
import glob, os
import pandas as pd

files = glob.glob('*.csv')
# loop over all files
for file in files:
    L = []
    # skip files whose names start with output_ or file_size_
    if not file.startswith(('output_', 'file_size_')):
        output_file_name = "output_" + file
        # build both messages
        infile = 'SIZE OF INPUT FILE {} IS {}, '.format(file, os.path.getsize(file))
        outfile = 'SIZE OF OUTPUT FILE {} IS {}'.format(output_file_name,
                                                        os.path.getsize(output_file_name))
        # join together and append to list
        L.append(infile + outfile)
        # create Series and write to file
        pd.Series(L).to_csv('file_size_{}'.format(file), index=False)
