Can't open HDF5 file bigger than memory... ValueError - python-3.x

I have many .csv files of NYC taxi data from nyc.gov; one .csv = one year-month. I grab about 15 of these CSVs and make HDF5 files from them:
import h5py
import pandas as pd
import os
import glob
import numpy as np
import vaex
from tqdm import tqdm_notebook as tqdm

# Collect the monthly CSV exports, sorted newest-first, and keep a slice.
csv_list = np.sort(np.array(glob.glob('G:\\NYCTaxi\\*.csv')))[::-1]
csv_list = csv_list[20:39]
output_dir = 'c:\\Datasety\\YelowTaxi\\DataH5\\'

for file in tqdm(csv_list, leave=False, desc='Converting to hdf5...'):
    # Derive the output path: same basename, '.csv' suffix -> '.hdf5'.
    output_file = file.split('\\')[-1][:-3] + 'hdf5'
    output = output_dir + output_file
    # Check if a converted file already exists: if it does skip it,
    # otherwise read in the raw csv and convert it.
    if os.path.exists(output) and os.path.isfile(output):
        pass
    else:
        # Importing the data into pandas. low_memory=False reads the whole
        # file at once so pandas infers one consistent dtype per column.
        pandas_df = pd.read_csv(file, index_col=None, header=0, low_memory=False)
        # Importing the data from pandas to vaex.
        vaex_df = vaex.from_pandas(pandas_df, copy_index=False)
        # Export the data with vaex to hdf5.
        vaex_df.export_hdf5(path=output, progress=False)
Next I make one big HDF5:
import re
import glob
import vaex
import numpy as np
def tryint(s):
    """Return int(s) when s is a run of digits, otherwise s unchanged.

    Helper for building natural ("alphanumeric") sort keys.
    """
    try:
        return int(s)
    except ValueError:
        # Not a number: keep the chunk as a plain string.
        # (Narrowed from a bare except: int() on a str raises only ValueError.)
        return s
def alphanum_key(s):
    """Turn a string into a list of string and number chunks.

    "z23a" -> ["z", 23, "a"]
    """
    def _coerce(chunk):
        # Digit runs become ints so they compare numerically.
        try:
            return int(chunk)
        except ValueError:
            return chunk

    return [_coerce(part) for part in re.split('([0-9]+)', s)]
# Natural-sort the per-month HDF5 files so the months stay in order.
hdf5_list = glob.glob('c:\\Datasety\\YelowTaxi\\DataH5\\*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)
#assert len(hdf5_list) == 3, "Incorrect number of files"
# This is an important step:
# open_many memory-maps every file and presents them as one concatenated
# vaex dataframe without loading everything into RAM.
master_df = vaex.open_many(hdf5_list)
# exporting the combined dataframe as a single HDF5 file
#master_df.export_hdf5(path='c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hd5', progress=True)
master_df.export_hdf5(path='c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5', progress=True)
So far, everything is ok, I can open output file Spojene.hdf5.
Next, I append new .csv to Spojene.hdf5:
# Append each CSV to the combined store via pandas' HDFStore layout.
# NOTE(review): pandas to_hdf writes a PyTables file that vaex cannot open
# (see the answer below) -- this append step is what breaks Spojene.hdf5.
filename = 'c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5'
for file in csv_list:
    df2 = pd.read_csv(file, index_col=None, header=0, low_memory=False)
    df2.to_hdf(filename, 'data', append=True)
But when I append a new .csv to Spojene.hdf5, I can't open it:
df = vaex.open('c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5')
ValueError: First columns has length 289184484, while column table has length 60107988
Please, what can I do?

I think this is linked to how pandas is creating hdf5 files. According to vaex's documentation you can't open a HDF5 file with vaex if it has been created via to_hdf pandas method. I assume it is the same if you append to an existing HDF5 file.
To avoid this error you can reuse your logic where you convert the pandas dataframe to a vaex dataframe, export it to HDF5 and then use open_many. Something like this should work:
main_hdf5_file_path = "c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5"

# First convert every new CSV into its own vaex-native HDF5 file.
hdf5_files_created = []
for file in csv_list:
    hdf5_file = file.replace(".csv", ".hdf5")
    # from_csv can take additional parameters to forward to pd.read_csv.
    # You can also use convert=True to convert it automatically to hdf5
    # without the export_hdf5 call.
    # Refer to https://vaex.readthedocs.io/en/docs/api.html#vaex.from_csv
    df = vaex.from_csv(file)
    df.export_hdf5(hdf5_file)
    hdf5_files_created.append(hdf5_file)

# Then open the new files together with the existing master file and
# re-export the whole thing as one HDF5 file.
hdf5_to_read = hdf5_files_created + [main_hdf5_file_path]
final_df = vaex.open_many(hdf5_to_read)
final_df.export_hdf5(main_hdf5_file_path)

Related

Combining CSV files using Pandas is appending additional columns rather than right below?

I'm not exactly sure what is the best way to describe this problem, but the photos below should be pretty clear.
First photo is the current output and the second photo is the desired output.
Here is the code I'm using to combine these files:
import os
import glob
import pandas as pd
os.chdir("mydir")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# combine all files in the list
# NOTE(review): axis=1 concatenates side-by-side (new columns aligned on
# the row index) -- that is exactly why extra columns appear in the first
# photo. To stack the files below one another, use the default axis=0.
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ], axis=1)
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
I've also used this with no luck:
import pandas as pd
import glob
import os
# merging the files
joined_files = os.path.join("mydir", "clean_csv*.csv")
# A list of all joined files is returned
joined_list = glob.glob(joined_files)
# Finally, the files are joined row-wise (axis=0), which matches the
# desired output.
# NOTE(review): header=None in to_csv suppresses the header row in the
# output file -- confirm that is actually intended.
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
df.to_csv('output-test.csv', index=True, encoding='utf-8-sig', header=None)

Pandas Add filename to a Excel combined file

I have a code that combines multiple excel files in 1 file, but I need to add a column with the name of the file used (filename).
Here is the code:
import os
import pandas as pd

cwd = os.path.abspath('')
files = os.listdir(cwd)

## Code gets the first sheet of a given file.
# DataFrame.append was deprecated and removed in pandas 2.0, and appending
# inside a loop re-copies the frame each pass; collect the sheets in a
# list and concatenate once instead.
frames = [pd.read_excel(f) for f in files if f.endswith('.xlsx')]
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.head()
df.to_excel('Combined.xlsx')
How do I do to add a column with the filename for each file used?
Thanks
Just add `d["filename"] = file` when you load each Excel file in the for loop:
import os
import pandas as pd

cwd = os.path.abspath('')
files = os.listdir(cwd)

## Code gets the first sheet of a given file.
frames = []
for file in files:
    if file.endswith('.xlsx'):
        d = pd.read_excel(file)
        # Tag every row with the workbook it came from.
        d["filename"] = file
        frames.append(d)
# DataFrame.append was removed in pandas 2.0; concatenate the list once.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.head()
df.to_excel('Combined.xlsx')
Try this one.
import os
import pandas as pd

cwd = os.path.abspath('')
files = os.listdir(cwd)

## Code gets the first sheet of a given file.
df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        # Insert a one-cell marker row holding the filename, then the data.
        # (pd.concat replaces DataFrame.append, removed in pandas 2.0.)
        df = pd.concat([df, pd.DataFrame([file])])  # Here is the code to ADD filename
        df = pd.concat([df, pd.read_excel(file)], ignore_index=True)
df.head()
df.to_excel('Combined.xlsx')
Create a dict to collect your dataframes then combine them before exporting (and use pathlib instead of os module):
import pathlib
import pandas as pd

# One dataframe per workbook, keyed by its filename; pd.concat turns the
# dict keys into the outer level of the combined row index.
data = {f.name: pd.read_excel(f) for f in pathlib.Path().glob('*.xlsx')}
pd.concat(data).to_excel('Combined.xlsx')
Note: if you want to get the filename without extension, use file.stem rather than file.name.
Update
What about when the excel files to combine have more than 1 Sheet?
import pathlib
import pandas as pd

data = []
names = []
for file in pathlib.Path().glob('?.xlsx'):
    # sheet_name=None loads every sheet of the workbook as a {name: frame} dict.
    sheets = pd.read_excel(file, sheet_name=None)
    for frame in sheets.values():
        names.append(file.name)
        data.append(frame)
# keys= labels each block of rows with the workbook it came from.
pd.concat(data, keys=names).to_excel('Combined.xlsx')

Import dataset from url and convert text to csv in python3

I am pretty new to Python (using Python3) and read Pandas to import dataset.
I need to import dataset from url - https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/leukemia_remission/index.txt
and convert it to csv file, I am getting some special character in converted csv -> ��
I am downloading the txt file and converting it to csv — is this the right approach?
and converted csv is putting entire text into one column
from urllib.request import urlretrieve
import pandas as pd
from pandas import DataFrame

url = 'https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/leukemia_remission/index.txt'
urlretrieve(url, 'index.txt')
# The source file is UTF-16-encoded, tab-separated text. The original used
# sep='/t' (forward slash) -- a literal "/t", not a tab -- so every row
# landed in one column, and the missing encoding produced the replacement
# characters (U+FFFD).
df = pd.read_csv('index.txt', sep='\t', encoding='utf-16')
# to_csv with a path returns None, so there is nothing useful to print.
df.to_csv('index.csv', sep='\t', index=False, header=True)
after successful import, I have to Extract X as all columns except the first column and Y as first column also.
I'll appreciate your all help.
from urllib.request import urlretrieve
import pandas as pd

url = 'https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/leukemia_remission/index.txt'
# Download once, then parse: the file is UTF-16 with tab delimiters.
urlretrieve(url, 'index.txt')
df = pd.read_csv('index.txt', sep='\t', encoding='utf-16')
# REMISS is the response variable; every other column is a predictor.
Y = df[['REMISS']]
X = df.drop(['REMISS'], axis=1)

How to Read multiple files in Python for Pandas separate dataframes

I am trying to read 6 files into 7 different data frames but I am unable to figure out how should I do that. File names can be complete random, that is I know the files but it is not like data1.csv data2.csv.
I tried using something like this:
import sys
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Each CSV is read into its own dataframe. The filename must be a string
# literal: the original `pd.read_csv(Norway.csv)` used bare identifiers,
# which raises NameError because no variable named `Norway` exists yet.
# (The original f1/f assignments were dead code -- f was immediately
# overwritten and neither name was ever used -- so they are removed.)
Norway = pd.read_csv('Norway.csv')
Canada = pd.read_csv('Canada.csv')
Chile = pd.read_csv('Chile.csv')
I need to read multiple files in different dataframes. it is working fine when I do with One file like
file = 'Norway.csv'
Norway = pd.read_csv(file)
And I am getting error :
NameError: name 'norway' is not defined
You can read all the .csv file into one single dataframe.
# Read every CSV into one list of dataframes, then concatenate once.
# (The original appended to `list_` but concatenated `dfs` -- two
# different names, neither defined -- which raises NameError.)
dfs = []
for file_ in all_files:
    df = pd.read_csv(file_, index_col=None, header=0)
    dfs.append(df)
# concatenate all dfs into one
big_df = pd.concat(dfs, ignore_index=True)
and then split the large dataframe into multiple (in your case 7). For example, -
import numpy as np
# Split the combined frame row-wise into `num_chunks` roughly equal
# dataframes (big_df comes from the previous snippet; with 7 target
# frames you would set num_chunks = 7 and unpack 7 names).
num_chunks = 3
df1,df2,df3 = np.array_split(big_df,num_chunks)
Hope this helps.
After googling for a while looking for an answer, I decided to combine answers from different questions into a solution to this question. This solution will not work for all possible cases. You have to tweak it to meet all your cases.
check out the solution to this question
# import libraries
import pandas as pd
import numpy as np
import glob
import os
# Declare a function for extracting a string between two characters
def find_between(s, first, last):
    """Return the substring of s strictly between markers first and last.

    The empty string is returned when either marker cannot be found.
    """
    try:
        lo = s.index(first) + len(first)
        hi = s.index(last, lo)
    except ValueError:
        # One of the markers is missing.
        return ""
    return s[lo:hi]
path = '/path/to/folder/containing/your/data/sets' # use your path
all_files = glob.glob(path + "/*.csv")
# Read every CSV into its own dataframe.
list_of_dfs = [pd.read_csv(filename, encoding = "ISO-8859-1") for filename in all_files]
# Extract the bare table name between the folder prefix and the .csv suffix.
list_of_filenames = [find_between(filename, 'sets/', '.csv') for filename in all_files] # sets is the last word in your path
# Create a dictionary with table names as the keys and data frames as the values
dfnames_and_dfvalues = dict(zip(list_of_filenames, list_of_dfs))

Combine multiple csv files into a single xls workbook Python 3

We are in the transition at work from python 2.7 to python 3.5. It's a company wide change and most of our current scripts were written in 2.7 and no additional libraries. I've taken advantage of the Anaconda distro we are using and have already change most of our scripts over using the 2to3 module or completely rewriting them. I am stuck on one piece of code though, which I did not write and the original author is not here. He also did not supply comments so I can only guess at the whole of the script. 95% of the script works correctly until the end where after it creates 7 csv files with different parsed information it has a custom function to combine the csv files into and xls workbook with each csv as new tab.
import csv
import xlwt
import glob
import openpyxl
from openpyxl import Workbook
# NOTE(review): `directory` and `wb` are never defined in this snippet, and
# xlwt is mixed with openpyxl: add_sheet() is xlwt's API, so if wb is an
# openpyxl Workbook this is what raises the TypeError quoted below.
Parsefiles = glob.glob(directory + '/' + "Parsed*.csv")
def xlsmaker():
# NOTE(review): indentation was lost in this paste; the body below is flat.
for f in Parsefiles:
(path, name) = os.path.split(f)
# NOTE(review): assigns `chort_name` but later uses `short_name` (typo),
# and os.path.splittext should be os.path.splitext.
(chort_name, extension) = os.path.splittext(name)
ws = wb.add_sheet(short_name)
# NOTE(review): on Python 3, csv.reader needs a text-mode file
# (open(f, 'r', newline='')), not binary 'rb'.
xreader = csv.reader(open(f, 'rb'))
newdata = [line for line in xreader]
# NOTE(review): missing ':' at the end of the next line (SyntaxError).
for rowx, row in enumerate(newdata)
for colx, value in enumerate(row):
if value.isdigit():
ws.write(rowx, colx, value)
xlsmaker()
for f in Parsefiles:
os.remove(f)
# NOTE(review): `oshort` and `timestr` are also undefined here.
wb.save(directory + '/' + "Finished" + '' + oshort + '' + timestr + ".xls")
This was written all in python 2.7 and still works correctly if I run it in python 2.7. The issue is that it throws an error when running in python 3.5.
File "parsetool.py", line 521, in (module)
xlsmaker()
File "parsetool.py", line 511, in xlsmaker
ws = wb.add_sheet(short_name)
File "c:\pythonscripts\workbook.py", line 168 in add_sheet
raise TypeError("The paramete you have given is not of the type '%s'"% self._worksheet_class.__name__)
TypeError: The parameter you have given is not of the type "Worksheet"
Any ideas about what should be done to fix the above error? I've tried multiple rewrites, but I get similar errors or new errors. I'm considering just figuring out a whole new method to create the xls, possibly pandas instead.
Not sure why it errs. It is worth the effort to rewrite the code and use pandas instead. Pandas can read each csv file into a separate dataframe and save all dataframes as a separate sheet in an xls(x) file. This can be done by using the ExcelWriter of pandas. E.g.
import pandas as pd

# Load the CSV first, then stream it into a single sheet of the workbook.
# An ExcelWriter accumulates sheets; each to_excel() call adds one.
frame = pd.read_csv('originalfile.csv')
writer = pd.ExcelWriter('yourfile.xlsx', engine='xlsxwriter')
frame.to_excel(writer, sheet_name='sheetname')
writer.save()
Since you have multiple csv files, you would probably want to read all csv files and store them as a df in a dict. Then write each df to Excel with a new sheet name.
Multi-csv Example:
import pandas as pd
import sys
import os

writer = pd.ExcelWriter('default.xlsx')  # Arbitrary output name
for csvfilename in sys.argv[1:]:
    # One sheet per input file, named after the file minus its extension.
    sheet = os.path.splitext(csvfilename)[0]
    pd.read_csv(csvfilename).to_excel(writer, sheet_name=sheet)
writer.save()
(Note that it may be necessary to pip install openpyxl to resolve errors with xlsxwriter import missing.)
You can use the code below, to read multiple .csv files into one big .xlsx Excel file.
I also added the code for replacing ',' by '.' (or vice versa) for improved compatibility on windows environments and according to your locale settings.
import pandas as pd
import sys
import os
import glob
from pathlib import Path

extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

writer = pd.ExcelWriter('fc15.xlsx')  # Arbitrary output name
for csvfilename in all_filenames:
    # Replace decimal commas with points for locale compatibility.
    # NOTE(review): this rewrites the source CSV in place on disk.
    source = Path(csvfilename)
    source.write_text(source.read_text().replace(',', '.'))
    print("Loading " + csvfilename)
    df = pd.read_csv(csvfilename, sep=';', encoding='utf-8')
    # One sheet per file, named after the file minus its extension.
    df.to_excel(writer, sheet_name=os.path.splitext(csvfilename)[0])
    print("done")
writer.save()
print("task completed")
Here's a slight extension to the accepted answer. Pandas 1.5 complains about the call to writer.save(). The fix is to use the writer as a context manager.
import sys
from pathlib import Path

import pandas as pd

# The context manager saves and closes the workbook on exit, which is the
# supported pattern now that pandas 1.5+ deprecates writer.save().
with pd.ExcelWriter("default.xlsx") as writer:
    for csvfilename in sys.argv[1:]:
        path = Path(csvfilename)
        # Excel limits sheet names to 31 characters.
        pd.read_csv(path).to_excel(writer, sheet_name=path.stem[:31])
This version also trims the sheet name down to fit in Excel's maximum sheet name length, which is 31 characters.
If your csv file is in Chinese with gbk encoding, you can use the following code
import pandas as pd
import glob
import datetime
from pathlib import Path

now = datetime.datetime.now()
extension = "csv"
all_filenames = [i for i in glob.glob(f"*.{extension}")]

# gb18030 is a superset of gbk, so Chinese-encoded CSVs decode cleanly.
with pd.ExcelWriter(f"{now:%Y%m%d}.xlsx") as writer:
    for csvfilename in all_filenames:
        print("Loading " + csvfilename)
        df = pd.read_csv(csvfilename, encoding="gb18030")
        # One sheet per file, named after the file's stem.
        df.to_excel(writer, index=False, sheet_name=Path(csvfilename).stem)
        print("done")
print("task completed")

Resources