Pandas: add the filename to a combined Excel file - excel

I have code that combines multiple Excel files into one file, but I need to add a column with the name of the file each row came from (filename).
Here is the code:
import os
import pandas as pd
# Combine the first sheet of every .xlsx workbook in the current directory.
cwd = os.path.abspath('')
files = os.listdir(cwd)
## Code gets the first sheet of a given file
# DataFrame.append was removed in pandas 2.0, so the frames are collected
# and concatenated once instead of growing a DataFrame inside the loop.
frames = [pd.read_excel(file) for file in files if file.endswith('.xlsx')]
# pd.concat raises ValueError on an empty list; fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.head()
df.to_excel('Combined.xlsx')
How do I add a column with the filename for each file used?
Thanks

Just add d["filename"] = file when you load each Excel file in the for loop:
import os
import pandas as pd
# Combine the first sheet of every .xlsx workbook in the current directory,
# tagging each row with the name of the file it was read from.
output_name = 'Combined.xlsx'
cwd = os.path.abspath('')
files = os.listdir(cwd)
## Code gets the first sheet of a given file
frames = []
for file in files:
    # Skip the output workbook so a re-run does not fold the previous
    # result back into the new one.
    if not file.endswith('.xlsx') or file == output_name:
        continue
    d = pd.read_excel(file)
    d["filename"] = file
    frames.append(d)
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# pd.concat raises ValueError on an empty list, hence the fallback.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.head()
df.to_excel(output_name)

Try this one.
import os
import pandas as pd
# Combine the first sheet of every .xlsx workbook in the current directory,
# inserting a one-cell row holding the file name ahead of each file's rows.
cwd = os.path.abspath('')
files = os.listdir(cwd)
## Code gets the first sheet of a given file
pieces = []
for file in files:
    if file.endswith('.xlsx'):
        pieces.append(pd.DataFrame([file]))  # Here is the code to ADD filename
        pieces.append(pd.read_excel(file))
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
# pd.concat raises ValueError on an empty list, hence the fallback.
df = pd.concat(pieces, ignore_index=True) if pieces else pd.DataFrame()
df.head()
df.to_excel('Combined.xlsx')

Create a dict to collect your dataframes then combine them before exporting (and use pathlib instead of os module):
import pathlib
import pandas as pd
# Map each workbook's file name to the frame read from it.
data = {
    file.name: pd.read_excel(file)
    for file in pathlib.Path().glob('*.xlsx')
}
# Passing a dict to concat uses its keys (the file names) as the outer
# index level of the combined frame.
combined = pd.concat(data)
combined.to_excel('Combined.xlsx')
Note: if you want to get the filename without extension, use file.stem rather than file.name.
Update
What about when the excel files to combine have more than 1 Sheet?
import pathlib
import pandas as pd
# Collect every sheet of every workbook, remembering which file each sheet
# came from so the file name can serve as the outer index key.
data = []
names = []
# '*.xlsx' matches every workbook in the directory regardless of name length.
for file in pathlib.Path().glob('*.xlsx'):
    # sheet_name=None makes read_excel return a dict with one frame per sheet.
    for df in pd.read_excel(file, sheet_name=None).values():
        names.append(file.name)
        data.append(df)
pd.concat(data, keys=names).to_excel('Combined.xlsx')

Related

Combining CSV files using Pandas is appending additional columns rather than right below?

I'm not exactly sure what is the best way to describe this problem, but the photos below should be pretty clear.
First photo is the current output and the second photo is the desired output.
Here is the code I'm using to combine these files:
import os
import glob
import pandas as pd
os.chdir("mydir")
extension = 'csv'
all_filenames = list(glob.glob(f'*.{extension}'))
# read every file in the list, then combine the frames along axis=1
frames = [pd.read_csv(f) for f in all_filenames]
combined_csv = pd.concat(frames, axis=1)
# export to csv
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
I've also used this with no luck:
import pandas as pd
import glob
import os
# merging the files
# Glob pattern for the files to merge
joined_files = os.path.join("mydir", "clean_csv*.csv")
# Every path that matches the pattern
joined_list = glob.glob(joined_files)
# Read each match lazily and stack the rows under a fresh index
frames = (pd.read_csv(path) for path in joined_list)
df = pd.concat(frames, ignore_index=True)
df.to_csv('output-test.csv', index=True, encoding='utf-8-sig', header=None)

Read all excel files from a directory into dataframes and add a column using the file name

I have a folder that contains some excel files. I want to read them all to a dataframe, but at the same time adding a date column. The date is contained in each file name. I have the code to read the file content, but not sure how to read the date from file names.
This is my code to read the files.
import pandas as pd
import glob
# Read every workbook in the folder and stack the rows into one frame.
# DataFrame.append was removed in pandas 2.0, so the frames are collected
# and concatenated once instead of growing a DataFrame inside the loop.
frames = [pd.read_excel(f) for f in glob.glob('my directory/*.xlsx')]
# pd.concat raises ValueError on an empty list; fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
I assume I need to add df['date']= but not sure how to get the date from the file names. All files names have the same format with the date at the end of the name. For example, 'Data report 06.08.21'.
Thanks very much for your help.
import pandas as pd
import glob, os, re
# Read every workbook in the folder and add a 'date' column taken from the
# file name (three two-digit groups separated by dots, e.g. '06.08.21').
date_pattern = re.compile(r'\d{2}\.\d{2}\.\d{2}')
frames = []
for f in glob.glob('my directory/*.xlsx'):
    _, f_file = os.path.split(f)
    match = date_pattern.search(f_file)
    df = pd.read_excel(f)
    # Store the matched text itself: str() of a findall() list would put
    # "['06.08.21']" in the column. None marks a file name with no date.
    df['date'] = match.group() if match else None
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concatenate once instead.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Can't open HDF5 file bigger than memory... ValueError

I have many .csv files of NYC taxi data from nyc.gov, one .csv per year-month. I take about 15 of these CSVs and make HDF5 files from them:
import h5py
import pandas as pd
import os
import glob
import numpy as np
import vaex
from tqdm import tqdm_notebook as tqdm
#hdf = pd.HDFStore('c:/Projekty/H5Edu/NYCTaxi/NYCTaxi.hp')
#df1 = pd.read_csv('path nejake csvcko')
#hdf.put('DF1', df1, format = 'table', data_columns = True)
# Sort the CSV paths, reverse the order, then take the slice to convert
all_csvs = np.sort(np.array(glob.glob('G:\\NYCTaxi\\*.csv')))
csv_list = all_csvs[::-1][20:39]
output_dir = 'c:\\Datasety\\YelowTaxi\\DataH5\\'
for file in tqdm(csv_list, leave=False, desc='Converting to hdf5...'):
    # Build the target path: same base name with the 'csv' suffix swapped for 'hdf5'
    base_name = file.split('\\')[-1]
    output_file = base_name[:-3] + 'hdf5'
    output = output_dir + output_file
    # A converted file already exists: nothing to do for this CSV
    if os.path.exists(output) and os.path.isfile(output):
        continue
    # Read the raw CSV with pandas
    pandas_df = pd.read_csv(file, index_col=None, header=0, low_memory=False)
    # Hand the frame over to vaex
    vaex_df = vaex.from_pandas(pandas_df, copy_index=False)
    # Export the data with vaex to hdf5
    vaex_df.export_hdf5(path=output, progress=False)
Next I make one big HDF5:
import re
import glob
import vaex
import numpy as np
def tryint(s):
    """Return ``int(s)`` when *s* converts to an integer, else *s* unchanged.

    Only the conversion failures raised by ``int`` (``TypeError`` and
    ``ValueError``) are treated as "not a number"; any other exception
    propagates instead of being silently swallowed.
    """
    try:
        return int(s)
    except (TypeError, ValueError):
        return s


def alphanum_key(s):
    """Turn a string into a list of string and number chunks.

    "z23a" -> ["z", 23, "a"]

    Used as a sort key so that embedded numbers compare numerically.
    """
    # The capturing group makes re.split keep the digit runs in the result.
    return [tryint(c) for c in re.split('([0-9]+)', s)]
# Gather the per-file HDF5 outputs and order them with the alphanum_key sort key
hdf5_paths = glob.glob('c:\\Datasety\\YelowTaxi\\DataH5\\*.hdf5')
hdf5_list = np.array(sorted(hdf5_paths, key=alphanum_key))
# This is an important step
master_df = vaex.open_many(hdf5_list)
# exporting
master_df.export_hdf5(
    path='c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5',
    progress=True,
)
So far, everything is ok, I can open output file Spojene.hdf5.
Next, I append new .csv to Spojene.hdf5:
# The target file is the same for every CSV, so name it once up front
filename = 'c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5'
for file in csv_list:
    df2 = pd.read_csv(file, index_col=None, header=0, low_memory=False)
    # Append this CSV's rows under the 'data' key of the target file
    df2.to_hdf(filename, key='data', append=True)
But when I append a new .csv to Spojene.hdf5, I can't open it:
# Open the combined file with vaex; this is the call that raises the error shown below.
df = vaex.open('c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5')
ValueError: First columns has length 289184484, while column table has length 60107988
Please, what can I do?
I think this is linked to how pandas is creating hdf5 files. According to vaex's documentation you can't open a HDF5 file with vaex if it has been created via to_hdf pandas method. I assume it is the same if you append to an existing HDF5 file.
To avoid this error you can reuse your logic where you convert the pandas dataframe to a vaex dataframe, export it to HDF5 and then use open_many. Something like this should work:
main_hdf5_file_path = "c:\\Datasety\\YelowTaxi\\DataH5\\Spojene.hdf5"
hdf5_files_created = []
for file in csv_list:
    converted_path = file.replace(".csv", ".hdf5")
    # from_csv can take additional parameters to forward to pd.read_csv
    # You can also use convert=True to convert it automatically to hdf5 without the export_hdf5
    # Refer to https://vaex.readthedocs.io/en/docs/api.html#vaex.from_csv
    vaex.from_csv(file).export_hdf5(converted_path)
    hdf5_files_created.append(converted_path)
# Open the freshly converted files together with the existing combined file
hdf5_to_read = [*hdf5_files_created, main_hdf5_file_path]
final_df = vaex.open_many(hdf5_to_read)
# NOTE(review): this exports onto a path that is also one of the inputs
# opened above; confirm vaex supports that, or export to a new path first.
final_df.export_hdf5(main_hdf5_file_path)

Python CSV merge issue

New to python and I am presently in the process of CSV merge using Python 3.7.
import pandas as pd
import os
newdir = 'C:\\xxxx\\xxxx\\xxxx\\xxxx'
list = os.listdir(newdir)
writer = pd.ExcelWriter('test.xlsx')
# One sheet per directory entry, each named after the entry it was read from
for entry in list:
    data = pd.read_csv(entry, encoding="gbk", index_col=0)
    data.to_excel(writer, sheet_name=entry)
writer.save()
I get the following error:
FileNotFoundError: [Errno 2] File b'a.csv' does not exist: b'a.csv'
The problem is that the CSV files are not all merged into one xlsx file. Please let me know the solution.
os.listdir only returns the filenames. You'll need to prepend the folder name to the filename.
import pandas as pd
import os
# Write every file found in newdir to its own sheet of test.xlsx.
newdir = 'C:\\xxxx\\xxxx\\xxxx\\xxxx'
names = os.listdir(newdir)
# ExcelWriter.save() was removed in pandas 2.0; the context manager saves
# and closes the workbook when the block exits, even if a read fails.
with pd.ExcelWriter('test.xlsx') as writer:
    for name in names:
        # os.listdir returns bare file names, so rebuild the full path.
        path = os.path.join(newdir, name)
        data = pd.read_csv(path, encoding="gbk", index_col=0)
        data.to_excel(writer, sheet_name=name)
Note that I did not bother to check the rest of your code.
Oh and please avoid using builtins to name your variables.

Combine multiple csv files into a single xls workbook Python 3

We are in the transition at work from python 2.7 to python 3.5. It's a company-wide change and most of our current scripts were written in 2.7 with no additional libraries. I've taken advantage of the Anaconda distro we are using and have already changed most of our scripts over using the 2to3 module or completely rewriting them. I am stuck on one piece of code though, which I did not write and the original author is not here. He also did not supply comments so I can only guess at the whole of the script. 95% of the script works correctly until the end where, after it creates 7 csv files with different parsed information, it has a custom function to combine the csv files into an xls workbook with each csv as a new tab.
import csv
import xlwt
import glob
import openpyxl
from openpyxl import Workbook
Parsefiles = glob.glob(directory + '/' + "Parsed*.csv")


def xlsmaker():
    """Add one worksheet per parsed CSV file to the workbook ``wb``.

    Each sheet is named after the CSV file without its extension. Only
    cells whose text is all digits are written to the sheet.
    """
    for f in Parsefiles:
        (path, name) = os.path.split(f)
        (short_name, extension) = os.path.splitext(name)
        ws = wb.add_sheet(short_name)
        # NOTE: 'rb' is the Python 2 way to open a file for csv.reader; on
        # Python 3 the csv module expects a text-mode file (newline='').
        with open(f, 'rb') as csv_file:
            # Read all rows while the file is open so the handle is closed promptly.
            newdata = [line for line in csv.reader(csv_file)]
        for rowx, row in enumerate(newdata):
            for colx, value in enumerate(row):
                if value.isdigit():
                    ws.write(rowx, colx, value)


xlsmaker()
# The parsed CSVs are intermediate files; remove them once they are in the workbook.
for f in Parsefiles:
    os.remove(f)
wb.save(directory + '/' + "Finished" + '' + oshort + '' + timestr + ".xls")
This was written all in python 2.7 and still works correctly if I run it in python 2.7. The issue is that it throws an error when running in python 3.5.
File "parsetool.py", line 521, in (module)
xlsmaker()
File "parsetool.py", line 511, in xlsmaker
ws = wb.add_sheet(short_name)
File "c:\pythonscripts\workbook.py", line 168 in add_sheet
raise TypeError("The paramete you have given is not of the type '%s'"% self._worksheet_class.__name__)
TypeError: The parameter you have given is not of the type "Worksheet"
Any ideas about what should be done to fix the above error? I've tried multiple rewrites, but I get similar errors or new errors. I'm considering just figuring out a whole new method to create the xls, possibly pandas instead.
Not sure why it errs. It is worth the effort to rewrite the code and use pandas instead. Pandas can read each csv file into a separate dataframe and save all dataframes as a separate sheet in an xls(x) file. This can be done by using the ExcelWriter of pandas. E.g.
import pandas as pd
# Open the target workbook with the xlsxwriter engine
writer = pd.ExcelWriter('yourfile.xlsx', engine='xlsxwriter')
# Read the CSV and write it straight into its own sheet
pd.read_csv('originalfile.csv').to_excel(writer, sheet_name='sheetname')
writer.save()
Since you have multiple csv files, you would probably want to read all csv files and store them as a df in a dict. Then write each df to Excel with a new sheet name.
Multi-csv Example:
import pandas as pd
import sys
import os
writer = pd.ExcelWriter('default.xlsx') # Arbitrary output name
# Every command-line argument is taken as a CSV path
csv_paths = sys.argv[1:]
for csvfilename in csv_paths:
    # Name the sheet after the argument with its extension removed
    sheet, _ext = os.path.splitext(csvfilename)
    pd.read_csv(csvfilename).to_excel(writer, sheet_name=sheet)
writer.save()
(Note that it may be necessary to pip install openpyxl to resolve errors with xlsxwriter import missing.)
You can use the code below, to read multiple .csv files into one big .xlsx Excel file.
I also added the code for replacing ',' by '.' (or vice versa) for improved compatibility on windows environments and according to your locale settings.
import pandas as pd
import sys
import os
import glob
from pathlib import Path
# Load every .csv in the current directory into its own sheet of fc15.xlsx.
extension = 'csv'
all_filenames = glob.glob('*.{}'.format(extension))
# ExcelWriter.save() was removed in pandas 2.0; the context manager saves
# and closes the workbook when the block exits.
with pd.ExcelWriter('fc15.xlsx') as writer:  # Arbitrary output name
    for csvfilename in all_filenames:
        # Replace ',' with '.' throughout the file. This rewrites the source
        # CSV in place, so the original text is not preserved.
        csv_path = Path(csvfilename)
        csv_path.write_text(csv_path.read_text().replace(',', '.'))
        print("Loading "+ csvfilename)
        df = pd.read_csv(csvfilename, sep=';', encoding='utf-8')
        df.to_excel(writer, sheet_name=os.path.splitext(csvfilename)[0])
        print("done")
print("task completed")
Here's a slight extension to the accepted answer. Pandas 1.5 complains about the call to writer.save(). The fix is to use the writer as a context manager.
import sys
from pathlib import Path
import pandas as pd
# The workbook is saved when the with-block exits
with pd.ExcelWriter("default.xlsx") as writer:
    for p in map(Path, sys.argv[1:]):
        # Sheet name is the file stem cut to 31 characters
        pd.read_csv(p).to_excel(writer, sheet_name=p.stem[:31])
This version also trims the sheet name down to fit in Excel's maximum sheet name length, which is 31 characters.
If your csv file is in Chinese with gbk encoding, you can use the following code
import pandas as pd
import glob
import datetime
from pathlib import Path
now = datetime.datetime.now()
extension = "csv"
all_filenames = glob.glob(f"*.{extension}")
# Output workbook is named after today's date, e.g. 20240131.xlsx
output_name = f"{now:%Y%m%d}.xlsx"
with pd.ExcelWriter(output_name) as writer:
    for csvfilename in all_filenames:
        print("Loading " + csvfilename)
        # One sheet per CSV, named after the file without its extension
        sheet = Path(csvfilename).stem
        frame = pd.read_csv(csvfilename, encoding="gb18030")
        frame.to_excel(writer, index=False, sheet_name=sheet)
        print("done")
print("task completed")

Resources