In python, how to concatenate corresponding sheets in multiple excel files - excel
How do I concatenate multiple xlsx files that share the same sheet names? For example,
I have 3 xlsx files: Rob_schedule.xlsx, Mike_schedule.xlsx and Jerome_schedule.xlsx.
Each file has the following sheet/tab names: home, office & school.
The code below generates the 3 xlsx files (you can copy, paste and run it to generate the Excel files).
##############################Generating the data for Rob_schedule.xlsx########################
import pandas as pd
import numpy as np

# Each sheet is built from a dict of equal-length columns; the explicit
# `columns=` list pins the column order written to the workbook.
df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['AP_Bio', 'AP_Chem', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])
school  # bare expression: a no-op in a script, handy for eyeballing in a notebook

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ6', 'MQ2', 'MQ8'],
    'Lunch': [1, 1, 1, 3],
    'code': ['java', 'python', 'C', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])
office

df3 = {
    'cooking': ['C', 'B', 'D', 'B'],
    'Laundry': ['color', 'white', 'White', 'color'],
    'cleaning': ['balcony', 'garage', 'restroom', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])
home

# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office,
          'school': school}

# Use the writer as a context manager: the workbook is saved and closed when
# the block exits, even if a sheet fails to write. The explicit
# `writer.save()` call is not available on pandas >= 2.0.
with pd.ExcelWriter('Rob_schedule.xlsx', engine='xlsxwriter') as writer:
    # now loop thru and put each on a specific sheet
    for sheet, frame in frames.items():
        frame.to_excel(writer, sheet_name=sheet, index=False)
################################ generating Mike_schedule.xlsx###################################
import pandas as pd
import numpy as np

# Each sheet is built from a dict of equal-length columns; the explicit
# `columns=` list pins the column order written to the workbook.
df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['AP_Bio', 'AP_Chem', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])
school  # bare expression: a no-op in a script, handy for eyeballing in a notebook

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ2', 'MQ4', 'MQ5'],
    'Lunch': [1, 1, 1, 3],
    'code': ['javascript', 'R', 'C', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])
office

df3 = {
    'cooking': ['A', 'B', 'D', 'B'],
    'Laundry': ['color', 'white', 'white', 'color'],
    'cleaning': ['patio', 'garage', 'living_room', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])
home

# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office,
          'school': school}

# Use the writer as a context manager: the workbook is saved and closed when
# the block exits, even if a sheet fails to write. The explicit
# `writer.save()` call is not available on pandas >= 2.0.
with pd.ExcelWriter('Mike_schedule.xlsx', engine='xlsxwriter') as writer:
    # now loop thru and put each on a specific sheet
    for sheet, frame in frames.items():  # use .items for python 3.X
        frame.to_excel(writer, sheet_name=sheet, index=False)
######################### Generate Jerome schedule###########################################
# Import before first use so this snippet also runs on its own, not only
# after one of the other generator snippets has already imported pandas.
import pandas as pd

# Each sheet is built from a dict of equal-length columns; the explicit
# `columns=` list pins the column order written to the workbook.
df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['French', 'Math', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])
school  # bare expression: a no-op in a script, handy for eyeballing in a notebook

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ2', 'MQ4', 'MQ5'],
    'Lunch': [1, 1, 1, 3],
    'code': ['javascript', 'python', 'R', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])
office

df3 = {
    'cooking': ['X', 'B', 'D', 'C'],
    'Laundry': ['color', 'white', 'white', 'color'],
    'cleaning': ['patio', 'garage', 'living_room', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])
home

# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office,
          'school': school}

# Use the writer as a context manager: the workbook is saved and closed when
# the block exits, even if a sheet fails to write. The explicit
# `writer.save()` call is not available on pandas >= 2.0.
with pd.ExcelWriter('Jerome_schedule.xlsx', engine='xlsxwriter') as writer:
    # now loop thru and put each on a specific sheet
    for sheet, frame in frames.items():  # use .items for python 3.X
        frame.to_excel(writer, sheet_name=sheet, index=False)
I want to:
1. concatenate the corresponding sheets/tabs (home, office and school) of Rob_schedule.xlsx, Mike_schedule.xlsx & Jerome_schedule.xlsx
2. export the concatenated dataframes as family_schedule.xlsx with home, office and school tabs
My attempt:
# This code concatenates all the tabs into one tab, but what I want is to concatenate all by their corresponding sheet/tab names
import os
import pandas as pd

# os.chdir() returns None, so keep the directory string itself in `path`.
# After the chdir the workbooks live in the current directory, which is what
# os.listdir() with no argument lists.
path = r'mypath\\'
os.chdir(path)
files = os.listdir()
files
# pull files with `.xlsx` extension
excel_files = [file for file in files if file.endswith('.xlsx')]
excel_files


def create_df_from_excel(file_name):
    """Return all sheets of the workbook *file_name* stacked into one DataFrame.

    Sheets are concatenated row-wise in workbook order; the sheet a row came
    from is not recorded, which is why every tab ends up merged together.
    """
    # The context manager closes the underlying file handle once parsed.
    with pd.ExcelFile(file_name) as file:
        names = file.sheet_names
        return pd.concat([file.parse(name) for name in names])


df = pd.concat(
    [create_df_from_excel(xl) for xl in excel_files]
)
# save the data frame; the writer saves and closes the workbook on exit
# (`writer.save()` is not available on pandas >= 2.0). The sheet name is left
# at its default instead of passing an empty string positionally.
with pd.ExcelWriter('family_reschedule.xlsx') as writer:
    df.to_excel(writer)
I would iterate over each file, and then over each worksheet, adding each sheet to a different list based on the sheet name.
Then you'll have a structure like...
{
'sheet1': [df_file1_sheet1, df_file2_sheet1, df_file3_sheet1],
'sheet2': [df_file1_sheet2, df_file2_sheet2, df_file3_sheet2],
'sheet3': [df_file1_sheet3, df_file2_sheet3, df_file3_sheet3],
}
Then concatenate each list into a single dataframe, then write the three dataframes to an Excel file.
# This part is just your own code, I've added it here because you
# couldn't figure out where `excel_files` came from
#################################################################
import os
from collections import defaultdict

import pandas as pd

OUTPUT_FILE = 'family_reschedule.xlsx'

# os.chdir() returns None, so keep the directory string itself in `path`.
# After the chdir both the input workbooks and the output file live in the
# current directory.
path = r'mypath\\'
os.chdir(path)
files = os.listdir()
files
# pull files with `.xlsx` extension, skipping the output workbook (so a
# re-run does not fold earlier results back in) and Excel's `~$` lock files;
# sorting makes the row order of the result reproducible
excel_files = sorted(
    file for file in files
    if file.endswith('.xlsx')
    and file != OUTPUT_FILE
    and not file.startswith('~$')
)
excel_files

# This part is my actual answer
###############################
# Group the sheets by name: {sheet_name: [frame_from_file1, frame_from_file2, ...]}
worksheet_lists = defaultdict(list)
for file_name in excel_files:
    # The context manager closes each workbook once its sheets are parsed.
    with pd.ExcelFile(file_name) as workbook:
        for sheet_name in workbook.sheet_names:
            worksheet = workbook.parse(sheet_name)
            # Remember which workbook every row came from.
            worksheet['source'] = file_name
            worksheet_lists[sheet_name].append(worksheet)

# Collapse each group into a single frame per sheet name.
worksheets = {
    sheet_name: pd.concat(sheet_list, ignore_index=True)
    for sheet_name, sheet_list in worksheet_lists.items()
}

# The writer saves and closes the workbook when the block exits
# (`writer.save()` is not available on pandas >= 2.0).
with pd.ExcelWriter(OUTPUT_FILE) as writer:
    for sheet_name, df in worksheets.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)
Consider building a dictionary of concatenated data frames with a dict comprehension, running an outer iteration across sheet names and an inner iteration (a list comprehension) across workbooks:
import os

import pandas as pd

path = "/path/to/workbooks"
# Sorted so the row order of the concatenated frames is reproducible.
workbooks = sorted(f for f in os.listdir(path) if f.endswith(".xlsx"))
sheets = ["home", "office", "school"]

# One concatenated frame per sheet name: the outer comprehension walks the
# sheet names, the inner one reads that sheet from every workbook.
# Bound to `df_dict`, the name the export step iterates over.
df_dict = {
    sh: pd.concat(
        [pd.read_excel(os.path.join(path, wb), sheet_name=sh)
         for wb in workbooks],
        ignore_index=True,
    )
    for sh in sheets
}
Then, export to a single file:
# The context manager saves and closes the workbook when the block exits, so
# no explicit `writer.save()` is needed (and that method is not available on
# pandas >= 2.0).
with pd.ExcelWriter('family_reschedule.xlsx') as writer:
    for sh, df in df_dict.items():
        df.to_excel(writer, sheet_name=sh, index=False)
Related
Append values to Dataframe in loop and if conditions
Need help please. I have a dataframe that reads rows from Excel and appends to Dataframe if certain columns exist. I need to add an additional Dataframe if the columns don't exist in a sheet and append filename and sheetname and write all the file names and sheet names for those sheets to an excel file. Also I want the values to be unique. I tried adding to dfErrorList but it only showed the last sheetname and filename and repeated itself many times in the output excel file from xlsxwriter import Workbook import pandas as pd import openpyxl import glob import os path = 'filestoimport/*.xlsx' list_of_dfs = [] list_of_dferror = [] dfErrorList = pd.DataFrame() #create empty df for filepath in glob.glob(path): xl = pd.ExcelFile(filepath) # Define an empty list to store individual DataFrames for sheet_name in xl.sheet_names: df = pd.read_excel(filepath, sheet_name=sheet_name) df['sheetname'] = sheet_name file_name = os.path.basename(filepath) df['sourcefilename'] = file_name if "Project ID" in df.columns and "Status" in df.columns: print('') *else: dfErrorList['sheetname'] = df['sheetname'] # adds `sheet_name` into the column dfErrorList['sourcefilename'] = df['sourcefilename'] continue list_of_dferror.append((dfErrorList)) df['Status'].fillna('', inplace=True) df['Added by'].fillna('', inplace=True) list_of_dfs.append(df) # # Combine all DataFrames into one data = pd.concat(list_of_dfs, ignore_index=True) dataErrors = pd.concat(list_of_dferror, ignore_index=True) dataErrors.to_excel(r'error.xlsx', index=False) # data.to_excel("total_countries.xlsx", index=None)
Storing outputdata in CSV using python
I have extracted data from different excel sheets spread in different folders, I have organized the folders numerically from 2015 to 2019 and each folder has twelve subfolders (from 1 to 12) here's my code: import os from os import walk import pandas as pd path = r'C:\Users\Sarah\Desktop\IOMTest' my_files = [] for (dirpath, dirnames, filenames) in walk(path): my_files.extend([os.path.join(dirpath, fname) for fname in filenames]) all_sheets = [] for file_name in my_files: #Display sheets names using pandas pd.set_option('display.width',300) mosul_file = file_name xl = pd.ExcelFile(mosul_file) mosul_df = xl.parse(0, header=[1], index_col=[0,1,2]) #Read Excel and Select columns mosul_file = pd.read_excel(file_name, sheet_name = 0 , index_clo=None, na_values= ['NA'], usecols = "A, E, G, H , L , M" ) #Remove NaN values data_mosul_df = mosul_file.apply (pd.to_numeric, errors='coerce') data_mosul_df = mosul_file.dropna() print(data_mosul_df) then I saved the extracted columns in a csv file def save_frames(frames, output_path): for frame in frames: frame.to_csv(output_path, mode='a+', header=False) if __name__ == '__main__': frames =[pd.DataFrame(data_mosul_df)] save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv') My problem is that when I open the csv file it seems that it doesn't store all the data but only the last excel sheet that it has read or sometimes the two last excel sheets. however, when I print my data inside the console (in Spyder) I see that all the data are treated data_mosul_df = mosul_file.apply (pd.to_numeric, errors='coerce') data_mosul_df = mosul_file.dropna() print(data_mosul_df) the picture below shows the output csv created. I am wondering if it is because from Column A to Column E the information are the same ? so that's why it overwrite ? 
I would like to know how to modify the code so that it extracts and stores the data chronologically from the folders (2015 to 2019), taking into account the subfolders (1 to 12) in each folder, and how to create a CSV that stores all the data. Thank you.
Rewrite your loop: for file_name in my_files: #Display sheets names using pandas pd.set_option('display.width',300) mosul_file = file_name xl = pd.ExcelFile(mosul_file) mosul_df = xl.parse(0, header=[1], index_col=[0,1,2]) #Read Excel and Select columns mosul_file = pd.read_excel(file_name, sheet_name = 0 , index_clo=None, na_values= ['NA'], usecols = "A, E, G, H , L , M" ) #Remove NaN values data_mosul_df = mosul_file.apply (pd.to_numeric, errors='coerce') data_mosul_df = mosul_file.dropna() #Make a list of df's all_sheets.append(data_mosul_df) Rewrite your save_frames: def save_frames(frames, output_path): frames.to_csv(output_path, mode='a+', header=False) Rewrite your main: if __name__ == '__main__': frames = pd.concat(all_sheets) save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
Using xlsxwriter (or other packages) to create Excel tabs with specific naming, and write dataframe to the corresponding tab
I am trying to query based on different criteria, and then create individual tabs in Excel to store the query results. For example, I want to query all the results that match criteria A, and write the result to an Excel tab named "A". The query result is stored in the panda data frame format. My problem is, when I want to perform 4 different queries based on criteria "A", "B", "C", "D", the final Excel file only contains one tab, which corresponds to the last criteria in the list. It seems that all the previous tabs are over-written. Here is sample code where I replace the SQL query part with a pre-set dataframe and the tab name is set to 0, 1, 2, 3 ... instead of the default Sheet1, Sheet2... in Excel. import pandas as pd import xlsxwriter import datetime def GCF_Refresh(fileCreatePath, inputName): currentDT = str(datetime.datetime.now()) currentDT = currentDT[0:10] loadExcelName = currentDT + '_' + inputName + '_Load_File' fileCreatePath = fileCreatePath +'\\' + loadExcelName+'.xlsx' wb = xlsxwriter.Workbook(fileCreatePath) data = [['tom'], ['nick'], ['juli']] # Create the pandas DataFrame df = pd.DataFrame(data, columns=['Name']) writer = pd.ExcelWriter(fileCreatePath, engine='xlsxwriter') for iCount in range(5): #worksheet = writer.sheets[str(iCount)] #worksheet.write(0, 0, 'Name') df['Name'].to_excel(fileCreatePath, sheet_name=str(iCount), startcol=0, startrow=1, header=None, index=False) writer.save() writer.close() # Change the file path here to store on your local computer GCF_Refresh("H:\\", "Bulk_Load") My goal for this sample code is to have 5 tabs named, 0, 1, 2, 3, 4 and each tab has 'tom', 'nick' and 'juli' printed to it. Right now, I just have one tab (named 4), which is the last tab among all the tabs I expected.
There are a number of errors in the code: The xlsx file is created using XlsxWriter directly and then overwritten by creating it Again in Pandas. The to_excel() method takes a reference to the writer object not the file path. The save() and close() are the same thing and shouldn't be in the loop. Here is a simplified version of your code with these issues fixes: import pandas as pd import xlsxwriter fileCreatePath = 'test.xlsx' data = [['tom'], ['nick'], ['juli']] # Create the pandas DataFrame df = pd.DataFrame(data, columns=['Name']) writer = pd.ExcelWriter(fileCreatePath, engine='xlsxwriter') for iCount in range(5): df['Name'].to_excel(writer, sheet_name=str(iCount), startcol=0, startrow=1, header=None, index=False) writer.save() Output: See Working with Python Pandas and XlsxWriter in the XlsxWriter docs for some details about getting Pandas and XlsxWriter working together.
How to read an excel file with multiple sheets using for loop in python
This is what i try from pathlib import Path loc = Path('D:\DataSciSpec\Practice\Forloopindict.xlsx') dict = pd.read_excel(loc,sheetname = None) for i in dict.keys(): print(i) I get the name of sheets Sheet4 Sheet3 Sheet2 Sheet1 I can also see the sheet content one by one for i in dict.keys(): print(dict[i].head()) But how put this data in n data frames (equal to no of sheets) and then append one to another
This will create a single dataframe (df_full) with the data from all sheets. import pandas as pd loc = r'D:\DataSciSpec\Practice\Forloopindict.xlsx' workbook = pd.read_excel(loc,sheet_name = None) df_full = pd.DataFrame() for _, sheet in workbook.items(): df_full = df_full.append(sheet) # Reset index or you'll have duplicates df_full = df_full.reset_index(drop=True)
Appending Columns from several worksheets Python
I am trying to import certain columns of data from several different sheets inside of a workbook. However, while appending it only seems to append 'q2 survey' to a new workbook. How do I get this to append properly? import sys, os import pandas as pd import xlrd import xlwt b = ['q1 survey', 'q2 survey','q3 survey'] #Sheet Names df_t = pd.DataFrame(columns=["Month","Date", "Year"]) #column Name xls = "path_to_file/R.xls" sheet=[] df_b=pd.DataFrame() pd.read_excel(xls,sheet) for sheet in b: df=pd.read_excel(xls,sheet) df.rename(columns=lambda x: x.strip().upper(), inplace=True) bill=df_b.append(df[df_t]) bill.to_excel('Survey.xlsx', index=False)
I think if you do: b = ['q1 survey', 'q2 survey','q3 survey'] #Sheet Names list_col = ["Month","Date", "Year"] #column Name xls = "path_to_file/R.xls" #create the empty df named bill to append after bill= pd.DataFrame(columns = list_col) for sheet in b: # read the sheet df=pd.read_excel(xls,sheet) df.rename(columns=lambda x: x.strip().upper(), inplace=True) # need to assign bill again bill=bill.append(df[list_col]) # to excel bill.to_excel('Survey.xlsx', index=False) it should work and correct the errors in your code, but you can do a bit differently using pd.concat: list_sheet = ['q1 survey', 'q2 survey','q3 survey'] #Sheet Names list_col = ["Month","Date", "Year"] #column Name # read once the xls file and then access the sheet in the loop, should be faster xls_file = pd.ExcelFile("path_to_file/R.xls") #create a list to append the df list_df_to_concat = [] for sheet in list_sheet : # read the sheet df= pd.read_excel(xls_file, sheet) df.rename(columns=lambda x: x.strip().upper(), inplace=True) # append the df to the list list_df_to_concat.append(df[list_col]) # to excel pd.concat(list_df_to_concat).to_excel('Survey.xlsx', index=False)