Storing output data in CSV using Python - Excel
I have extracted data from different excel sheets spread in different folders, I have organized the folders numerically from 2015 to 2019 and each folder has twelve subfolders (from 1 to 12) here's my code:
import os
from os import walk

import pandas as pd

# Root folder containing the year folders (2015-2019), each of which
# holds month subfolders (1-12) with the Excel workbooks.
path = r'C:\Users\Sarah\Desktop\IOMTest'

# Collect the full path of every file found anywhere under `path`.
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend([os.path.join(dirpath, fname) for fname in filenames])

all_sheets = []
for file_name in my_files:
    # Widen pandas' console output so wide frames print on one line.
    pd.set_option('display.width', 300)

    # Read the first sheet, keeping only the columns of interest.
    # NOTE: the keyword is `index_col` -- the original passed the
    # misspelled `index_clo`, which read_excel does not accept.
    mosul_file = pd.read_excel(
        file_name, sheet_name=0, index_col=None,
        na_values=['NA'], usecols="A, E, G, H , L , M",
    )

    # Coerce every cell to numeric (non-numeric becomes NaN), then drop
    # rows containing NaN.  Chaining keeps the to_numeric() result; the
    # original called dropna() on `mosul_file` and threw that result away.
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce').dropna()
    print(data_mosul_df)
Then I saved the extracted columns in a CSV file:
def save_frames(frames, output_path):
    """Append each DataFrame in *frames* to the CSV at *output_path*.

    Rows are written without column headers.  The file is opened in
    append mode ('a+'), so repeated runs keep adding rows to the same
    file rather than replacing it.
    """
    for frame_df in frames:
        frame_df.to_csv(output_path, mode='a+', header=False)
if __name__ == '__main__':
    # NOTE: `data_mosul_df` only holds the frame from the *last* loop
    # iteration, so only that sheet's rows end up in the CSV.
    frames = [pd.DataFrame(data_mosul_df)]
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
My problem is that when I open the CSV file it seems that it doesn't store all the data, but only the last Excel sheet that was read, or sometimes the last two Excel sheets. However, when I print my data inside the console (in Spyder) I see that all the data are processed:
data_mosul_df = mosul_file.apply (pd.to_numeric, errors='coerce')
data_mosul_df = mosul_file.dropna()
print(data_mosul_df)
The picture below shows the output CSV created. I am wondering if it is because the information from Column A to Column E is the same — is that why it overwrites?
I would like to know how to modify the code so that it extracts and stores the data chronologically from the folders (2015 to 2019), taking into account the subfolders (1 to 12) in each folder, and how to create a CSV that stores all the data. Thank you.
Rewrite your loop:
for file_name in my_files:
    # Widen pandas' console output so wide frames print on one line.
    pd.set_option('display.width', 300)

    # Read the first sheet of the workbook, keeping only the wanted
    # columns.  NOTE: the keyword is `index_col` -- the misspelled
    # `index_clo` from the question would be rejected by read_excel.
    mosul_file = pd.read_excel(
        file_name, sheet_name=0, index_col=None,
        na_values=['NA'], usecols="A, E, G, H , L , M",
    )

    # Coerce to numeric first, then drop NaN rows.  Chaining keeps the
    # to_numeric() result instead of discarding it as the question did.
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce').dropna()

    # Accumulate every cleaned frame so ALL sheets are saved later,
    # not just the last one processed.
    all_sheets.append(data_mosul_df)
Rewrite your save_frames:
def save_frames(frames, output_path):
    """Append the combined DataFrame *frames*, headerless, to *output_path*.

    The CSV is opened in append mode ('a+'), so an existing file keeps
    any rows written by previous runs.
    """
    csv_options = {'mode': 'a+', 'header': False}
    frames.to_csv(output_path, **csv_options)
Rewrite your main:
if __name__ == '__main__':
    # Concatenate every per-sheet frame gathered in the loop, then
    # write the combined result out in a single call.
    frames = pd.concat(all_sheets)
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
Related
Append values to Dataframe in loop and if conditions
Need help please. I have a dataframe that reads rows from Excel and appends to Dataframe if certain columns exist. I need to add an additional Dataframe if the columns don't exist in a sheet and append filename and sheetname and write all the file names and sheet names for those sheets to an excel file. Also I want the values to be unique. I tried adding to dfErrorList but it only showed the last sheetname and filename and repeated itself many times in the output excel file from xlsxwriter import Workbook import pandas as pd import openpyxl import glob import os path = 'filestoimport/*.xlsx' list_of_dfs = [] list_of_dferror = [] dfErrorList = pd.DataFrame() #create empty df for filepath in glob.glob(path): xl = pd.ExcelFile(filepath) # Define an empty list to store individual DataFrames for sheet_name in xl.sheet_names: df = pd.read_excel(filepath, sheet_name=sheet_name) df['sheetname'] = sheet_name file_name = os.path.basename(filepath) df['sourcefilename'] = file_name if "Project ID" in df.columns and "Status" in df.columns: print('') *else: dfErrorList['sheetname'] = df['sheetname'] # adds `sheet_name` into the column dfErrorList['sourcefilename'] = df['sourcefilename'] continue list_of_dferror.append((dfErrorList)) df['Status'].fillna('', inplace=True) df['Added by'].fillna('', inplace=True) list_of_dfs.append(df) # # Combine all DataFrames into one data = pd.concat(list_of_dfs, ignore_index=True) dataErrors = pd.concat(list_of_dferror, ignore_index=True) dataErrors.to_excel(r'error.xlsx', index=False) # data.to_excel("total_countries.xlsx", index=None)
In python, how to concatenate corresponding sheets in multiple excel files
How do I concatenate multiple xlsx files with the same sheet_names. For example, I have 3 xlsx files, Rob_schedule.xlsx, Mike_schdule.xlsx and Jerome_schedule.xlsx. Each file has the following sheet/tab names : home, office & school. The code below generates the 3 xlsx files ( you can copy + paste and run to generate the excel files) ##############################Generating the data for Rob_schedule.xlsx######################## import pandas as pd import numpy as np df= { 'Date':[10232020,10242020,10252020,10262020], 'Class':['AP_Bio','AP_Chem','Physics','History'], 'Period':[3,1,2,4]} school = pd.DataFrame(df,columns = ['Date','Class','Period']) school df2= { 'Date':[10232020,10242020,10252020,10262020], 'Meeting':['MQ1','MQ6','MQ2','MQ8'], 'Lunch':[1,1,1,3], 'code':['java','python','C','C++']} office = pd.DataFrame(df2,columns = ['Date','Meeting','Lunch','code']) office df3= { 'cooking':['C','B','D','B'], 'Laundry':['color','white','White','color'], 'cleaning':['balcony','garage','restroom','bathroom']} home = pd.DataFrame(df3,columns = ['cooking','Laundry','cleaning']) home import pandas as pd #initialze the excel writer writer = pd.ExcelWriter('Rob_schedule.xlsx', engine='xlsxwriter') #store your dataframes in a dict, where the key is the sheet name you want frames = {'home':home, 'office':office, 'school':school} #now loop thru and put each on a specific sheet for sheet, frame in frames.items(): frame.to_excel(writer, sheet_name = sheet,index = False) #critical last step writer.save() ################################ generating Mike_schedule.xlsx################################### import pandas as pd import numpy as np df= { 'Date':[10232020,10242020,10252020,10262020], 'Class':['AP_Bio','AP_Chem','Physics','History'], 'Period':[3,1,2,4]} school = pd.DataFrame(df,columns = ['Date','Class','Period']) school df2= { 'Date':[10232020,10242020,10252020,10262020], 'Meeting':['MQ1','MQ2','MQ4','MQ5'], 'Lunch':[1,1,1,3], 'code':['javascript','R','C','C++']} office = 
pd.DataFrame(df2,columns = ['Date','Meeting','Lunch','code']) office df3= { 'cooking':['A','B','D','B'], 'Laundry':['color','white','white','color'], 'cleaning':['patio','garage','living_room','bathroom']} home = pd.DataFrame(df3,columns = ['cooking','Laundry','cleaning']) home #initialze the excel writer writer = pd.ExcelWriter('Mike_schedule.xlsx', engine='xlsxwriter') #store your dataframes in a dict, where the key is the sheet name you want frames = {'home':home, 'office':office, 'school':school} #now loop thru and put each on a specific sheet for sheet, frame in frames.items(): # .use .items for python 3.X frame.to_excel(writer, sheet_name = sheet,index = False) #critical last step writer.save() ######################### Generate Jerome schedule########################################### df= { 'Date':[10232020,10242020,10252020,10262020], 'Class':['French','Math','Physics','History'], 'Period':[3,1,2,4]} school = pd.DataFrame(df,columns = ['Date','Class','Period']) school df2= { 'Date':[10232020,10242020,10252020,10262020], 'Meeting':['MQ1','MQ2','MQ4','MQ5'], 'Lunch':[1,1,1,3], 'code':['javascript','python','R','C++']} office = pd.DataFrame(df2,columns = ['Date','Meeting','Lunch','code']) office df3= { 'cooking':['X','B','D','C'], 'Laundry':['color','white','white','color'], 'cleaning':['patio','garage','living_room','bathroom']} home = pd.DataFrame(df3,columns = ['cooking','Laundry','cleaning']) home import pandas as pd #initialze the excel writer writer = pd.ExcelWriter('Jerome_schedule.xlsx', engine='xlsxwriter') #store your dataframes in a dict, where the key is the sheet name you want frames = {'home':home, 'office':office, 'school':school} #now loop thru and put each on a specific sheet for sheet, frame in frames.items(): # .use .items for python 3.X frame.to_excel(writer, sheet_name = sheet,index = False) #critical last step writer.save() I want to concatenate the corresponding sheets/tabs :home, office, and school for 
Rob_schedule.xlsx,Mike_schedule.xlsx & Jerome_schedule.xlsx export the concatenated dataframes as family_schedule.xlsx with home, office and school tabs My attempt: # This code concatenates all the tabs into one tab, but what I want is to concatenate all by their corresponding sheet/tab names import pandas as pd path = os.chdir(r'mypath\\') files = os.listdir(path) files # pull files with `.xlsx` extension excel_files = [file for file in files if '.xlsx' in file] excel_files def create_df_from_excel(file_name): file = pd.ExcelFile(file_name) names = file.sheet_names return pd.concat([file.parse(name) for name in names]) df = pd.concat( [create_df_from_excel(xl) for xl in excel_files] ) # save the data frame writer = pd.ExcelWriter('family_reschedule.xlsx') df.to_excel(writer, '') writer.save()
I would iterate over each file, and then over each worksheet, adding each sheet to a different list based on the sheet name. Then you'll have a structure like... { 'sheet1': [df_file1_sheet1, df_file2_sheet1, df_file3_sheet1], 'sheet2': [df_file1_sheet2, df_file2_sheet2, df_file3_sheet2], 'sheet3': [df_file1_sheet3, df_file2_sheet3, df_file3_sheet3], } Then concatenate each list in to a single dataframe, them write the three dataframes to an excel file. # This part is just your own code, I've added it here because you # couldn't figure out where `excel_files` came from ################################################################# import os import pandas as pd path = os.chdir(r'mypath\\') files = os.listdir(path) files # pull files with `.xlsx` extension excel_files = [file for file in files if '.xlsx' in file] excel_files # This part is my actual answer ############################### from collections import defaultdict worksheet_lists = defaultdict(list) for file_name in excel_files: workbook = pd.ExcelFile(file_name) for sheet_name in workbook.sheet_names: worksheet = workbook.parse(sheet_name) worksheet['source'] = file_name worksheet_lists[sheet_name].append(worksheet) worksheets = { sheet_name: pd.concat(sheet_list) for (sheet_name, sheet_list) in worksheet_lists.items() } writer = pd.ExcelWriter('family_reschedule.xlsx') for sheet_name, df in worksheets.items(): df.to_excel(writer, sheet_name=sheet_name, index=False) writer.save()
Consider building a list of concatenated data frames with list/dict comprehensions by running an outer iteration across sheet names and inner iteration across workbooks: import pandas as pd path = "/path/to/workbooks" workbooks = [f for f in os.listdir(path) if f.endswith(".xlsx")] sheets = ["home", "office", "school"] df_dicts = { sh: pd.concat( [pd.read_excel(os.path.join(path, wb), sheet_name=sh) for wb in workbooks] ) for sh in sheets } Then, export to single file: with pd.ExcelWriter('family_reschedule.xlsx') as writer: for sh, df in df_dict.items(): df.to_excel(writer, sheet_name=sh, index=False) writer.save()
Python for the Comparison of excel column elements and print the matched elements in separate column
I have developed the following code and fetched the matched output using a for loop.I need to print these output elements in separate column using python. excel file name - Sample_data.xlsx first column - WBS_CODE second column - PROJECT_CODE first column and second column are matched and then printed in separate column (column F) using python code. Please find my below code, import pandas as pd A = pd.read_excel("D:\python_work\Sample_data.xlsx", sheet_name = '31Sep') code = A['WBS_CODE'].tolist() B = pd.read_excel("D:\python_work\Sample_data.xlsx", sheet_name = '4Dec') code1 = B['PROJECT_CODE'].tolist() for x in code1: if x in code: print(x) else: print("NA") output: NA NA NA APP-ACI-PJ-APAC-EMEA-ENG NA NA
I have found a way to export the output and print them in a separate column in excel sheet. Below is the solution, import pandas as pd from openpyxl import load_workbook # Reading the Excel file columns A = pd.read_excel("D:\python_work\Sample_data.xlsx", sheet_name='4Dec') code = A['PROJECT_CODE'].tolist() B = pd.read_excel("D:\python_work\Sample_data.xlsx", sheet_name='31Sep') code1 = B['WBS_CODE'].tolist() # Comparison of columns class test: def loop(self): result = [] for x in code1: if x in code: result.append(x) else: y = "NA" result.append(y) print(result) # Printing data into Excel try: book = load_workbook('D:\python_work\Aew1.xlsx') writer = pd.ExcelWriter('D:\python_work\Aew1.xlsx', engine='openpyxl') writer.book = book writer.sheets = dict((ws.title, ws) for ws in book.worksheets) # loading all the worksheets in opened Excel df = pd.DataFrame.from_dict({'Column1': result}) df.to_excel(writer, sheet_name='Sheet1', startcol=19) writer.save() except FileNotFoundError: print("File Not found: 'Check the Name or Existence of file specified/'") except PermissionError: print("File Opened/No-Access: Check whether you have access to file or file is opened") test().loop() steps that solved: 1. Appended the for loop output to a list 2. used openpyxl library to print the output to a column in excel worksheet. Thanks guyz for help and support. Have a good day
Concatenate two data frames with the same name from different directories via Python
I have two folders in different directories that contain excel files with the same name. I need to concatenate the excel files, not sheets, because each file has one sheet, that have the same name. for example: d1 contains A, B, C files d2 contains A, B, E, F file The output should contain two files, the result from concatenating A from d1 and A from d2, and the result from concatenating B from d1 and B from d2. If d1 and d2 have no files with the same name, there will be no output. the first data frame has multiple rows but the second data frame has one row. When I run this code, it considers the second data frame as a header, and when I set the header as false it no longer takes it under consideration. import glob import numpy as np file_d1 = glob.glob(d1 + "/*.xlsx") file_d2 = glob.glob(d2 + "/*.xlsx") i=0 for file_d1 in glob.glob(d1 + "/*.xlsx"): fileName_d1 = os.path.splitext(os.path.splitext(os.path.basename(file_d1))[0])[0] for file_d2 in glob.glob(d2 + "/*.xlsx"): fileName_d2 = os.path.splitext(os.path.splitext(os.path.basename(file_d2))[0])[0] if fileName_d1 == fileName_d2: i+=1 fileName_1=d1+'/'+fileName_d1+'.xlsx' df1 = pd.read_excel(fileName_1, header=0, index= False) fileName_2=d2+'/'+fileName_d2+'.xlsx' df2 = pd.read_excel(fileName_2, header=0, index= False) print(fileName_1) print(fileName_2) df = pd.DataFrame(np.concatenate([df1.values, df2.values], axis=0), columns=df2.columns) df.to_excel('C:/Users/khouloud.ayari/Desktop/FinalResult/output'+str(i)+'.xlsx', index = True, header=False) What should I do to get the correct output, which is (1st data frame + 2nd data frame) not the other way around? 
when I concatenated the two files, the "nbr de Reboot" was gone and the "nbr de km parcourus" was considered as a header df.to_excel('C:/Users/khouloud.ayari/Desktop/FinalResult/output'+str(i)+'.xlsx', index = False, header=True) output via console 1 nbr de Kilomètres parcourus 1 0 Passage en mode privé 1 1 Passage en mode public 2 output .xlsx expected output: A and B are three different files PS: I'm using Python 3.7(Spyder)
My goal was to concatenate two excel files with the same name located in different directories but I failed to do so because some data was lost in the process, so I added headers to my excel files and I run the code below: import os import pandas as pd d2 = "C:/Users/khouloud/Desktop/d2" d1 = "C:/Users/khouloud/Desktop/d1" import glob import numpy as np file_d1 = glob.glob(d1 + "/*.xlsx") file_d2 = glob.glob(d2 + "/*.xlsx") i=0 for file_d1 in glob.glob(d1 + "/*.xlsx"): fileName_d1 = os.path.splitext(os.path.splitext(os.path.basename(file_d1))[0])[0] for file_d2 in glob.glob(d2 + "/*.xlsx"): fileName_d2 = os.path.splitext(os.path.splitext(os.path.basename(file_d2))[0])[0] if fileName_d1 == fileName_d2: i+=1 fileName_1=d1+'/'+fileName_d1+'.xlsx' df1 = pd.read_excel(fileName_1, header=0, index= False) fileName_2=d2+'/'+fileName_d2+'.xlsx' df2 = pd.read_excel(fileName_2, header=0, index= False) df = pd.DataFrame(np.concatenate([df1.values, df2.values]), columns=df2.columns) print('**********************************************') print('1',df) print('**********************************************') df.to_excel('C:/Users/khouloud/Desktop/FinalResult/output'+str(i)+'.xlsx', index = False, header=False)
How to apply this pandas.Series code to compare several files within a folder
I already have this code that finds all the csv files in a folder and reads them in: directory = os.fsencode(folderpath) os.chdir(directory) for file in os.listdir(directory): filename = os.fsdecode(file) if filename.endswith(".csv"): df1 = pd.read_csv(filename)[columnName] Now I have code that can find all the rows that are found in every single csv file that I input: match = pd.Series(list(set(file1.columnName) & set(file2.columnName) & set(file3.columnName) & set(file4.columnName))) How can I merge the two pieces of code above to find all the rows that are found in every single csv file within a folder and return the matches in a single pandas dataframe?
I think you can create list of all Series first and then dynamically find matches by reduce: #data from previous answer vals = [] directory = os.fsencode(folderpath) os.chdir(directory) for file in os.listdir(directory): filename = os.fsdecode(file) if filename.endswith(".csv"): df1 = pd.read_csv(filename)['Name'] vals.append(df1) from functools import reduce a = reduce(lambda x, y: set(x) & set(y), vals) print (a) {'Ben', 'Neil'} df = pd.DataFrame({'col':list(a)}) print (df) col 0 Ben 1 Neil