Looping many Excel files into separate DataFrames - python-3.x

I am relatively new to Python 3 and I need help reading four Excel spreadsheets in as separate DataFrames. I can do this by manually typing pd.read_excel(filepath, index_col=0) for each of the four file paths, but I am looking for a robust way to loop over all of my file paths and create one df per file path (Excel spreadsheet). Can anyone help me? Thanks.
# Forward slashes throughout: '\Y' is not a valid escape sequence, and the
# rest of each path already uses '/'.
filepath = '/Users/User/Desktop/Tax/Year.xlsx'
filepath2 = '/Users/User/Desktop/Tax/Year2.xlsx'
filepath3 = '/Users/User/Desktop/Tax/Year3.xlsx'
filepath4 = '/Users/User/Desktop/Tax/Year4.xlsx'
# One read_excel call per workbook, using the first column as the index.
df = pd.read_excel(filepath, index_col=0)
df2 = pd.read_excel(filepath2, index_col=0)
df3 = pd.read_excel(filepath3, index_col=0)
df4 = pd.read_excel(filepath4, index_col=0)

I would put a '1' on the first filepath.
# Read Year1.xlsx .. Year4.xlsx into a dict keyed by the file number.
dict_of_dfs = {}
for n in range(1, 5):
    filepath = '/Users/User/Desktop/Tax/Year' + str(n) + '.xlsx'
    df = pd.read_excel(filepath, index_col=0)
    dict_of_dfs[n] = df

# retrieve your dfs...
df1 = dict_of_dfs[1]
df2 = dict_of_dfs[2]
# etc...
Further to the OP's question below: use walk from the os library to collect the file names in a directory into a list, and then loop through that list.
from os import walk

directory = '/Users/User/Desktop/Tax/Year'
f = []
# Take the file names from the first directory walk() yields, then stop so
# that sub-directories are not visited.
for (dirpath, dirnames, filenames) in walk(directory):
    f.extend(filenames)
    break
Then you can access your filenames through a for loop:
import os

# Join each file name onto the directory that was listed. Plain string
# concatenation leaves out the path separator between folder and file name.
for n in f:
    filepath = os.path.join(directory, n)
    df = pd.read_excel(filepath, index_col=0)
    dict_of_dfs[n] = df

Related

Perform code on multiple files 1 by 1 pandas

Hi I have code I have written to read a .csv file in a folder and add some required columns.
I now want to perform this code on multiple files within the path folder 1 by 1 and save each as a separate df.
My current code is as follows
import glob
import os

import numpy as np
import pandas as pd

# use your path
path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs\2022-03-27 Vs Cowboys\Test'
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    frame = pd.read_csv(filename, index_col=None, skiprows=8)
    # Tag every row with the file it came from. basename has to be called on
    # the path; assigning the bare function stores the function object.
    frame['filename'] = os.path.basename(filename)
    li.append(frame)
frame = pd.concat(li, axis=0, ignore_index=True)

#Add odometer change and turn all accel values to positive
frame['OdChange'] = frame['Odometer'].diff()
frame['accelpos'] = frame['Acceleration'].abs()

#Add column with OdChange # >5.5m/s
# The fallback is the number 0 rather than the string '0': mixing a string
# into np.where turns the whole column into text.
frame["new1"] = np.where(frame.Velocity >= 5.5, frame["OdChange"], 0)

#Add column with accels/decels >2.5m.s.s for AccelDec/min
frame["new2"] = np.where(frame.accelpos >= 2.5, frame["accelpos"], 0)

#Flag rows whose acceleration is >2.5m.s.s (1) or not (0)
frame["new3"] = np.where(frame.Acceleration >= 2.5, 1, 0)

# new4 is 1 only on rows where the flag switches from 0 to 1 (or starts at 1).
s = frame['new3'].astype(int)
frame['new4'] = s.diff().fillna(s).eq(1).astype(int)

# axis=0 is the default for rolling(), so it is left out.
#m/min peaks
frame['1minOD'] = frame['OdChange'].rolling(window=600).sum()
#HSm/min peaks
frame['1minHS'] = frame['new1'].rolling(window=600).sum()
#AccImpulse/min
frame['1minImp'] = frame['accelpos'].rolling(window=600).mean() * 60
#AccDec Peak Count
frame['1minAccCount'] = frame['new4'].rolling(window=600).sum()
print(frame)
I am not sure if this is even the best way to do what I am trying to do. Any help would be appreciated!

df.to_excel capture only last request of iteration pandas

I am currently trying to iterate through a large dataset and using to.excel() to write in my dataframe to Excel.
My code:
writer = pd.ExcelWriter(r'report.xlsx')
for x in range(3):
    sql = ("select date_added, fruit_id from market")
    data = pd.read_sql(sql, c)
    df = pd.DataFrame(data)
    # Every pass writes to the default sheet starting at the top, so each
    # result lands on top of the previous one.
    df.to_excel(writer)
# close() writes the workbook to disk.
writer.close()
When this is run, I am only capturing the 3rd request in my range. Is there a different method that would allow me to capture all 3 requests in my range?
There does not appear to be a ExcelWriter.append method. Instead, make a list of the dataframes and pd.concat at the end.
# Collect the result of every query first, then write once.
dfs = []
for x in range(3):
    sql = ("select date_added, fruit_id from market")
    # read_sql already returns a DataFrame, so no extra wrapping is needed.
    dfs.append(pd.read_sql(sql, c))

df = pd.concat(dfs)
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(r'report.xlsx') as writer:
    df.to_excel(writer)
Alternatively, pd.DataFrame.to_excel does have a startrow argument that could be used to append.
row = 0
# The context manager saves and closes the workbook on exit.
with pd.ExcelWriter(r'report.xlsx') as writer:
    for x in range(3):
        sql = ("select date_added, fruit_id from market")
        df = pd.read_sql(sql, c)
        df.to_excel(writer, startrow=row)
        # Each block takes one header row plus len(df) data rows; without the
        # +1 the next header would overwrite this block's last data row.
        row += len(df) + 1

Write a list of elements to a CSV file, separated by spaces

I have a PDF file and am extracting a table from it. I cannot share the data, so I created a sample of how I want things to work.
I have a list
l = [a,bd,c f,e]
I want csv file like
col1 col2
a
bd
c f
e
This seems to work:
import csv

myheaders = ['col1', 'col2']
mydata = [['a'], ['bd'], ['c', 'f'], ['e']]
# newline='' leaves line endings to the csv module, as its documentation
# asks for any file object handed to csv.writer.
with open('test.csv', 'w', newline='') as target:
    writer = csv.writer(target, delimiter=' ')
    writer.writerow(myheaders)
    writer.writerows(mydata)
Another option I found is convert list into dataframe
df = pd.DataFrame(l)
then split it
df[0].str.split(" ", n = 1, expand = True)

Storing outputdata in CSV using python

I have extracted data from different excel sheets spread in different folders, I have organized the folders numerically from 2015 to 2019 and each folder has twelve subfolders (from 1 to 12) here's my code:
import os
from os import walk

import pandas as pd

path = r'C:\Users\Sarah\Desktop\IOMTest'

# Collect the full path of every file below `path`, sub-folders included.
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend([os.path.join(dirpath, fname) for fname in filenames])

all_sheets = []
# A display option only needs to be set once, not once per file.
pd.set_option('display.width', 300)
for file_name in my_files:
    #Read Excel and Select columns
    mosul_file = pd.read_excel(file_name, sheet_name=0, index_col=None,
                               na_values=['NA'], usecols="A, E, G, H , L , M")
    #Remove NaN values
    # NOTE(review): the original also computed
    # mosul_file.apply(pd.to_numeric, errors='coerce') and then discarded the
    # result; confirm whether the numeric coercion was meant to be kept.
    data_mosul_df = mosul_file.dropna()
    print(data_mosul_df)
then I saved the extracted columns in a csv file
def save_frames(frames, output_path):
    """Append every DataFrame in ``frames`` to the CSV file at ``output_path``.

    Each frame is written in append mode without a header line, so rows are
    added after whatever the file already contains.
    """
    for frame in frames:
        frame.to_csv(output_path, mode='a+', header=False)


if __name__ == '__main__':
    # data_mosul_df holds only what the reading loop assigned last.
    frames = [pd.DataFrame(data_mosul_df)]
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
My problem is that when I open the csv file it seems that it doesn't store all the data but only the last excel sheet that it has read or sometimes the two last excel sheets. however, when I print my data inside the console (in Spyder) I see that all the data are treated
data_mosul_df = mosul_file.apply (pd.to_numeric, errors='coerce')
data_mosul_df = mosul_file.dropna()
print(data_mosul_df)
the picture below shows the output csv created. I am wondering if it is because from Column A to Column E the information are the same ? so that's why it overwrite ?
I would like to know how to modify the code so that it extracts and stores the data chronologically from the folders (2015 to 2019), taking into account the subfolders (1 to 12) in each folder, and how to create a CSV that stores all the data. Thank you.
Rewrite your loop:
# A display option only needs to be set once, not once per file.
pd.set_option('display.width', 300)
for file_name in my_files:
    #Read Excel and Select columns
    mosul_file = pd.read_excel(file_name, sheet_name=0, index_col=None,
                               na_values=['NA'], usecols="A, E, G, H , L , M")
    #Remove NaN values
    # NOTE(review): the original also computed
    # mosul_file.apply(pd.to_numeric, errors='coerce') and then discarded the
    # result; confirm whether the numeric coercion was meant to be kept.
    data_mosul_df = mosul_file.dropna()
    #Make a list of df's
    all_sheets.append(data_mosul_df)
Rewrite your save_frames:
def save_frames(frames, output_path):
    """Append the DataFrame ``frames`` to the CSV file at ``output_path``.

    The frame is written in append mode without a header line, so rows are
    added after whatever the file already contains.
    """
    frames.to_csv(output_path, mode='a+', header=False)
Rewrite your main:
if __name__ == '__main__':
    # One frame holding every sheet collected in all_sheets.
    frames = pd.concat(all_sheets)
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')

Saving loop output to multiple excel sheets

I have a CSV file full of multiple years of water data. I've broken up each water year into its own data frame. Now I want to do some math on those water years and then save each water year to its own Excel sheet.
The math part of the code is working, but I'm having trouble with the final step of naming and saving the output of the loop correctly. Right now I have it creating the excel file and creating the sheet names correctly, but the loop just saves the final iteration to all the sheets. I've googled around but I can't get any other of the similar questions answers to work. This is my first python program so advice would be appreciated.
import pandas as pd

with open(r'wft.csv') as csvfile:
    tdata = pd.read_csv(csvfile)

tdata['date'] = pd.to_datetime(tdata['date'], format='%m/%d/%Y %H:%M')
tdata = tdata.set_index(['date'])

# NOTE(review): wy2014 is listed in wyID but was never defined; this range is
# assumed from the pattern of the other water years -- confirm.
wy2014 = tdata.loc['2013-10-1 00:00' : '2014-7-1 00:00']
wy2015 = tdata.loc['2014-10-1 00:00' : '2015-7-1 00:00']
wy2016 = tdata.loc['2015-10-1 00:00' : '2016-7-1 00:00']
wy2017 = tdata.loc['2016-10-1 00:00' : '2017-7-1 00:00']

writer = pd.ExcelWriter('WFT.xlsx', engine='xlsxwriter')
wyID = [wy2014, wy2015, wy2016, wy2017]
seq = ['wy2014', 'wy2015', 'wy2016', 'wy2017']
for df in wyID:
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method = 'first', ascending=0)
    df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
    # Every water year is written to every sheet name here.
    for name in seq:
        df.to_excel(writer, sheet_name= name)
# close() writes the workbook to disk.
writer.close()
Issues in your code
writer = pd.ExcelWriter('WFT.xlsx', engine='xlsxwriter')
wyID = [wy2014, wy2015, wy2016, wy2017]
seq = ['wy2014', 'wy2015', 'wy2016', 'wy2017']
for df in wyID: # outer loop that figures out wy20xx
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method = 'first', ascending=0)
    df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
    for name in seq: # you loop through all the names and write all sheets every time. you want to be writing just one
        df.to_excel(writer, sheet_name= name)
# close() writes the workbook to disk.
writer.close()
Instead try this.
# Pair each water-year frame with its own sheet name.
for name, df in zip(seq, wyID): # outer loop that figures out wy20xx
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method='first', ascending=False)
    df['cunnanes'] = (df['rank'] - 0.4) / (len(df['rank']) + 0.2) * 100
    df.to_excel(writer, sheet_name=name) # writes to correct wy20xx sheet
# close() writes the workbook to disk.
writer.close() # Now you're done writing the excel

Resources