Python - Load multiple Excel files with multiple sheets each, keeping only specific columns - python-3.x

I have a scenario where I need to load Excel files using Python:

Load multiple Excel files from a folder - Done
Each Excel file has multiple sheets - Done
Load only the required columns ('Receive Date', 'Process Date', 'Process Number', 'Task Name', 'Series', 'Office', 'Department', 'Unit Manager', 'AM'); other columns need to be ignored/dropped, and no error should be raised if some of these columns do not exist in a sheet.
Load all the data into a single data frame.
------ Code -------

import pandas as pd
import os
import glob

def getfilepath():
    path = 'C:/Users/Tracking Logs/'
    files = os.listdir(path)
    allfiles = glob.glob(path + "*.xlsx")

def getdatafromexcel():
    for file in allfiles:
        rawdf = pd.read_excel(file, sheet_name=None, na_values='null',
                              keep_default_na=False, dtype=object, date_parser=True)
        cols = ('Receive Date', 'Process Date', 'Process Number', 'Task Name',
                'Series', 'Office', 'Department', 'Unit Manager', 'AM/AA/PC')
        display(df)

getfilepath()
getdatafromexcel()

I found the solution:

import pandas as pd
import os
import glob
from IPython.display import HTML, display
from openpyxl import load_workbook

path = 'C:/Users/Tracking Logs/'
cols = ['Receive Date', 'Process Date', 'Task Name', 'Series',
        'Office', 'Department', 'Unit Manager', 'AM/AA/PC']

def getfilepath(path):
    files = os.listdir(path)
    allfiles = glob.glob(path + "*.xlsx")
    #print('Allfiles: ', allfiles)
    return allfiles

def getdatafromexcel(cols, allfiles):
    fulldf = pd.DataFrame()  # accumulator; must exist before the first append
    for i in range(len(allfiles)):
        print('\nCounter: ', i, ' \nFilename: ', allfiles[i])
        wb = load_workbook(allfiles[i], read_only=True)
        for sheetname in wb.sheetnames:
            print('Sheetname: ', sheetname)
            try:
                df = pd.read_excel(allfiles[i], sheet_name=sheetname, na_values='null',
                                   usecols=cols, keep_default_na=False, dtype=object)
                # drop rows where the key columns are all empty
                indexnames = df[(df["Task Name"] == '') & (df["Series"] == '') & (df["Office"] == '')].index
                df.drop(indexnames, inplace=True)
                display(df)
                fulldf = fulldf.append(df, ignore_index=True)
            except Exception as e:
                print(e)
            finally:
                print('this executed')
        wb.close()
    display(fulldf)

allfiles = getfilepath(path)
getdatafromexcel(cols, allfiles)
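As a side note, pd.read_excel also accepts a callable for usecols, which simply skips columns that are absent from a sheet instead of raising, so the try/except above isn't strictly needed for the missing-column case. A minimal sketch, reusing the cols list above (the file name here is hypothetical):

import pandas as pd

cols = ['Receive Date', 'Process Date', 'Task Name', 'Series',
        'Office', 'Department', 'Unit Manager', 'AM/AA/PC']

# Columns absent from the sheet are simply not selected; no error is raised.
df = pd.read_excel('C:/Users/Tracking Logs/example.xlsx',  # hypothetical file
                   sheet_name=0, dtype=object,
                   usecols=lambda c: c in cols)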

One can use pd.ExcelFile and pd.read_excel to get the required results.

def getdatafromexcel():
    for file in allfiles:
        xl = pd.ExcelFile(file)
        res = len(xl.sheet_names)
        if res > 1:
            for i in range(1, res + 1):
                df = pd.read_excel(file, sheet_name=i - 1)  # sheet indices are 0-based
                # Do selection, preprocessing what you want here
                if i == 1:
                    df.to_csv(<your_path> + '1.csv')
                    df_1 = pd.read_csv(<your_path> + '1.csv')
                if i > 1:
                    df_1 = pd.concat([df_1, df])
        else:
            df_1 = pd.read_excel(file)
            # Do selection, preprocessing what you want here
        df_1.to_csv(<your_path> + '.csv', index=False)
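For what it's worth, the CSV round trip above can usually be avoided: passing sheet_name=None to pd.read_excel returns a dict of DataFrames (one per sheet), which can be concatenated directly. A minimal sketch, assuming allfiles is the list of workbook paths (the function name load_all_sheets is mine):

import pandas as pd

def load_all_sheets(allfiles):
    frames = []
    for file in allfiles:
        # sheet_name=None -> {sheet name: DataFrame}, one entry per sheet
        sheets = pd.read_excel(file, sheet_name=None)
        frames.extend(sheets.values())
    return pd.concat(frames, ignore_index=True)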

Related

Perform code on multiple files 1 by 1 pandas

Hi, I have written code to read a .csv file in a folder and add some required columns.
I now want to run this code on multiple files within the path folder, one by one, and save each as a separate df.
My current code is as follows:
import pandas as pd
import glob
import os
import numpy as np

path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs\2022-03-27 Vs Cowboys\Test'  # use your path
all_files = glob.glob(path + "/*.csv")

li = []
for filename in all_files:
    frame = pd.read_csv(filename, index_col=None, skiprows=8)
    frame['filename'] = os.path.basename(filename)  # record the source file per row
    li.append(frame)

frame = pd.concat(li, axis=0, ignore_index=True)

# Add odometer change and turn all accel values to positive
frame['OdChange'] = frame['Odometer'].diff()
frame['accelpos'] = frame['Acceleration'].abs()

# Add column with OdChange where Velocity >= 5.5 m/s
frame["new1"] = np.where(frame.Velocity >= 5.5, frame["OdChange"], 0)

# Add column with accels/decels >= 2.5 m/s/s for AccelDec/min
frame["new2"] = np.where(frame.accelpos >= 2.5, frame["accelpos"], 0)

# Flag accels/decels >= 2.5 m/s/s
frame["new3"] = np.where(frame.Acceleration >= 2.5, '1', '0')
s = frame['new3'].astype(int)
frame['new4'] = s.diff().fillna(s).eq(1).astype(int)

# m/min peaks
frame['1minOD'] = frame['OdChange'].rolling(window=600).sum()
# HS m/min peaks
frame['1minHS'] = frame['new1'].rolling(window=600).sum()
# AccImpulse/min
frame['1minImp'] = frame['accelpos'].rolling(window=600).mean() * 60
# AccDec peak count
frame['1minAccCount'] = frame['new4'].rolling(window=600).sum()

print(frame)
I am not sure if this is even the best way to do what I am trying to do. Any help would be appreciated!
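One way to get separate per-file DataFrames, sketched under the assumption that the column-adding logic above is wrapped in a helper (the add_columns function below is a hypothetical stand-in for it), is to build a dict keyed by filename:

import glob
import os
import pandas as pd

def add_columns(frame):
    # hypothetical stand-in for the OdChange/accelpos/rolling logic above
    frame['OdChange'] = frame['Odometer'].diff()
    frame['accelpos'] = frame['Acceleration'].abs()
    return frame

path = r'C:\Users\jake.jennings.BRONCO\Desktop\GPS Reports\Games\Inputs'  # use your path
dfs = {}
for filename in glob.glob(path + "/*.csv"):
    frame = pd.read_csv(filename, index_col=None, skiprows=8)
    dfs[os.path.basename(filename)] = add_columns(frame)

# each file is now its own DataFrame, e.g. dfs['some_game.csv'] (hypothetical key)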

In python, how to concatenate corresponding sheets in multiple excel files

How do I concatenate multiple xlsx files with the same sheet_names? For example,
I have 3 xlsx files: Rob_schedule.xlsx, Mike_schedule.xlsx and Jerome_schedule.xlsx.
Each file has the following sheet/tab names: home, office & school.
The code below generates the 3 xlsx files (you can copy + paste and run it to generate the Excel files).
############################## Generating the data for Rob_schedule.xlsx ##############################
import pandas as pd
import numpy as np

df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['AP_Bio', 'AP_Chem', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ6', 'MQ2', 'MQ8'],
    'Lunch': [1, 1, 1, 3],
    'code': ['java', 'python', 'C', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])

df3 = {
    'cooking': ['C', 'B', 'D', 'B'],
    'Laundry': ['color', 'white', 'White', 'color'],
    'cleaning': ['balcony', 'garage', 'restroom', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])

# initialize the excel writer
writer = pd.ExcelWriter('Rob_schedule.xlsx', engine='xlsxwriter')
# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office, 'school': school}
# now loop through and put each on a specific sheet
for sheet, frame in frames.items():
    frame.to_excel(writer, sheet_name=sheet, index=False)
# critical last step
writer.save()
################################ Generating Mike_schedule.xlsx ################################
import pandas as pd
import numpy as np

df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['AP_Bio', 'AP_Chem', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ2', 'MQ4', 'MQ5'],
    'Lunch': [1, 1, 1, 3],
    'code': ['javascript', 'R', 'C', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])

df3 = {
    'cooking': ['A', 'B', 'D', 'B'],
    'Laundry': ['color', 'white', 'white', 'color'],
    'cleaning': ['patio', 'garage', 'living_room', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])

# initialize the excel writer
writer = pd.ExcelWriter('Mike_schedule.xlsx', engine='xlsxwriter')
# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office, 'school': school}
# now loop through and put each on a specific sheet
for sheet, frame in frames.items():  # use .items() for Python 3.x
    frame.to_excel(writer, sheet_name=sheet, index=False)
# critical last step
writer.save()
######################### Generate Jerome_schedule.xlsx #########################
df = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Class': ['French', 'Math', 'Physics', 'History'],
    'Period': [3, 1, 2, 4]}
school = pd.DataFrame(df, columns=['Date', 'Class', 'Period'])

df2 = {
    'Date': [10232020, 10242020, 10252020, 10262020],
    'Meeting': ['MQ1', 'MQ2', 'MQ4', 'MQ5'],
    'Lunch': [1, 1, 1, 3],
    'code': ['javascript', 'python', 'R', 'C++']}
office = pd.DataFrame(df2, columns=['Date', 'Meeting', 'Lunch', 'code'])

df3 = {
    'cooking': ['X', 'B', 'D', 'C'],
    'Laundry': ['color', 'white', 'white', 'color'],
    'cleaning': ['patio', 'garage', 'living_room', 'bathroom']}
home = pd.DataFrame(df3, columns=['cooking', 'Laundry', 'cleaning'])

# initialize the excel writer
writer = pd.ExcelWriter('Jerome_schedule.xlsx', engine='xlsxwriter')
# store your dataframes in a dict, where the key is the sheet name you want
frames = {'home': home, 'office': office, 'school': school}
# now loop through and put each on a specific sheet
for sheet, frame in frames.items():  # use .items() for Python 3.x
    frame.to_excel(writer, sheet_name=sheet, index=False)
# critical last step
writer.save()
I want to:
concatenate the corresponding sheets/tabs (home, office, and school) across Rob_schedule.xlsx, Mike_schedule.xlsx & Jerome_schedule.xlsx, and
export the concatenated dataframes as family_schedule.xlsx with home, office and school tabs.
My attempt:
# This code concatenates all the tabs into one tab, but what I want is to
# concatenate by their corresponding sheet/tab names
import os
import pandas as pd

path = os.chdir(r'mypath\\')
files = os.listdir(path)

# pull files with `.xlsx` extension
excel_files = [file for file in files if '.xlsx' in file]

def create_df_from_excel(file_name):
    file = pd.ExcelFile(file_name)
    names = file.sheet_names
    return pd.concat([file.parse(name) for name in names])

df = pd.concat([create_df_from_excel(xl) for xl in excel_files])

# save the data frame
writer = pd.ExcelWriter('family_reschedule.xlsx')
df.to_excel(writer, '')
writer.save()
I would iterate over each file, and then over each worksheet, adding each sheet to a different list based on the sheet name.
Then you'll have a structure like...

{
    'sheet1': [df_file1_sheet1, df_file2_sheet1, df_file3_sheet1],
    'sheet2': [df_file1_sheet2, df_file2_sheet2, df_file3_sheet2],
    'sheet3': [df_file1_sheet3, df_file2_sheet3, df_file3_sheet3],
}

Then concatenate each list into a single dataframe, then write the three dataframes to an Excel file.
# This part is just your own code, I've added it here because you
# couldn't figure out where `excel_files` came from
#################################################################
import os
import pandas as pd

path = os.chdir(r'mypath\\')
files = os.listdir(path)

# pull files with `.xlsx` extension
excel_files = [file for file in files if '.xlsx' in file]

# This part is my actual answer
###############################
from collections import defaultdict

worksheet_lists = defaultdict(list)
for file_name in excel_files:
    workbook = pd.ExcelFile(file_name)
    for sheet_name in workbook.sheet_names:
        worksheet = workbook.parse(sheet_name)
        worksheet['source'] = file_name
        worksheet_lists[sheet_name].append(worksheet)

worksheets = {
    sheet_name: pd.concat(sheet_list)
    for (sheet_name, sheet_list) in worksheet_lists.items()
}

writer = pd.ExcelWriter('family_reschedule.xlsx')
for sheet_name, df in worksheets.items():
    df.to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
Consider building a dict of concatenated data frames with a dict comprehension, running an outer iteration across sheet names and an inner iteration across workbooks:

import os
import pandas as pd

path = "/path/to/workbooks"
workbooks = [f for f in os.listdir(path) if f.endswith(".xlsx")]
sheets = ["home", "office", "school"]

df_dict = {
    sh: pd.concat(
        [pd.read_excel(os.path.join(path, wb), sheet_name=sh)
         for wb in workbooks]
    )
    for sh in sheets
}

Then, export to a single file (the context manager saves the workbook on exit, so no explicit save() call is needed):

with pd.ExcelWriter('family_reschedule.xlsx') as writer:
    for sh, df in df_dict.items():
        df.to_excel(writer, sheet_name=sh, index=False)
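As a side note, pd.read_excel also accepts a list for sheet_name and returns a dict of DataFrames keyed by sheet name, so each workbook only has to be opened once. A minimal sketch under the same path/sheets assumptions as above:

import os
import pandas as pd

path = "/path/to/workbooks"
sheets = ["home", "office", "school"]
workbooks = [f for f in os.listdir(path) if f.endswith(".xlsx")]

# each read returns {sheet name: DataFrame}; collect per book, then concat per sheet
per_book = [pd.read_excel(os.path.join(path, wb), sheet_name=sheets)
            for wb in workbooks]
df_dict = {sh: pd.concat([book[sh] for book in per_book]) for sh in sheets}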

Trouble reading csvs saved in sharefile (citrix)

I wrote the following code to create dataframes from files saved in ShareFile. It works perfectly for Excel files, but fails for CSV files with the error EmptyDataError: No columns to parse from file.
tblname = 'test'
fPth = r'Z:\Favorites\test10 (Group D - Custom EM&V)\8 PII\16 - Project Selection Plan\QC\Data\test.csv'
sht = 'Gross_Data'
shtStart = 0
fType = 'csv'

fitem = sfsession.get_io_version(fPth)
if fitem is None:
    print(f'Could not create sharefile item for {fPth}')
else:
    try:
        if fType == 'csv':
            df = pd.read_csv(fitem.io_data, header=shtStart)
        elif fType == 'excel':
            df = pd.read_excel(fitem.io_data, sheet_name=sht, header=shtStart)
        else:
            pass
        print(f'Data import COMPLETE for {fPth}: {str(datetime.now())}')
    except:
        print(f'Data import FAILED for {fPth}')
        logging.critical(f'Data import FAILED for {fPth}')
If I replace fitem.io_data with fPth in df = pd.read_csv, the code works, but I can't use that as a permanent solution. Any suggestions?
Also, sfsession is a ShareFile session, and get_io_version(fPth) gets the token and downloads all the file properties, including its data.
Thanks.
An adaptation of this solution worked for me:
StringIO and pandas read_csv
I added fitem.io_data.seek(0) before the df = ... line.
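A minimal sketch of the fix, assuming fitem.io_data is a file-like buffer whose read position was left at the end after the download:

fitem = sfsession.get_io_version(fPth)
# rewind the buffer before handing it to pandas; with the position at the
# end, read_csv sees no data and raises EmptyDataError
fitem.io_data.seek(0)
df = pd.read_csv(fitem.io_data, header=shtStart)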
Closing the question.

BeautifulSoup, Requests, Dataframe, extracting from <SPAN> and Saving to Excel

Python novice here again! 2 questions:
1) Instead of saving to multiple tabs (currently saving each year to a tab named after the year), how can I save all this data into one sheet in Excel called "summary"?
2) ('div', class_="sidearm-schedule-game-result") returns the format "W, 1-0". How can I split the "W, 1-0" into two columns, one containing "W" and the next containing "1-0"?
Thanks so much!
import requests
import pandas as pd
from pandas import ExcelWriter
from bs4 import BeautifulSoup

year_id = ['2003','2004','2005','2006','2007','2008','2009','2010','2011',
           '2012','2013','2014','2015','2016','2017','2018','2019']
lehigh_url = 'https://lehighsports.com/sports/mens-soccer/schedule/'
results = []

with requests.Session() as req:
    for year in range(2003, 2020):
        print(f"Extracting Year# {year}")
        url = req.get(f"{lehigh_url}{year}")
        if url.status_code == 200:
            soup = BeautifulSoup(url.text, 'lxml')
            rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
            sheet = pd.DataFrame()
            for row in rows:
                date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
                name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
                opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
                conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
                try:
                    result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
                except:
                    result = ''
                df = pd.DataFrame([[year, date, name, opp, conf, result]],
                                  columns=['year','date','opponent','list','conference','result'])
                sheet = sheet.append(df, sort=True).reset_index(drop=True)
            results.append(sheet)

def save_xls(list_dfs, xls_path):
    # the with block saves the workbook on exit
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            df.to_excel(writer, sheet_name=year_id[n], index=False)

save_xls(results, 'lehigh.xlsx')
Instead of creating a list of dataframes, you can append each sheet into 1 dataframe and write that to file with pandas. Then to split into 2 columns, just use .str.split() and split on the comma.
import requests
import pandas as pd
from bs4 import BeautifulSoup

year_id = ['2019','2018','2017','2016','2015','2014','2013','2012','2011',
           '2010','2009','2008','2007','2006','2005','2004','2003']
results = pd.DataFrame()
for year in year_id:
    url = 'https://lehighsports.com/sports/mens-soccer/schedule/' + year
    print(url)
    lehigh = requests.get(url).text
    soup = BeautifulSoup(lehigh, 'lxml')
    rows = soup.find_all('div', class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
    sheet = pd.DataFrame()
    for row in rows:
        date = row.find('div', class_="sidearm-schedule-game-opponent-date").text.strip()
        name = row.find('div', class_="sidearm-schedule-game-opponent-name").text.strip()
        opp = row.find('div', class_="sidearm-schedule-game-opponent-text").text.strip()
        conf = row.find('div', class_="sidearm-schedule-game-conference-conference").text.strip()
        try:
            result = row.find('div', class_="sidearm-schedule-game-result").text.strip()
        except:
            result = ''
        df = pd.DataFrame([[year, date, name, opp, conf, result]],
                          columns=['year','date','opponent','list','conference','result'])
        sheet = sheet.append(df, sort=True).reset_index(drop=True)
    results = results.append(sheet, sort=True).reset_index(drop=True)

results['result'], results['score'] = results['result'].str.split(',', 1).str
results.to_excel('lehigh.xlsx')
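On newer pandas (where the .str unpacking above no longer works), the split step can be written with expand=True; a minimal equivalent, assuming results is the dataframe built above:

# n=1 splits on the first comma only: "W, 1-0" -> "W" and " 1-0"
results[['result', 'score']] = results['result'].str.split(',', n=1, expand=True)
results['score'] = results['score'].str.strip()  # drop the leading space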

Import and parse .data file

There is a file I tried to import and save as a pandas df. At first sight it looks like the columns and rows are already ordered, but in the end I had to do a bunch of stuff to create the pandas df. Could you please check if there is a much faster way to manage it?
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
My way of doing it is:
import requests
import pandas as pd

r = requests.get(url)
file = r.text
step_1 = file.split('\n')
for n in range(len(step_1)):  # remove empty strings
    if bool(step_1[n]) == False:
        del(step_1[n])
step_2 = [i.split('\t') for i in step_1]
cars_names = [i[1] for i in step_2]
step_3 = [i[0].split(' ') for i in step_2]
for e in range(len(step_3)):  # remove empty strings in each sublist
    step_3[e] = [item for item in step_3[e] if item != '']
mpg = [i[0] for i in step_3]
cylinders = [i[1] for i in step_3]
disp = [i[2] for i in step_3]
horsepower = [i[3] for i in step_3]
weight = [i[4] for i in step_3]
acce = [i[5] for i in step_3]
year = [i[6] for i in step_3]
origin = [i[7] for i in step_3]
list_cols = [cars_names, mpg, cylinders, disp, horsepower, weight, acce, year, origin]
# list_labels written manually:
list_labels = ['car name', 'mpg', 'cylinders', 'displacement', 'horsepower',
               'weight', 'acceleration', 'model year', 'origin']
zipped = list(zip(list_labels, list_cols))
data = dict(zipped)
df = pd.DataFrame(data)
Once you replace \t with a space, you can use read_csv to read it. But you need to wrap your text first, because the first parameter of read_csv is filepath_or_buffer, which needs an object with a read() method (such as a file handle or StringIO). Your question then reduces to: why doesn't read_csv read the column names correctly on this file? The file has no header row, so the names are passed explicitly.
import requests
import pandas as pd
from io import StringIO

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
r = requests.get(url)
file = r.text.replace("\t", " ")
# list_labels written manually:
list_labels = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
               'acceleration', 'model year', 'origin', 'car name']
df = pd.read_csv(StringIO(file), sep=r"\s+", header=None, names=list_labels)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
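One caveat worth hedging: if I recall correctly, the horsepower column in auto-mpg.data marks missing values with '?', so it arrives as a string column unless those are declared as NA when reading:

df = pd.read_csv(StringIO(file), sep=r"\s+", header=None,
                 names=list_labels, na_values='?')  # '?' -> NaN, assuming the UCI convention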
