Let's say we have many Excel files, each with multiple sheets, as follows:
Sheet 1: 2021_q1_bj
a b c d
0 1 2 23 2
1 2 3 45 5
Sheet 2: 2021_q2_bj
a b c d
0 1 2 23 6
1 2 3 45 7
Sheet 3: 2019_q1_sh
a b c
0 1 2 23
1 2 3 45
Sheet 4: 2019_q2_sh
a b c
0 1 2 23
1 2 3 40
I wish to append sheets into one whenever the last part of the sheet name (split by _) is the same across all Excel files. I.e., sheet 1 will be appended to sheet 2 since they both share the suffix bj; if another Excel file also has sheets whose names end in bj, they will be appended to the same output as well. The same logic applies to sheet 3 and sheet 4.
How could I achieve that in Pandas or other Python packages?
The expected result for current excel file would be:
bj:
a b c d
0 1 2 23 2
1 2 3 45 5
2 1 2 23 6
3 2 3 45 7
sh:
a b c
0 1 2 23
1 2 3 45
2 1 2 23
3 2 3 40
Code for reference:
import os, glob
import pandas as pd
files = glob.glob("*.xlsx")
for each in files:
    dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
    df_out = pd.concat(dfs.values(), keys=dfs.keys())
    for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
        g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx', index=False)
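For clarity (this snippet is illustrative and not part of the original code): the groupby key above takes the sheet name from the first level of the concatenated index via .str[0] and keeps only the part after the last underscore. On a plain Series of sheet names the suffix extraction looks like this:
import pandas as pd
# example sheet names only
names = pd.Series(['2021_q1_bj', '2021_q2_bj', '2019_q1_sh'])
print(names.str.rsplit('_', n=1).str[-1])
# 0    bj
# 1    bj
# 2    sh
# dtype: object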
Update:
You may download the test Excel files and the final expected result from this link.
Try:
dfs = pd.read_excel('Downloads/WS_1.xlsx', sheet_name=None, index_col=[0])
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
    g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx')
Update
import os, glob
import pandas as pd
files = glob.glob("Downloads/test_data/*.xlsx")
writer = pd.ExcelWriter('Downloads/test_data/Output_file.xlsx', engine='xlsxwriter')
excel_dict = {}
for each in files:
    dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
    excel_dict.update(dfs)
    df_out = pd.concat(dfs.values(), keys=dfs.keys())
    for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
        g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(writer, index=False, sheet_name=f'{n}')
writer.save()
writer.close()
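As a side note (not part of the original code): recent pandas versions let you use ExcelWriter as a context manager, so the explicit writer.save()/writer.close() calls are not needed. A minimal sketch with a hypothetical tiny frame:
import pandas as pd
df_demo = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})  # hypothetical demo data
# the workbook is saved and closed automatically when the block exits
with pd.ExcelWriter('Output_file.xlsx', engine='xlsxwriter') as writer:
    df_demo.to_excel(writer, index=False, sheet_name='bj')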
I have completed the whole process and obtained the final expected result with the code below.
I would appreciate alternative, more concise solutions, or any advice if possible:
import os, glob
import pandas as pd
from pandas import ExcelWriter
from datetime import datetime
def save_xls(dict_df, path):
    writer = ExcelWriter(path)
    for key in dict_df:
        dict_df[key].to_excel(writer, key, index=False)
    writer.save()
root_dir = './original/'
for root, subFolders, files in os.walk(root_dir):
    # print(subFolders)
    for file in files:
        if '.xlsx' in file:
            file_path = os.path.join(root_dir, file)
            print(file)
            f = pd.ExcelFile(file_path)
            dict_dfs = {}
            for sheet_name in f.sheet_names:
                df_new = f.parse(sheet_name=sheet_name)
                print(sheet_name)
                ## get the year, quarter and city from the sheet name
                year, quarter, city = sheet_name.split("_")
                df_new["year"] = year
                df_new["quarter"] = quarter
                df_new["city"] = city
                dict_dfs[sheet_name] = df_new
            save_xls(dict_df=dict_dfs, path='./add_columns_from_sheet_name/' + "new_" + file)
root_dir = './add_columns_from_sheet_name/'
list1 = []
df = pd.DataFrame()
for root, subFolders, files in os.walk(root_dir):
    # print(subFolders)
    for file in files:
        if '.xlsx' in file:
            # print(file)
            city = file.split('_')[0]
            file_path = os.path.join(root_dir, file)
            # print(file_path)
            dfs = pd.read_excel(file_path, sheet_name=None)
            df_out = pd.concat(dfs.values(), keys=dfs.keys())
            for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
                print(n)
                timestr = datetime.utcnow().strftime('%Y%m%d-%H%M%S%f')[:-3]
                g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'./output/{n}_{timestr}.xlsx', index=False)
file_set = set()
file_dir = './output/'
file_list = os.listdir(file_dir)
for file in file_list:
    data_type = file.split('_')[0]
    file_set.add(data_type)
print(file_set)
file_dir = './output'
file_list = os.listdir(file_dir)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
file_set = set()
for file in file_list:
    if '.xlsx' in file:
        # print(file)
        df_temp = pd.read_excel(os.path.join(file_dir, file))
        if 'bj' in file:
            df1 = df1.append(df_temp)
        elif 'sh' in file:
            df2 = df2.append(df_temp)
        elif 'gz' in file:
            df3 = df3.append(df_temp)
        elif 'sz' in file:
            df4 = df4.append(df_temp)
# function
def dfs_tabs(df_list, sheet_list, file_name):
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    for dataframe, sheet in zip(df_list, sheet_list):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0, index=False)
    writer.save()
# list of dataframes and sheet names
dfs = [df1, df2, df3, df4]
sheets = ['bj', 'sh', 'gz', 'sz']
# run function
dfs_tabs(dfs, sheets, './final/final_result.xlsx')
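Since the question explicitly asks for a more concise alternative, here is a hedged sketch that collects sheets from all files, grouped by the city suffix, in a single pass and writes the result once. The paths and the year_quarter_city naming are assumptions based on the sheet names shown above:
import glob
from collections import defaultdict
import pandas as pd

groups = defaultdict(list)  # city suffix -> list of per-sheet frames
for path in glob.glob('./original/*.xlsx'):
    for sheet_name, frame in pd.read_excel(path, sheet_name=None).items():
        year, quarter, city = sheet_name.split('_')  # e.g. '2021_q1_bj'
        groups[city].append(frame.assign(year=year, quarter=quarter, city=city))

with pd.ExcelWriter('./final/final_result.xlsx', engine='xlsxwriter') as writer:
    for city, frames in groups.items():
        pd.concat(frames, ignore_index=True).to_excel(writer, sheet_name=city, index=False)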
The code block below produces this table:
Trial Week Branch Num_Dep Tot_dep_amt
1 1 1 4 4200
1 1 2 7 9000
1 1 3 6 4800
1 1 4 6 5800
1 1 5 5 3800
1 1 6 4 3200
1 1 7 3 1600
. . . . .
. . . . .
1 1 8 5 6000
9 19 40 3 2800
Code:
import numpy as np
import pandas as pd

trials=10
dep_amount=[]
branch=41
total=[]
week=1
week_num=[]
branch_num=[]
dep_num=[]
trial_num=[]
weeks=20
df=pd.DataFrame()
for a in range(1,trials):
    print("Starting trial", a)
    for b in range(1,weeks):
        for c in range(1,branch):
            depnum = int(np.round(np.random.normal(5,2,1)/1)*1)
            acc_dep=0
            for d in range(1,depnum):
                dep_amt=int(np.round(np.random.normal(1200,400,1)/200)*200)
                acc_dep=acc_dep+dep_amt
            temp = pd.DataFrame.from_records([{'Trial': a, 'Week': b, 'branch': c, 'Num_Dep': depnum, 'Tot_dep_amt': acc_dep}])
            df = pd.concat([df, temp])
df = df[['Trial', 'Week', 'branch', 'Num_Dep', 'Tot_dep_amt']]
df=df.reset_index()
df=df.drop('index',axis=1)
I would like to break the branches out in the for loop and instead have the resulting df with headers like:
Trial Week Branch_1_Num_Dep Branch_1_Tot_dep_amount Branch_2_Num_Dep .....etc
I know this could be done by generating the DF first and then performing an encoding, but for this task I would like it to be generated inside the for loop if possible.
In order to achieve this with minimal changes to your code, you can do something like the following:
df = pd.DataFrame()
for a in range(1, trials):
    print("Starting trial", a)
    for b in range(1, weeks):
        records = {'Trial': a, 'Week': b}
        for c in range(1, branch):
            depnum = int(np.round(np.random.normal(5, 2, 1) / 1) * 1)
            acc_dep = 0
            for d in range(1, depnum):
                dep_amt = int(np.round(np.random.normal(1200, 400, 1) / 200) * 200)
                acc_dep = acc_dep + dep_amt
            records['Branch_{}_Num_Dep'.format(c)] = depnum
            records['Branch_{}_Tot_dep_amount'.format(c)] = acc_dep
        temp = pd.DataFrame.from_records([records])
        df = pd.concat([df, temp])
df = df.reset_index()
df = df.drop('index', axis=1)
Overall, it seems that what you are doing can be done in more elegant and faster ways. I would recommend taking a look at vectorization as a concept (e.g. here).
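For illustration, here is a rough vectorized sketch of the same simulation: every Num_Dep is drawn at once, and the per-deposit amounts are summed with a masked 2-D draw instead of nested Python loops. It is not byte-for-byte identical to the loop version (for instance, the loop sums only depnum-1 deposit draws because of range(1, depnum)), so treat it as a starting point rather than a drop-in replacement:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_trials, n_weeks, n_branches = 9, 19, 40  # matches range(1, 10/20/41) above
idx = pd.MultiIndex.from_product(
    [range(1, n_trials + 1), range(1, n_weeks + 1), range(1, n_branches + 1)],
    names=['Trial', 'Week', 'branch'])

# number of deposits per (trial, week, branch)
depnum = np.round(rng.normal(5, 2, size=len(idx))).astype(int).clip(min=0)

# draw more deposit amounts than any cell needs, then mask away the excess
max_dep = depnum.max()
amounts = np.round(rng.normal(1200, 400, size=(len(idx), max_dep)) / 200) * 200
mask = np.arange(max_dep) < depnum[:, None]
tot_dep = (amounts * mask).sum(axis=1).astype(int)

df_vec = pd.DataFrame({'Num_Dep': depnum, 'Tot_dep_amt': tot_dep}, index=idx).reset_index()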
import pandas as pd
import glob
import csv
files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    output_file_name = "output_" + file
    with open(output_file_name, 'w') as f:
        f.write("sum of the 1. column is " + str(df.iloc[:, 0].sum()) + "\n")
        f.write("sum of the 2. column is " + str(df.iloc[:, 1].sum()) + "\n")
        f.write("sum of the 3. column is " + str(df.iloc[:, 2].sum()) + "\n")
        f.write("sum of the 4. column is " + str(df.iloc[:, 3].sum()) + "\n")
        f.write("max of the 1. column is " + str(df.iloc[:, 0].max()) + "\n")
        f.write("max of the 2. column is " + str(df.iloc[:, 1].max()) + "\n")
        f.write("max of the 3. column is " + str(df.iloc[:, 2].max()) + "\n")
        f.write("max of the 4. column is " + str(df.iloc[:, 3].max()) + "\n")
    f.close()
How can I iterate through my CSV files so that I don't have to repeat all these lines again? I want the same output file with this information about max and sum.
For every CSV file I want a new file in the same folder that describes the max, sum, std, etc. For example, the output file will be:
sum of the 1. column is 21
sum of the 2. column is 23
sum of the 3. column is 33
sum of the 4. column is 30
max of the 1. column is 6
max of the 2. column is 6
max of the 3. column is 8
max of the 4. column is 9
How can I make it simpler? :D :D
Tnx
You could use a double for-loop to iterate over all the functions and columns:
for funcname in ['sum', 'max', 'std']:
    for i in range(len(df.columns)):
        f.write("{} of the {}. column is {}\n"
                .format(funcname, i + 1, getattr(df.iloc[:, i], funcname)()))
getattr(df, 'sum') is equivalent to df.sum.
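A quick standalone illustration of that lookup (toy Series, not data from the question):
import pandas as pd

s = pd.Series([1, 2, 3])
for funcname in ['sum', 'max', 'std']:
    # getattr(s, 'sum')() is the same call as s.sum()
    print(funcname, getattr(s, funcname)())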
import pandas as pd
import glob
import csv
files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    output_file_name = "output_" + file
    with open(output_file_name, 'w') as f:
        # f.write("{}\n".format(df.describe()))
        for funcname in ['sum', 'max', 'std']:
            for i in range(len(df.columns)):
                f.write("{} of the {}. column is {}\n"
                        .format(funcname, i + 1, getattr(df.iloc[:, i], funcname)()))
Note that df.describe() presents summary statistics in a neat format. You might want to consider just printing df.describe():
In [26]: df = pd.DataFrame(np.random.random((10,6)))
In [27]: df
Out[27]:
0 1 2 3 4 5
0 0.791727 0.397873 0.924195 0.202464 0.789961 0.077095
1 0.920516 0.637618 0.383694 0.623393 0.328440 0.606576
2 0.844562 0.231242 0.183842 0.902065 0.286643 0.743508
3 0.411101 0.370284 0.249545 0.955745 0.561450 0.597586
4 0.185035 0.989508 0.522821 0.218888 0.569865 0.773848
5 0.196904 0.377201 0.816561 0.914657 0.482806 0.686805
6 0.809536 0.480733 0.397394 0.152101 0.645284 0.921204
7 0.004433 0.168943 0.865408 0.472513 0.188554 0.012219
8 0.534432 0.739246 0.628112 0.789579 0.268880 0.835339
9 0.701573 0.580974 0.858254 0.461687 0.493617 0.285601
In [28]: df.describe()
Out[28]:
0 1 2 3 4 5
count 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000
mean 0.539982 0.497362 0.582983 0.569309 0.461550 0.553978
std 0.324357 0.246491 0.274233 0.313254 0.189960 0.318598
min 0.004433 0.168943 0.183842 0.152101 0.188554 0.012219
25% 0.250453 0.372014 0.387119 0.279588 0.297092 0.363598
50% 0.618003 0.439303 0.575466 0.547953 0.488212 0.646691
75% 0.805084 0.623457 0.847830 0.873943 0.567761 0.766263
max 0.920516 0.989508 0.924195 0.955745 0.789961 0.921204
Use iloc to select the first 4 columns, then apply the functions with agg, renumber the columns starting at 1, reshape with stack, build the output lines with a list comprehension, and finally write them to a file with Series.to_csv:
files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    df1 = df.iloc[:, :4].agg(['sum', 'max', 'std'])
    df1.columns = range(1, len(df1.columns) + 1)
    s = df1.stack()
    L = ['{} of the {}. column is {}'.format(a, b, c) for (a, b), c in s.items()]
    output_file_name = "output_" + file
    pd.Series(L).to_csv(output_file_name, index=False)
I have a dataframe (referenced as df in the code below) and I'm applying aggregate functions to multiple columns of each group. I also apply the user-defined lambda functions f4, f5, f6, f7. Some of these functions are very similar; f4, f6 and f7 differ only in the threshold value. Can I pass these parameters from the dictionary d, so that I only have to write one function instead of several?
f4 = lambda x: len(x[x>10]) # count the frequency of bearing greater than threshold value
f4.__name__ = 'Frequency'
f5 = lambda x: len(x[x<3.4]) # count the stop points with velocity less than threshold value 3.4
f5.__name__ = 'stop_frequency'
f6 = lambda x: len(x[x>0.2]) # count the points with velocity greater than threshold value 0.2
f6.__name__ = 'frequency'
f7 = lambda x: len(x[x>0.25]) # count the points with accelration greater than threshold value 0.25
f7.__name__ = 'frequency'
d = {'acceleration': ['mean', 'median', 'min'],
     'velocity': [f5, 'sum', 'count', 'median', 'min'],
     'velocity_rate': f6,
     'acc_rate': f7,
     'bearing': ['sum', f4],
     'bearing_rate': 'sum',
     'Vincenty_distance': 'sum'}
df1 = df.groupby(['userid','trip_id','Transportation_Mode','segmentid'], sort=False).agg(d)
#flattening MultiIndex in columns
df1.columns = df1.columns.map('_'.join)
#MultiIndex in index to columns
df1 = df1.reset_index(level=2, drop=False).reset_index()
I would like to write a single parameterized function, something like:
f4(p) = lambda x: len(x[x>p])
f4.__name__ = 'Frequency'
d = {'acceleration': ['mean', 'median', 'min'],
     'velocity': [f5, 'sum', 'count', 'median', 'min'],
     'velocity_rate': f4(0.2),
     'acc_rate': f4(0.25),
     'bearing': ['sum', f4(10)],
     'bearing_rate': 'sum',
     'Vincenty_distance': 'sum'}
The CSV file of dataframe df is available at the link below, for more clarity about the data.
https://drive.google.com/open?id=1R_BBL00G_Dlo-6yrovYJp5zEYLwlMPi9
It is possible, though not straightforward; the solution is based on an approach by neilaronson.
The solution is also simplified by summing the True values of a boolean mask instead of indexing and taking the length.
def f4(p):
    def ipf(x):
        return (x < p).sum()
        # your solution
        # return len(x[x < p])
    ipf.__name__ = 'Frequency'
    return ipf

d = {'acceleration': ['mean', 'median', 'min'],
     'velocity': [f4(3.4), 'sum', 'count', 'median', 'min'],
     'velocity_rate': f4(0.2),
     'acc_rate': f4(.25),
     'bearing': ['sum', f4(10)],
     'bearing_rate': 'sum',
     'Vincenty_distance': 'sum'}

df1 = df.groupby(['userid','trip_id','Transportation_Mode','segmentid'], sort=False).agg(d)
#flattening MultiIndex in columns
df1.columns = df1.columns.map('_'.join)
#MultiIndex in index to columns
df1 = df1.reset_index(level=2, drop=False).reset_index()
EDIT: You can also pass a parameter to choose between greater and less:
def f4(p, op):
    def ipf(x):
        if op == 'greater':
            return (x > p).sum()
        elif op == 'less':
            return (x < p).sum()
        else:
            raise ValueError("second argument has to be greater or less only")
    ipf.__name__ = 'Frequency'
    return ipf

d = {'acceleration': ['mean', 'median', 'min'],
     'velocity': [f4(3.4, 'less'), 'sum', 'count', 'median', 'min'],
     'velocity_rate': f4(0.2, 'greater'),
     'acc_rate': f4(.25, 'greater'),
     'bearing': ['sum', f4(10, 'greater')],
     'bearing_rate': 'sum',
     'Vincenty_distance': 'sum'}

df1 = df.groupby(['userid','trip_id','Transportation_Mode','segmentid'], sort=False).agg(d)
#flattening MultiIndex in columns
df1.columns = df1.columns.map('_'.join)
#MultiIndex in index to columns
df1 = df1.reset_index(level=2, drop=False).reset_index()
print (df1.head())
userid trip_id segmentid Transportation_Mode acceleration_mean \
0 141 1.0 1 walk 0.061083
1 141 2.0 1 walk 0.109148
2 141 3.0 1 walk 0.106771
3 141 4.0 1 walk 0.141180
4 141 5.0 1 walk 1.147157
acceleration_median acceleration_min velocity_Frequency velocity_sum \
0 -1.168583e-02 -2.994428 1000.0 1506.679506
1 1.665535e-09 -3.234188 464.0 712.429005
2 -3.055414e-08 -3.131293 996.0 1394.746071
3 9.241707e-09 -3.307262 340.0 513.461259
4 -2.609489e-02 -3.190424 493.0 729.702854
velocity_count velocity_median velocity_min velocity_rate_Frequency \
0 1028 1.294657 0.284747 288.0
1 486 1.189650 0.284725 134.0
2 1020 1.241419 0.284733 301.0
3 352 1.326324 0.339590 93.0
4 504 1.247868 0.284740 168.0
acc_rate_Frequency bearing_sum bearing_Frequency bearing_rate_sum \
0 169.0 81604.187066 884.0 -371.276356
1 89.0 25559.589869 313.0 -357.869944
2 203.0 -71540.141199 57.0 946.382581
3 78.0 9548.920765 167.0 -943.184805
4 93.0 -24021.555784 67.0 535.333624
Vincenty_distance_sum
0 1506.679506
1 712.429005
2 1395.328768
3 513.461259
4 731.823664
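As a further variation on the same closure idea (not part of the original answer), the comparison itself can be passed in via the operator module, which avoids the if/elif branching on a string:
import operator

def f4(p, op=operator.gt):
    def ipf(x):
        # op is any two-argument comparison, e.g. operator.gt or operator.lt
        return op(x, p).sum()
    ipf.__name__ = 'Frequency'
    return ipf

# usage inside the agg dictionary, for example:
# 'velocity': [f4(3.4, operator.lt), 'sum', 'count', 'median', 'min'],
# 'velocity_rate': f4(0.2),  # default operator.gt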
What I am trying to do is to get bootstrap confidence limits by row, regardless of the number of rows, and make a new dataframe from the output. I currently can do this for the entire dataframe, but not by row. The data in my actual program looks similar to what I have below:
0 1 2
0 1 2 3
1 4 1 4
2 1 2 3
3 4 1 4
I want the new dataframe to look something like this with the lower and upper confidence limits:
0 1
0 1 2
1 1 5.5
2 1 4.5
3 1 4.2
The current generated output looks like this:
0 1
0 2.0 2.75
The Python 3 code below generates a mock dataframe and computes the bootstrap confidence limits for the entire dataframe. The result is a new dataframe with just 2 values, an upper and a lower confidence limit, rather than 4 sets of 2 (one for each row).
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a)
b = pd.DataFrame(b)
b = b.T
print(b)
Thank you for any help.
scikits.bootstrap operates by assuming that data samples are arranged by row, not by column. If you want the opposite behavior, just use the transpose, and a statfunction that doesn't combine columns.
import pandas as pd
import numpy as np
import scikits.bootstrap as sci
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
print(zz)
x= zz.dtypes
print(x)
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
b = sci.ci(a.T, statfunction=lambda x: np.average(x, axis=0))
print(b.T)
Below is the answer I ended up figuring out to create bootstrap CIs by row.
import pandas as pd
import numpy as np
import numpy.random as npr
zz = pd.DataFrame([[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]],
[[1,2],[2,3],[3,6]],[[4,2],[1,4],[4,6]]])
x= zz.dtypes
a = pd.DataFrame(np.array(zz.values.tolist())[:, :, 0],zz.index, zz.columns)
print(a)
def bootstrap(data, num_samples, statistic, alpha):
    n = len(data)
    idx = npr.randint(0, n, (num_samples, n))
    samples = data[idx]
    stat = np.sort(statistic(samples, 1))
    return (stat[int((alpha/2.0)*num_samples)],
            stat[int((1-alpha/2.0)*num_samples)])

cc = list(a.index.values)  # informs generator of the number of rows

def bootbyrow(cc):
    for xx in range(1):
        xx = list(a.index.values)
        for xx in range(len(cc)):
            k = a.apply(lambda y: y[xx])
            k = k.values
            for xx in range(1):
                kk = list(bootstrap(k, 10000, np.mean, 0.05))
                yield list(kk)
abc = pd.DataFrame(list(bootbyrow(cc))) #bootstrap ci by row
# the next 4 just show that it's working correctly
a0 = bootstrap((a.loc[0,].values),10000,np.mean,0.05)
a1 = bootstrap((a.loc[1,].values),10000,np.mean,0.05)
a2 = bootstrap((a.loc[2,].values),10000,np.mean,0.05)
a3 = bootstrap((a.loc[3,].values),10000,np.mean,0.05)
print(abc)
print(a0)
print(a1)
print(a2)
print(a3)
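For comparison only, here is a more compact sketch of a row-wise percentile bootstrap. This is an alternative to the generator above, not the original author's code; it assumes a is the DataFrame of row samples built earlier:
import numpy as np
import pandas as pd

def row_bootstrap_ci(frame, num_samples=10000, alpha=0.05, statistic=np.mean, seed=None):
    rng = np.random.default_rng(seed)
    rows = []
    for _, row in frame.iterrows():
        vals = row.to_numpy()
        # resample each row with replacement and evaluate the statistic per resample
        idx = rng.integers(0, len(vals), size=(num_samples, len(vals)))
        stats = statistic(vals[idx], axis=1)
        rows.append(np.percentile(stats, [100 * alpha / 2, 100 * (1 - alpha / 2)]))
    return pd.DataFrame(rows, index=frame.index, columns=['lower', 'upper'])

# e.g. print(row_bootstrap_ci(a))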