import pandas as pd
import glob

files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    output_file_name = "output_" + file
    # the file is closed automatically when the with-block ends
    with open(output_file_name, 'w') as f:
        f.write("sum of the 1. column is " + str(df.iloc[:, 0].sum()) + "\n")
        f.write("sum of the 2. column is " + str(df.iloc[:, 1].sum()) + "\n")
        f.write("sum of the 3. column is " + str(df.iloc[:, 2].sum()) + "\n")
        f.write("sum of the 4. column is " + str(df.iloc[:, 3].sum()) + "\n")
        f.write("max of the 1. column is " + str(df.iloc[:, 0].max()) + "\n")
        f.write("max of the 2. column is " + str(df.iloc[:, 1].max()) + "\n")
        f.write("max of the 3. column is " + str(df.iloc[:, 2].max()) + "\n")
        f.write("max of the 4. column is " + str(df.iloc[:, 3].max()) + "\n")
How can I iterate through my CSV files with pandas so that I don't have to repeat all of these lines? I want the same output file with this information about max and sum.
For every CSV file I want a new file in the same folder that describes the max, sum, std, etc. For example, the output file will be:
sum of the 1. column is 21
sum of the 2. column is 23
sum of the 3. column is 33
sum of the 4. column is 30
max of the 1. column is 6
max of the 2. column is 6
max of the 3. column is 8
max of the 4. column is 9
How can I make it simpler? :D :D
Thanks
You could use a double for-loop to iterate over all the functions and columns:
for funcname in ['sum', 'max', 'std']:
    for i in range(len(df.columns)):
        f.write("{} of the {}. column is {}\n"
                .format(funcname, i + 1, getattr(df.iloc[:, i], funcname)()))
getattr(df, 'sum') is equivalent to df.sum.
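For instance, a quick way to convince yourself of that equivalence (a minimal sketch, assuming a small numeric frame):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
# looking the method up by name and calling it directly give the same result
assert getattr(df, 'sum')().equals(df.sum())
print(getattr(df, 'max')())  # same as df.max()

Putting it all together: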
import pandas as pd
import glob

files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    output_file_name = "output_" + file
    with open(output_file_name, 'w') as f:
        # f.write("{}\n".format(df.describe()))
        for funcname in ['sum', 'max', 'std']:
            for i in range(len(df.columns)):
                f.write("{} of the {}. column is {}\n"
                        .format(funcname, i + 1, getattr(df.iloc[:, i], funcname)()))
Note that df.describe() presents summary statistics in a neat format. You might want to consider just printing df.describe():
In [26]: df = pd.DataFrame(np.random.random((10,6)))
In [27]: df
Out[27]:
0 1 2 3 4 5
0 0.791727 0.397873 0.924195 0.202464 0.789961 0.077095
1 0.920516 0.637618 0.383694 0.623393 0.328440 0.606576
2 0.844562 0.231242 0.183842 0.902065 0.286643 0.743508
3 0.411101 0.370284 0.249545 0.955745 0.561450 0.597586
4 0.185035 0.989508 0.522821 0.218888 0.569865 0.773848
5 0.196904 0.377201 0.816561 0.914657 0.482806 0.686805
6 0.809536 0.480733 0.397394 0.152101 0.645284 0.921204
7 0.004433 0.168943 0.865408 0.472513 0.188554 0.012219
8 0.534432 0.739246 0.628112 0.789579 0.268880 0.835339
9 0.701573 0.580974 0.858254 0.461687 0.493617 0.285601
In [28]: df.describe()
Out[28]:
0 1 2 3 4 5
count 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000
mean 0.539982 0.497362 0.582983 0.569309 0.461550 0.553978
std 0.324357 0.246491 0.274233 0.313254 0.189960 0.318598
min 0.004433 0.168943 0.183842 0.152101 0.188554 0.012219
25% 0.250453 0.372014 0.387119 0.279588 0.297092 0.363598
50% 0.618003 0.439303 0.575466 0.547953 0.488212 0.646691
75% 0.805084 0.623457 0.847830 0.873943 0.567761 0.766263
max 0.920516 0.989508 0.924195 0.955745 0.789961 0.921204
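If that tabular format works for you, the per-file loop collapses to a few lines (a sketch, reusing the output_ naming from the question):

import pandas as pd
import glob

for file in glob.glob('*.csv'):
    df = pd.read_csv(file, header=None)
    with open("output_" + file, 'w') as f:
        # describe() already covers count, mean, std, min, max and the quartiles
        f.write(df.describe().to_string() + "\n")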
Use iloc to select the first 4 columns, then apply the functions with agg, relabel the columns starting from 1, reshape with stack, build the output lines with a list comprehension, and finally write them to a file with Series.to_csv:
files = glob.glob('*.csv')
for file in files:
    df = pd.read_csv(file, header=None)
    # aggregate the first 4 columns with all three functions at once
    df1 = df.iloc[:, :4].agg(['sum', 'max', 'std'])
    df1.columns = range(1, len(df1.columns) + 1)
    s = df1.stack()
    L = ['{} of the {}. column is {}'.format(a, b, c) for (a, b), c in s.items()]
    output_file_name = "output_" + file
    # header=False keeps the file to just the generated lines
    pd.Series(L).to_csv(output_file_name, index=False, header=False)
Related
I am trying to figure out how to make my pandas DataFrame group data together. Currently, if you input data, for example: 1, 1, 2, 2, 3 - it will come out like this:
column1 column2
1: 2
2: 2
3: 1
I would like it to group the data that have the same value in column2; for this example it should just show 2 : 2, meaning two of the numbers inserted were each inserted twice, so we just count that. Here is my current code:
from collections import Counter
import time
import pandas as pd

filename = input("Enter name to save this file as:\n\n")
print("Your file will be saved as: " + filename + ".csv\n\n")
time.sleep(0.5)
print("Please enter information")

contents = []

def amount():
    # read lines until EOF (Ctrl-D / Ctrl-Z)
    while True:
        try:
            line = input()
        except EOFError:
            break
        contents.append(line)
    return

amount()
count = Counter(contents)
print(count)

df = pd.DataFrame.from_dict(count, orient='index').reset_index()
df.columns = ['column1', 'column2']
df = df.sort_values(by='column2')
df.to_csv(filename + ".csv", encoding='utf-8', index=False)
Any help would be appreciated.
I have tried this using value_counts() in pandas; not sure if it'll work for you!
import pandas as pd

values = [1, 1, 2, 2, 3]
df = pd.DataFrame(values, columns=['values'])
# rename_axis/reset_index give stable column names across pandas versions
res = df['values'].value_counts().rename_axis('Values').reset_index(name='Count').sort_values('Values')
print(res)
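If what you want is really a count of counts (how many numbers share the same frequency), stacking a second value_counts() on top gets there; a sketch with the same example values:

import pandas as pd

values = [1, 1, 2, 2, 3]
df = pd.DataFrame(values, columns=['values'])
# first value_counts: how often each value occurs;
# second value_counts: how many values occur that often
res = df['values'].value_counts().value_counts()
print(res)  # frequency 2 -> 2 values, frequency 1 -> 1 value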
I want to replace append() with concat() from pandas, but when I try to replace it, my output is different. Thank you.
Old code with append():
def gettrigger(self):
    dfx = pd.DataFrame()
    for i in range(self.lags + 1):
        mask = (self.df["%K"].shift(i) < 20) & (self.df["%D"].shift(i) < 20)
        dfx = dfx.append(mask, ignore_index=True)
    return dfx.sum(axis=0)
Output with append():
New code with pd.concat():
def gettrigger(self):
    dfx = pd.DataFrame()
    for i in range(self.lags + 1):
        mask = (self.df["%K"].shift(i) < 20) & (self.df["%D"].shift(i) < 20)
        # dfx = dfx.append(mask, ignore_index=True)
        dfx = pd.concat([dfx, mask], ignore_index=True)
    return dfx.sum(axis=0)
Output with pd.concat():
While the append method appends data as rows to your DataFrame, the concat method appends data as columns:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(10,4))
df
0 1 2 3
0 0.403637 -0.204563 -2.070799 0.759681
1 -0.684890 -0.651969 1.028248 0.129635
2 -1.011895 1.877984 -0.724938 0.869389
3 -1.344031 0.106898 1.762096 0.088816
4 -0.382107 1.822279 0.435752 -2.573199
5 -1.173345 -1.224242 0.887549 -0.816519
6 -1.269713 0.201384 1.576388 -1.355996
7 -0.459533 0.961355 -0.280733 -0.026496
8 0.522513 0.266246 1.066807 -0.232884
9 -0.688417 0.181908 0.356574 -0.245040
dfx = pd.DataFrame()
for i in range(df.shape[1]):   # iterating through columns
    df_new = df.loc[:, i]      # select column i
    dfx = dfx.append(df_new)   # append the column as a new row (append was removed in pandas 2.0)
dfx
0 1 2 3 4 5 6 7 8 9
0 0.403637 -0.684890 -1.011895 -1.344031 -0.382107 -1.173345 -1.269713 -0.459533 0.522513 -0.688417
1 -0.204563 -0.651969 1.877984 0.106898 1.822279 -1.224242 0.201384 0.961355 0.266246 0.181908
2 -2.070799 1.028248 -0.724938 1.762096 0.435752 0.887549 1.576388 -0.280733 1.066807 0.356574
3 0.759681 0.129635 0.869389 0.088816 -2.573199 -0.816519 -1.355996 -0.026496 -0.232884 -0.245040
dfx = pd.DataFrame()
for i in range(df.shape[0]):                # iterating through rows
    df_new = df.loc[i, :]                   # select row i
    dfx = pd.concat([dfx, df_new], axis=1)  # append the row as a column (axis=1)
dfx
0 1 2 3 4 5 6 7 8 9
0 0.403637 -0.684890 -1.011895 -1.344031 -0.382107 -1.173345 -1.269713 -0.459533 0.522513 -0.688417
1 -0.204563 -0.651969 1.877984 0.106898 1.822279 -1.224242 0.201384 0.961355 0.266246 0.181908
2 -2.070799 1.028248 -0.724938 1.762096 0.435752 0.887549 1.576388 -0.280733 1.066807 0.356574
3 0.759681 0.129635 0.869389 0.088816 -2.573199 -0.816519 -1.355996 -0.026496 -0.232884 -0.245040
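So, to keep the original row-wise behavior while using concat, each boolean mask can be turned into a one-row frame before concatenating. A minimal sketch of the loop, assuming the self.df, self.lags, %K and %D from the question:

def gettrigger(self):
    dfx = pd.DataFrame()
    for i in range(self.lags + 1):
        mask = (self.df["%K"].shift(i) < 20) & (self.df["%D"].shift(i) < 20)
        # to_frame().T turns the Series into a single row, so concat along the
        # default axis stacks the masks as rows, just like append used to
        dfx = pd.concat([dfx, mask.to_frame().T], ignore_index=True)
    return dfx.sum(axis=0)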
Let's say we have many Excel files, each with multiple sheets, as follows:
Sheet 1: 2021_q1_bj
a b c d
0 1 2 23 2
1 2 3 45 5
Sheet 2: 2021_q2_bj
a b c d
0 1 2 23 6
1 2 3 45 7
Sheet 3: 2019_q1_sh
a b c
0 1 2 23
1 2 3 45
Sheet 4: 2019_q2_sh
a b c
0 1 2 23
1 2 3 40
I wish to append sheets into one whenever the last part of the sheet name (split by _) is the same, across all Excel files. I.e., sheet 1 will be appended to sheet 2 since they share the common suffix bj; if another Excel file also has sheets whose names end in bj, they will be appended as well. The same logic applies to sheet 3 and sheet 4.
How could I achieve that with pandas or other Python packages?
The expected result for current excel file would be:
bj:
a b c d
0 1 2 23 2
1 2 3 45 5
2 1 2 23 6
3 2 3 45 7
sh:
a b c
0 1 2 23
1 2 3 45
2 1 2 23
3 2 3 40
Code for reference:
import glob
import pandas as pd

files = glob.glob("*.xlsx")
for each in files:
    dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
    df_out = pd.concat(dfs.values(), keys=dfs.keys())
    # group by the suffix after the last '_' of each sheet name
    for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
        g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx', index=False)
Update:
You may download test excel files and final expected result from this link.
Try:
dfs = pd.read_excel('Downloads/WS_1.xlsx', sheet_name=None, index_col=[0])
df_out = pd.concat(dfs.values(), keys=dfs.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
    g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'Out_{n}.xlsx')
Update
import glob
import pandas as pd

files = glob.glob("Downloads/test_data/*.xlsx")
writer = pd.ExcelWriter('Downloads/test_data/Output_file.xlsx', engine='xlsxwriter')

# collect the sheets of every workbook into one dict keyed by sheet name
excel_dict = {}
for each in files:
    dfs = pd.read_excel(each, sheet_name=None, index_col=[0])
    excel_dict.update(dfs)

df_out = pd.concat(excel_dict.values(), keys=excel_dict.keys())
for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
    g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(writer, index=False, sheet_name=f'{n}')

writer.save()
writer.close()
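Note that ExcelWriter.save() was removed in pandas 2.0; a version-safe variant is to drive the writer as a context manager (a sketch of just the writing step, same groupby as above):

with pd.ExcelWriter('Downloads/test_data/Output_file.xlsx', engine='xlsxwriter') as writer:
    for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
        g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(writer, index=False, sheet_name=f'{n}')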
I have implemented the whole process and obtained the final expected result with the code below.
I would appreciate alternative, more concise solutions, or any advice if possible:
import os, glob
import pandas as pd
from pandas import ExcelWriter
from datetime import datetime

def save_xls(dict_df, path):
    writer = ExcelWriter(path)
    for key in dict_df:
        dict_df[key].to_excel(writer, key, index=False)
    writer.save()

# Step 1: add year/quarter/city columns (parsed from the sheet name) to every sheet
root_dir = './original/'
for root, subFolders, files in os.walk(root_dir):
    for file in files:
        if '.xlsx' in file:
            file_path = os.path.join(root_dir, file)
            print(file)
            f = pd.ExcelFile(file_path)
            dict_dfs = {}
            for sheet_name in f.sheet_names:
                df_new = f.parse(sheet_name=sheet_name)
                print(sheet_name)
                # get the year, quarter and city from the sheet name
                year, quarter, city = sheet_name.split("_")
                df_new["year"] = year
                df_new["quarter"] = quarter
                df_new["city"] = city
                dict_dfs[sheet_name] = df_new
            save_xls(dict_df=dict_dfs, path='./add_columns_from_sheet_name/' + "new_" + file)

# Step 2: split every workbook into one output file per city suffix
root_dir = './add_columns_from_sheet_name/'
for root, subFolders, files in os.walk(root_dir):
    for file in files:
        if '.xlsx' in file:
            file_path = os.path.join(root_dir, file)
            dfs = pd.read_excel(file_path, sheet_name=None)
            df_out = pd.concat(dfs.values(), keys=dfs.keys())
            for n, g in df_out.groupby(df_out.index.to_series().str[0].str.rsplit('_', n=1).str[-1]):
                print(n)
                # a timestamp keeps files from different workbooks from overwriting each other
                timestr = datetime.utcnow().strftime('%Y%m%d-%H%M%S%f')[:-3]
                g.droplevel(level=0).dropna(how='all', axis=1).reset_index(drop=True).to_excel(f'./output/{n}_{timestr}.xlsx', index=False)

# Step 3: collect the distinct city prefixes of the generated files
file_set = set()
file_dir = './output/'
file_list = os.listdir(file_dir)
for file in file_list:
    data_type = file.split('_')[0]
    file_set.add(data_type)
print(file_set)

# Step 4: append all files of the same city into one DataFrame each
file_dir = './output'
file_list = os.listdir(file_dir)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
df4 = pd.DataFrame()
for file in file_list:
    if '.xlsx' in file:
        df_temp = pd.read_excel(os.path.join(file_dir, file))
        if 'bj' in file:
            df1 = df1.append(df_temp)
        elif 'sh' in file:
            df2 = df2.append(df_temp)
        elif 'gz' in file:
            df3 = df3.append(df_temp)
        elif 'sz' in file:
            df4 = df4.append(df_temp)

# Step 5: write every city DataFrame to its own sheet in the final workbook
def dfs_tabs(df_list, sheet_list, file_name):
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    for dataframe, sheet in zip(df_list, sheet_list):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0, index=False)
    writer.save()

# list of dataframes and sheet names
dfs = [df1, df2, df3, df4]
sheets = ['bj', 'sh', 'gz', 'sz']

# run function
dfs_tabs(dfs, sheets, './final/final_result.xlsx')
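For comparison, a more concise route that skips the intermediate files entirely: collect every sheet from every workbook into per-city lists, concat each list once, and write one sheet per city. A sketch under the same directory and sheet-naming assumptions as above:

import glob
from collections import defaultdict
import pandas as pd

groups = defaultdict(list)
for path in glob.glob('./original/*.xlsx'):
    for sheet_name, df in pd.read_excel(path, sheet_name=None).items():
        year, quarter, city = sheet_name.split('_')
        groups[city].append(df.assign(year=year, quarter=quarter, city=city))

with pd.ExcelWriter('./final/final_result.xlsx') as writer:
    for city, frames in groups.items():
        # one sheet per city, all years/quarters appended together
        pd.concat(frames, ignore_index=True).dropna(how='all', axis=1).to_excel(writer, sheet_name=city, index=False)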
I have this dataframe.
df = pd.DataFrame({'date': np.array(['2021-04-11', '2021-04-12', '2021-04-13', '2021-04-14',
'2021-04-15', '2021-04-16', '2021-04-17', '2021-04-18',
'2021-04-19', '2021-04-20', '2021-04-21', '2021-04-22',
'2021-04-23', '2021-04-24', '2021-04-25', '2021-04-26',
'2021-04-27', '2021-04-28', '2021-04-29', '2021-04-30',
'2021-05-01' ,'2021-05-02', '2021-05-03', '2021-05-04',
'2021-05-05', '2021-05-06', '2021-05-07']),
'value': np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,
24,25,26,27])})
I want to split it into 3 parts (train, val and test).
For example:
split = 0.7  # split percentage
last = 7     # keep the last 7 days for test data
split_idx = int(df.shape[0] * split)
train_df = df[:split_idx]
val_df = df[split_idx:-last]
test_df = df[(train_df.shape[0] + val_df.shape[0]):]
So, now I have:
len(train_df), len(val_df), len(test_df) = 18, 2, 7
I want the lengths to be divisible by 7, so:
if len(train_df) % 7 != 0:
    # move those rows to the beginning of val_df
    val_df.loc[0] = ...
    # drop those rows from train_df
    train_df.drop(train_df.tail(len(train_df) % 7).index, inplace=True)
If the length of train_df is not divisible by 7, then I want to move its last rows to the beginning of val_df and then drop them from train_df. The same applies to val_df. test_df will always have at least 7 values, so if it has more I will just drop the extras.
So, I found an answer!
if len(train_df) % 7 != 0:
    # move those rows to the beginning of val_df
    rows = train_df.tail(len(train_df) % 7)
    val_df = pd.concat([rows, val_df]).sort_index()
    # drop those rows from train_df
    train_df.drop(rows.index, inplace=True)
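A compact alternative (a sketch, using the same split and last values as above) is to round the split index down to a multiple of 7 before slicing, so nothing has to be moved afterwards:

split = 0.7
last = 7

split_idx = int(len(df) * split)
split_idx -= split_idx % 7          # round down so len(train_df) is a multiple of 7
train_df = df.iloc[:split_idx]
val_df = df.iloc[split_idx:-last]   # the displaced rows land in val_df automatically
test_df = df.iloc[-last:]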
I have a dataframe like this:
Image Id
0 1a.jpg w_1
1 c4.jpg w_1
2 b01.jpg w_2
3 d5.jpg w_1
4 df.jpg w_f
5 c2.jpg w_2
6 ab.jpg w_3e
What is the pandas way to return this output?
output:(1a.jpg,c4.jpg,d5.jpg)(b01.jpg,c2.jpg)(df.jpg)(ab.jpg)
Use groupby and convert values to tuples first and then to list:
L = df.groupby('Id', sort=False)['Image'].apply(tuple).tolist()
print (L)
[('1a.jpg', 'c4.jpg', 'd5.jpg'), ('b01.jpg', 'c2.jpg'), ('df.jpg',), ('ab.jpg',)]
Similarly, to convert to lists instead of tuples:
L1 = df.groupby('Id', sort=False)['Image'].apply(list).tolist()
print (L1)
[['1a.jpg', 'c4.jpg', 'd5.jpg'], ['b01.jpg', 'c2.jpg'], ['df.jpg'], ['ab.jpg']]
And if you need strings:
s = ''.join('(' + df.groupby('Id', sort=False)['Image'].apply(', '.join) +')')
print (s)
(1a.jpg, c4.jpg, d5.jpg)(b01.jpg, c2.jpg)(df.jpg)(ab.jpg)
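As a minor variation, the same grouping can also be written with agg, which reads a bit more declaratively and gives the same result:

L = df.groupby('Id', sort=False)['Image'].agg(tuple).tolist()
L1 = df.groupby('Id', sort=False)['Image'].agg(list).tolist()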