df.to_excel captures only last request of iteration pandas - python-3.x

I am currently trying to iterate through a large dataset and use to_excel() to write my dataframe to Excel.
My code:
writer = pd.ExcelWriter(r'report.xlsx')
for x in range(3):
    sql = ("select date_added, fruit_id from market")
    data = pd.read_sql(sql, c)
    df = pd.DataFrame(data)
    df.to_excel(writer)
writer.save()
When this is run, I am only capturing the 3rd request in my range. Is there a different method that would allow me to capture all 3 requests in my range?

There does not appear to be an ExcelWriter.append method. Instead, build a list of the dataframes and pd.concat them at the end.
writer = pd.ExcelWriter(r'report.xlsx')
dfs = []
for x in range(3):
    sql = ("select date_added, fruit_id from market")
    data = pd.read_sql(sql, c)
    df = pd.DataFrame(data)
    dfs.append(df)
df = pd.concat(dfs)
df.to_excel(writer)
writer.save()
Alternatively, pd.DataFrame.to_excel does have a startrow argument that could be used to append.
writer = pd.ExcelWriter(r'report.xlsx')
row = 0
for x in range(3):
    sql = ("select date_added, fruit_id from market")
    data = pd.read_sql(sql, c)
    df = pd.DataFrame(data)
    df.to_excel(writer, startrow=row)
    row += len(df) + 1  # +1 accounts for the header row that to_excel writes with each frame
writer.save()
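A further variant (not in the original answer, just a sketch of the same idea) is to give each iteration its own sheet via the sheet_name argument; the sheet names below are made up for illustration:
writer = pd.ExcelWriter(r'report.xlsx')
for x in range(3):
    sql = ("select date_added, fruit_id from market")
    data = pd.read_sql(sql, c)
    df = pd.DataFrame(data)
    df.to_excel(writer, sheet_name=f"request_{x}")  # one sheet per query result
writer.save()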

Related

Passing DF from function

I wrote a function which builds a df inside it, and I want to use that df afterwards outside the function or in another function. How can I do this without running into any name-recognition (scoping) problems?
Thanks a lot :)
The code:
def DisplayDataFrame():
    file_path = filedialog.askopenfilename()
    df1 = pd.read_excel(file_path)
    cols = list(df1.columns)
    tree = ttk.Treeview(root)
    tree.pack()
    tree["columns"] = cols
    for i in cols:
        tree.column(i, anchor="w")
        tree.heading(i, text=i, anchor='w')
    for index, row in df1.iterrows():
        tree.insert("", 0, text=index, values=list(row))
    option = df1.index()
Do you mean using df1 from your DisplayDataFrame() in other functions? If so, you can return df1 from your function, like this:
def DisplayDataFrame():
    '''
    your original code to define df1
    '''
    return df1

dataframe = DisplayDataFrame()
Then you can reuse the dataframe in other functions.
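For example (show_summary is a hypothetical follow-up function, not from the original post), the returned frame can then be handed to any other function:
def show_summary(df):
    # works on whatever DataFrame is passed in
    print(df.describe())

dataframe = DisplayDataFrame()
show_summary(dataframe)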

How to quickly apply a function to a list of DataFrames in Python?

I have a list of DataFrames with equal numbers of columns and rows but different values, such as
data = [df1, df2, df3, ..., dfn].
How can I apply a function to each dataframe in the list data? I used the following code, but it does not work:
data = [df1, df2, df3, ..., dfn]

def maxloc(data):
    data['loc_max'] = np.zeros(len(data))
    for i in range(1, len(data)-1):  # from the second value on
        if data['q_value'][i] >= data['q_value'][i-1] and data['q_value'][i] >= data['q_value'][i+1]:
            data['loc_max'][i] = 1
    return data

df_list = [df.pipe(maxloc) for df in data]
It seems to me the problem is in your maxloc() function, as this code works.
I also added the maximum value to the return of maxloc.
from random import randrange
import pandas as pd

def maxloc(data_frame):
    max_index = data_frame['Value'].idxmax(0)
    maximum = data_frame['Value'][max_index]
    return max_index, maximum

# create test list of data-frames
data = []
for i in range(5):
    temp = []
    for j in range(10):
        temp.append(randrange(100))
    df = pd.DataFrame({'Value': temp}, index=(range(10)))
    data.append(df)

df_list = [df.pipe(maxloc) for df in data]
for i, (index, value) in enumerate(df_list):
    print(f"Data-frame {i:02d}: maximum = {value} at position {index}")

How to efficiently write pandas melt and join in order to run inside containers without causing SystemOOM exceptions?

I have code running inside a Docker container on a Kubernetes cluster. The issue is that, regardless of the memory limit set, the code sometimes just fails, i.e. the Airflow task running the code fails.
I tried checking memory and CPU utilization. Both stay under their limits and the pod never restarts. Note: my pod has only one container running, and that is Airflow's worker. It should be noted that CPU utilization tends towards 1 and then the task fails.
Since the dataset is transformed correctly, I am not posting sample data. I am looking for efficiency in terms of resources.
My code is:
# Below code is required at the start of each script
from athena.api import get_pandas_df
import pandas as pd

last_correct_constant = 11

def remove_unwanted_cols(df, col_name):
    unwanted_cols = []
    for _col in df.columns:
        if _col.startswith("unnamed"):
            if int(_col.split("unnamed__")[-1]) > last_correct_constant:
                unwanted_cols.append(_col)
        else:
            if not _col.startswith(col_name):
                unwanted_cols.append(_col)
    df = df.drop(columns=unwanted_cols)
    return df

def sanitize_column_names(df):
    corrected_columns = []
    for column in df.columns:
        corrected_columns.append(
            column
            .replace("(", "_")
            .replace(")", "_")
            .replace(" ", "_")
            .replace("/", "_")
            .replace(".", "_")
            .replace(":", "_")
            .lower())
    df.columns = corrected_columns
    return df

def get_first_row_as_header(df):
    df.columns = df.iloc[0]
    print("Columns are: ")
    print(df.columns)
    print("Head is: ")
    df = df.iloc[1:]
    print(df.head(1))
    return df

def remove_cols_for_join(df, col_name):
    unwanted_cols = []
    for _col in df.columns:
        if _col != 'period' and (not _col.startswith(col_name)) and _col != 'Markets':
            unwanted_cols.append(_col)
    print("Unwanted cols are: ")
    print(unwanted_cols)
    df = df.drop(columns=unwanted_cols)
    return df

def main(*args, **kwargs):
    """ Put your main logic here.
    Help:
        To get pandas dataframe of upstream nodes.
            data_frame = get_pandas_df("<upstream_node_name>")
            Example: data_frame = get_pandas_df("S3")
    Return:
        output_data_frame {pandas.DataFrame}
        This data frame will be transferred to downstream nodes.
    """
    # read dataframes
    df = get_pandas_df("CSV")
    df = sanitize_column_names(df)
    df_sales = df
    df_gr_vol = df
    df_gr_val = df

    print("remove unwanted cols for individual melts")
    df = remove_unwanted_cols(df, 'value_offtake_000_rs__')
    df_sales = remove_unwanted_cols(df_sales, 'sales_volume__volume_litres__')
    df_gr_vol = remove_unwanted_cols(df_gr_vol, 'gr_vol_ya')
    df_gr_val = remove_unwanted_cols(df_gr_val, 'gr_val_ya')

    df = get_first_row_as_header(df)
    df_sales = get_first_row_as_header(df_sales)
    df_gr_vol = get_first_row_as_header(df_gr_vol)
    df_gr_val = get_first_row_as_header(df_gr_val)

    print("melting dataframes")
    table_columns = df.columns
    df = pd.melt(
        df, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='value_offtake_000_rs__')
    df = df[(df["Markets"] != '')]

    table_columns = df_sales.columns
    df_sales = pd.melt(
        df_sales, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='sales_volume__volume_litres__')
    df_sales = df_sales[(df_sales["Markets"] != '')]
    df_sales = remove_cols_for_join(df_sales, 'sales_volume__volume_litres__')

    table_columns = df_gr_vol.columns
    df_gr_vol = pd.melt(
        df_gr_vol, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='gr_vol_ya')
    df_gr_vol = df_gr_vol[(df_gr_vol["Markets"] != '')]
    df_gr_vol = remove_cols_for_join(df_gr_vol, 'gr_vol_ya')

    table_columns = df_gr_val.columns
    df_gr_val = pd.melt(
        df_gr_val, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='gr_val_ya')
    df_gr_val = df_gr_val[(df_gr_val["Markets"] != '')]
    df_gr_val = remove_cols_for_join(df_gr_val, 'gr_val_ya')

    print("Before merge: ")
    for _col in df.columns:
        print(_col)
    print("==================")
    for _col in df_sales.columns:
        print(_col)

    df = pd.merge(df, df_sales, on=['Markets', 'period'])
    df = pd.merge(df, df_gr_val, on=['Markets', 'period'])
    df = pd.merge(df, df_gr_vol, on=['Markets', 'period'])
    df = sanitize_column_names(df)
    return df
I expect this code to run efficiently in terms of memory and CPU. My current limits are 32 GB of memory and 10 CPU cores.
The data contains 14 rows and 637 columns, which I transform in the aforementioned way.
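As an illustration only (not part of the original question), the melt call shape used above can be seen on a tiny frame; the column names here are made up:
import pandas as pd

# Hypothetical wide frame: one id column plus one column per period.
wide = pd.DataFrame({
    'Markets': ['North', 'South'],
    'jan_20': [10, 20],
    'feb_20': [30, 40],
})

# Same call shape as in the question: keep 'Markets', unpivot the period columns.
long_df = pd.melt(
    wide, id_vars=['Markets'], value_vars=['jan_20', 'feb_20'],
    var_name='period', value_name='value_offtake')
print(long_df)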

looping many excels into separate dataframes

I am relatively new to Python 3 and I need help looping in 4 Excel spreadsheets as separate dfs. I can do this by manually typing pd.read_excel(filepath, index_col=0) for each of the 4 filepaths, but I was looking for a robust way to simply loop over all the filepaths I have and create a df per filepath (Excel spreadsheet). Can anyone help me? Thanks.
filepath = '/Users/User/Desktop/Tax\Year.xlsx'
filepath2 = '/Users/User/Desktop/Tax\Year2.xlsx'
filepath3 = '/Users/User/Desktop/Tax\Year3.xlsx'
filepath4 = '/Users/User/Desktop/Tax\Year4.xlsx'
df = pd.read_excel(filepath, index_col=0)
df2 = pd.read_excel(filepath2, index_col=0)
df3 = pd.read_excel(filepath3, index_col=0)
df4 = pd.read_excel(filepath4, index_col=0)
I would add a '1' to the first filepath (Year1.xlsx) so all four files follow the same naming pattern.
dict_of_dfs = {}
for n in range(1, 5):
    filepath = '/Users/User/Desktop/Tax/Year' + str(n) + '.xlsx'
    df = pd.read_excel(filepath, index_col=0)
    dict_of_dfs[n] = df

# retrieve your dfs...
df1 = dict_of_dfs[1]
df2 = dict_of_dfs[2]
# etc...
Further to the OP's question below: use walk from the os library to load the filenames in a directory and collect them in a list.
from os import walk

directory = '/Users/User/Desktop/Tax/Year'
f = []
for (dirpath, dirnames, filenames) in walk(directory):
    f.extend(filenames)
    break
Then you can access your filenames through a for loop:
for n in f:
    filepath = '/Users/User/Desktop/Tax/Year' + n
    df = pd.read_excel(filepath, index_col=0)
    dict_of_dfs[n] = df
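As a variation on the same idea (not from the original answer), pathlib's glob can collect the .xlsx files and handle the path joining; the directory path here is assumed:
from pathlib import Path
import pandas as pd

directory = Path('/Users/User/Desktop/Tax')  # assumed location of the workbooks

dict_of_dfs = {}
for path in sorted(directory.glob('*.xlsx')):
    # key each DataFrame by the file's stem, e.g. 'Year1'
    dict_of_dfs[path.stem] = pd.read_excel(path, index_col=0)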

Saving loop output to multiple excel sheets

I have a csv file with multiple years of water data. I've broken up each water year into its own data frame. Now I want to do some math on those water years and then save each water year to its own Excel sheet.
The math part of the code is working, but I'm having trouble with the final step of naming and saving the output of the loop correctly. Right now the code creates the Excel file and the sheet names correctly, but the loop just saves the final iteration to all the sheets. I've googled around but I can't get the answers to similar questions to work. This is my first Python program, so advice would be appreciated.
import pandas as pd

with open(r'wft.csv') as csvfile:
    tdata = pd.read_csv(csvfile)

tdata['date'] = pd.to_datetime(tdata['date'], format='%m/%d/%Y %H:%M')
tdata = tdata.set_index(['date'])

wy2015 = tdata.loc['2014-10-1 00:00' : '2015-7-1 00:00']
wy2016 = tdata.loc['2015-10-1 00:00' : '2016-7-1 00:00']
wy2017 = tdata.loc['2016-10-1 00:00' : '2017-7-1 00:00']

writer = pd.ExcelWriter('WFT.xlsx', engine='xlsxwriter')
wyID = [wy2014, wy2015, wy2016, wy2017]
seq = ['wy2014', 'wy2015', 'wy2016', 'wy2017']

for df in wyID:
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method='first', ascending=0)
    df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
    for name in seq:
        df.to_excel(writer, sheet_name=name)
writer.save()
Issues in your code
writer = pd.ExcelWriter('WFT.xlsx', engine='xlsxwriter')
wyID = [wy2014, wy2015, wy2016, wy2017]
seq = ['wy2014', 'wy2015', 'wy2016', 'wy2017']
for df in wyID:  # outer loop that figures out wy20xx
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method='first', ascending=0)
    df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
    for name in seq:  # you loop through all the names and write all sheets every time; you want to write just one
        df.to_excel(writer, sheet_name=name)
writer.save()
Instead try this.
for i, df in enumerate(wyID):  # outer loop that figures out wy20xx
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method='first', ascending=0)
    df['cunnanes'] = (df['rank'] - 0.4)/(len(df['rank']) + 0.2)*100
    df.to_excel(writer, sheet_name=seq[i])  # writes to the correct wy20xx sheet
writer.save()  # Now you're done writing the excel
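A slightly different spelling of the same fix, assuming the same writer, wyID and seq lists as above, is to zip each frame with its sheet name:
# Same idea as above, pairing each water-year frame with its sheet name.
for name, df in zip(seq, wyID):
    df = df.sort_values(by=['turbidity'], ascending=False)
    df['rank'] = df['turbidity'].rank(method='first', ascending=False)
    df['cunnanes'] = (df['rank'] - 0.4) / (len(df) + 0.2) * 100
    df.to_excel(writer, sheet_name=name)
writer.save()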
