Variable assignment before a function - python-3.x

I have created a package to quickly transform data using pandas and XlsxWriter.
This worked pretty well and I wrote a few functions successfully. But recently I've hit a wall:
For a few functions I need to define variables first, but they are not basic types (list, tuple, str, etc.); for instance, a DataFrame. I've looked into global variables and saw they are not recommended (and I wouldn't know where to put them), and I also looked into classes but I don't know how to solve my problem with them. I've also tried creating an empty DataFrame, but I got an empty DataFrame back after the function.
What I'm trying to do is a read function with pandas for .csv or .xlsx files and a save function using the XlsxWriter engine.
The goal is to change as little as possible in the code so I can transform data frequently and rapidly (e.g. I have functions doing LEFT and RIGHT like in Excel, or even MIDDLE with column numbers) and keep the code in main.py short and simple.
Here is a stripped-down version of my code, which uses two Python files (main.py and format_operations.py). I have added comments where I'm having issues.
Thanks in advance for your help!
"""
main.py
"""
import format_operations as tbfrm #import another python file in the same folder
import pandas as pd
import numpy as np
import xlsxwriter.utility
#file settings
file_full_path= "C:/Tests/big_data.xlsx"
file_save_to= "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to= "Xlswriter"
dfname = ??? #I need to create the variable but I don't know how
tbfrm.FCT_universal_read(dfname,file_full_path) #CAN'T GET IT TO WORK
#column operations and formatting
columns_numeric = [3,6] # (with pandas) list of columns with number values by iloc number, starts at 0 which is column A in Excel
tbfrm.FCT_columns_numeric(dfname,columns_numeric) #example of a WORKING function (if dfname is defined)
#write with Xlsxwriter engine
XLWRITER_DF = ??? #same problem as before, how to create the variable?
workbookvarname = ??? #same here
worksheetvarname = ??? # same here
tbfrm.FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname) #CAN'T GET IT TO WORK
#### WORKING piece of code I want to execute after saving with Xlsxwriter engine ####
worksheet.set_zoom(80)
# Conditional formatting
color_range_1 = "J1:J{}".format(number_rows+1)
FORMAT1 = workbook.add_format({'bg_color': '#FFC7CE','font_color': '#9C0006'})
FORMAT2 = workbook.add_format({'bg_color': '#C6EFCE','font_color': '#006100'})
worksheet.conditional_format(color_range_1, {'type': 'bottom','value': '5','format': FORMAT1})
worksheet.conditional_format(color_range_1, {'type': 'top','value': '5','format': FORMAT2})
Other file:
"""
format_operations.py
"""
import pandas as pd
import numpy as np
import xlsxwriter.utility
def FCT_universal_read(dfname,file_full_path):
    if ".xls" in file_full_path:
        dfname = pd.read_excel(file_full_path) #optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
    if ".csv" in file_full_path:
        dfname = pd.read_csv(file_full_path)

# save file with XLSXWriter engine for additional options to pandas
def FCT_df_xlsxwriter(XLWRITER_DF,dfname,file_save_to,sheet_name_save_to,workbookvarname,worksheetvarname):
    XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to, encoding='utf-8')
    workbookvarname = XLWRITER_DF.book
    worksheetvarname = XLWRITER_DF.sheets[sheet_name_save_to]

#format as numbers
def FCT_columns_numeric(dfname,columns_numeric):
    for x in columns_numeric:
        dfname.iloc[:,x] = pd.to_numeric(dfname.iloc[:,x])

Your FCT_universal_read function should not modify a dataframe but instead return a new one:
def FCT_universal_read(file_full_path):
    if file_full_path.split('.')[-1] in ("xls", "xlsx"):
        df = pd.read_excel(file_full_path) #optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
    if file_full_path.split('.')[-1] == "csv":
        df = pd.read_csv(file_full_path)
    return df
And in your main, do:
dfname = tbfrm.FCT_universal_read(file_full_path)
The same applies to FCT_df_xlsxwriter: rewrite it with a return so that you can do:
XLWRITER_DF, workbookvarname,worksheetvarname = tbfrm.FCT_df_xlsxwriter(dfname,file_save_to,sheet_name_save_to)
To grasp how Python handles the arguments you pass to a function, you should read these blog posts:
https://jeffknupp.com/blog/2012/11/13/is-python-callbyvalue-or-callbyreference-neither/
https://robertheaton.com/2014/02/09/pythons-pass-by-object-reference-as-explained-by-philip-k-dick/
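The one-line summary of those posts, as a minimal sketch (the function names here are made up for illustration): rebinding a parameter inside a function only changes that local name, so the caller never sees the new DataFrame unless the function returns it.
import pandas as pd

def read_without_return(df, file_full_path):
    # Rebinding the local name 'df' does not affect the caller's variable.
    df = pd.read_excel(file_full_path)

def read_with_return(file_full_path):
    # Returning the new object lets the caller bind it to whatever name it likes.
    return pd.read_excel(file_full_path)

dfname = None
read_without_return(dfname, "C:/Tests/big_data.xlsx")
print(dfname)  # still None

dfname = read_with_return("C:/Tests/big_data.xlsx")  # now a DataFrame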

You need to update FCT_universal_read so that it returns the dataframe you want. There is no need to define the dataframe outside the function; simply create and return it:
df = FCT_universal_read('/your/file/path')
def FCT_universal_read(file_full_path):
    if ".xls" in file_full_path:
        df = pd.read_excel(file_full_path) #optional arguments: sheetname='Sheet1', header=0, dtype=object to preserve values
        return df
    if ".csv" in file_full_path:
        df = pd.read_csv(file_full_path)
        return df

Thanks so much to both of you! I get the logic now :) Thanks also for the documentation.
I successfully managed to fix both functions; I had been struggling for several hours.
I like the .split() call you used, which ensures the script only looks at the extension.
I updated FCT_df_xlsxwriter and FCT_universal_read as you suggested. Here are both functions corrected:
'''
format_operations.py
'''
def FCT_universal_read(file_full_path):
    if "xls" in file_full_path.split('.')[-1]:
        dfname = pd.read_excel(file_full_path) #example: C:/Tests/Bigdata.xlsx
        return dfname
    if "csv" in file_full_path.split('.')[-1]:
        dfname = pd.read_csv(file_full_path)
        return dfname

def FCT_df_xlsxwriter(dfname,file_save_to,sheet_name_save_to):
    XLWRITER_DF = pd.ExcelWriter(file_save_to, engine='xlsxwriter')
    dfname.to_excel(XLWRITER_DF, sheet_name=sheet_name_save_to, encoding='utf-8')
    workbook = XLWRITER_DF.book
    worksheet = XLWRITER_DF.sheets[sheet_name_save_to]
    return XLWRITER_DF, workbook, worksheet
Here is how I call the two functions:
'''
main.py
'''
import format_operations as tbfrm
import pandas as pd
import xlsxwriter.utility
#settings
file_full_path= "C:/Tests/big_data.xlsx"
file_save_to= "C:/Tests/Xlsxwriter.xlsx"
sheet_name_save_to= "Xlswriter"
#functions
FILE_DF = tbfrm.FCT_universal_read(file_full_path)
XLWRITER_DF,workbook,worksheet = tbfrm.FCT_df_xlsxwriter(FILE_DF,file_save_to,sheet_name_save_to)
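One detail worth double-checking (not covered in the answers above): nothing is written to disk until the ExcelWriter is finalised, so after applying the xlsxwriter-specific formatting from the original snippet (set_zoom, conditional_format, ...) to the returned workbook and worksheet, the writer still has to be saved. Depending on your pandas version this is save() or close():
worksheet.set_zoom(80)
# ... conditional_format calls from the original snippet ...
XLWRITER_DF.save()  # on recent pandas versions, use XLWRITER_DF.close() instead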


Return Excel file from Azure Function via HTTP using Python

Use Case
Within a Logic App, I create some data using an Azure Function with a Pandas DataFrame. After employing the Azure Function, I want to further process the data in .xlsx format within the Logic App. Therefore I need the Azure Function to return an .xlsx file.
Problem
I am unable to format the HTTPResponse of my Azure Function so that I can further process the .xlsx file within the Logic App. Basically I require the correct conversion from my Pandas DataFrame to the HTTPResponse.
What to do in convert_to_xlsx() (see below) to achieve the desired output?
Toy Example
import azure.functions as func
import logging
import numpy as np
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

def main(req: func.HttpRequest) -> func.HttpResponse:
    df = pd.DataFrame(np.random.randint(0, 100, size=(2, 4)), columns=list('ABCD'))
    excel = convert_to_xlsx(df)
    return func.HttpResponse(excel, status_code=200)

def convert_to_xlsx(df):
    # Create excel representation
    wb = Workbook()
    sheet = wb.active
    for row in dataframe_to_rows(df, index=False, header=True):
        sheet.append(row)
    logging.info('sheet: ' + str(list(sheet.values))) # So far, so good.
    # Convert for HTTPResponse
    res = ''
    res = do_something(sheet) # <---- What to do here?
    return res
What I tried
I tried converting the data to openpyxl's Workbook, which worked fine. But then I did not know how to proceed from here to convert from a Workbook.
Also, there is this answer using xlrd, which I could not get to work for my use case. Additionally, xlrd does not support .xlsx anymore. Based on that post, I tried the following, which did not work as intended:
def convert_to_xlsx(df):
    # ... see above
    # Only returns column names without values.
    # Also, apparently not the right format?
    return f'{[row for row in sheet]}'
One option might be to return some kind of JSON response and then convert it back to an Excel file within the Logic App. But I hoped that I might be able to skip that and immediately return a .xlsx file from the function as the HTTP payload.
In order to obtain an Excel file you also have to manipulate the header; see https://stackoverflow.com/a/67276395/7641854.
Without the changed header you will obtain a zip object, I assume.
Thus, a working example that returns an Excel file via an Azure Function looks like this:
import io
import pandas as pd
import azure.functions as func

def main(req: func.HttpRequest) -> func.HttpResponse:
    d = {'col1': [1, 2], 'col2': [3, 4]}
    df = pd.DataFrame(data=d)
    buffer = io.BytesIO()
    df.to_excel(buffer, index=False)
    return func.HttpResponse(
        buffer.getvalue(),
        headers={"Content-Disposition": 'attachment; filename="test.xlsx"'},
        mimetype='application/vnd.ms-excel',
        status_code=200,
    )
The approach could be to write the output to a buffer and return the buffer's content within the HTTPResponse:
...
from io import BytesIO
def main(req: func.HttpRequest) -> func.HttpResponse:
    df = pd.DataFrame(np.random.randint(0, 100, size=(2, 4)), columns=list('ABCD'))
    buffer = BytesIO()
    excel_buf = df.to_excel(buffer)
    return func.HttpResponse(buffer.getvalue(), status_code=200)
However, due to concerns about file size and execution time when returning large files via HTTP, I opted to upload the resulting Excel file to Azure Blob Storage instead, using something like this (snippet):
...
out_blob = BlobClient.from_connection_string(...)
excel_buf = df.to_excel(buffer)
out_blob.upload_blob(buffer.getvalue())
...
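For completeness, here is a self-contained sketch of that blob-upload variant, assuming the azure-storage-blob package; the connection string, container name and blob name are placeholders:
import io
import pandas as pd
from azure.storage.blob import BlobClient

def upload_df_as_xlsx(df: pd.DataFrame, conn_str: str) -> None:
    # Write the workbook into an in-memory buffer instead of a local file.
    buffer = io.BytesIO()
    df.to_excel(buffer, index=False)

    # Push the buffer's bytes to blob storage (placeholder container/blob names).
    out_blob = BlobClient.from_connection_string(
        conn_str, container_name="results", blob_name="result.xlsx"
    )
    out_blob.upload_blob(buffer.getvalue(), overwrite=True)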

Loop over files

I have a series of files named file_0001.csv, file_0002.csv, ... file_1000.csv, etc. I need to read them iteratively by creating a list of the filenames, as in:
import numpy as np
import pandas as pd
for fileName in files:
    data = pd.read_csv("folder" + fileName)
    data = data.values
How do I create the list of file names by checking the first and last file?
Thank you for your help.
If you are trying to create the specific file names, you can loop from 1 to 1000 and build each filename:
import numpy as np
import pandas as pd
for i in range(1, 1001):
    num = str(i)
    filename = "file_" + num.zfill(4) + ".csv"
    #data = pd.read_csv("folder" + filename)
    #data = data.values
    print(filename)
Output (last 10 lines):
file_0991.csv
file_0992.csv
file_0993.csv
file_0994.csv
file_0995.csv
file_0996.csv
file_0997.csv
file_0998.csv
file_0999.csv
file_1000.csv
You should use pathlib from the standard library. Then you can simply do
import pandas as pd
from pathlib import Path
# get file number assuming the name format "file_number.csv"
def get_file_number(file_path):
    return int(file_path.stem.split("_")[1])

folder_path = Path("path/to/folder")
# sort files by file number
files = sorted(folder_path.glob("file_*.csv"), key=get_file_number)
for file in files:
    print(file.name)  # just to check
    data = pd.read_csv(file)
    # do something with data
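If you specifically want to build the list from the first and last files present (as the question asks), the same get_file_number helper can supply the range bounds; a small sketch combining both answers, assuming folder_path and the helper above are in scope:
# Derive the numeric range from the first and last file actually on disk.
existing = sorted(folder_path.glob("file_*.csv"), key=get_file_number)
first, last = get_file_number(existing[0]), get_file_number(existing[-1])

filenames = [f"file_{i:04d}.csv" for i in range(first, last + 1)]
for name in filenames:
    data = pd.read_csv(folder_path / name)
    # do something with data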

Reading a CSV and formatting it into a dictionary with pandas in Python

I am trying to convert the code below to pandas. How can I make the two reader implementations equivalent so that the account_strats comprehension works? The pandas implementation is faulty and gives the error shown below; how can I fix it?
CSV:
Name,AccountName,Type
XXX,Account1,RSIStrategy
XYB,Account1,MACDStrategy2.0
XBR,Account1,STDandRegressionStrategy
Code that works:
import csv
for account in range(2):
    file = open("SavedStrats.csv")
    reader = csv.DictReader(file)
    account_strats = [strat for strat in reader if strat["AccountName"] == account]
    file.close()
Pandas implementation (faulty):
import pandas as pd
for account in range(2):
    reader = pd.read_csv("SavedStrats.csv").to_dict('dict')
    account_strats = [strat for strat in reader if strat["AccountName"] == account]
Error:
TypeError: string indices must be integers
For your requirement, what you need is to_dict('records')
import pandas as pd
for account in range(2):
    reader = pd.read_csv("SavedStrats.csv").to_dict('records')
    account_strats = [strat for strat in reader if strat["AccountName"] == account]
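To see why the original version failed (purely an illustration, not part of the fix): to_dict('dict') keys the result by column, so iterating over it yields the column names as plain strings, while to_dict('records') yields one dict per row, just like csv.DictReader does:
import pandas as pd

df = pd.read_csv("SavedStrats.csv")

# 'dict' orientation: {column -> {row index -> value}}. Iterating over it
# yields the column-name strings, hence "string indices must be integers"
# when the comprehension tries strat["AccountName"].
print(df.to_dict('dict'))

# 'records' orientation: one dict per row, matching the csv.DictReader version.
print(df.to_dict('records'))
# [{'Name': 'XXX', 'AccountName': 'Account1', 'Type': 'RSIStrategy'}, ...]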

How to reformat the resulting Excel sheet after combining multiple Excel files with pandas in Python

I tried combining multiple sheets from multiple Excel files into a single Excel file using pandas, but in the resulting sheet the row labels are the Excel file names and each sheet appears as a column name. The output is messy.
How do I get it into the proper format? Here is the code:
import pandas as pd
import os
from openpyxl.workbook import Workbook
os.chdir("C:/Users/w8/PycharmProjects/decorators_exaample/excel_files")
path = "C:/Users/w8/PycharmProjects/decorators_exaample/excel_files"
files = os.listdir(path)
AllFiles = pd.DataFrame()
for f in files:
    info = pd.read_excel(f, sheet_name=None)
    AllFiles = AllFiles.append(info, ignore_index=True)
writer = pd.ExcelWriter("Final.xlsx")
AllFiles.to_excel(writer)
writer.save()
The final Excel file looks like this (screenshot of the messy result omitted).
You don't actually need the os and Workbook parts. Removing them cleans up your code and makes errors easier to find. I assume that path is the folder where all the Excel files are stored:
import pandas as pd
import glob

path = r"C:\Users\w8\PycharmProjects\decorators_exaample\excel_files"
file_list = glob.glob(path + r"\*.xlsx")

df = pd.DataFrame()
for f in file_list:
    info = pd.read_excel(f)
    df = df.append(info)

df.to_excel(r"C:\Users\w8\PycharmProjects\decorators_exaample\excel_files\new_filename.xlsx")
It should be as easy as that.
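If you do need every sheet of every workbook (the original code passes sheet_name=None, which makes read_excel return a dict of DataFrames and is what produced the messy result), one way is to concatenate the individual sheets; a sketch under the same folder assumption, writing the result outside the source folder so it is not picked up on a rerun:
import glob
import pandas as pd

path = r"C:\Users\w8\PycharmProjects\decorators_exaample\excel_files"

frames = []
for f in glob.glob(path + r"\*.xlsx"):
    # sheet_name=None returns a {sheet name: DataFrame} dict.
    sheets = pd.read_excel(f, sheet_name=None)
    frames.extend(sheets.values())

combined = pd.concat(frames, ignore_index=True)
combined.to_excel(r"C:\Users\w8\PycharmProjects\decorators_exaample\Final.xlsx", index=False)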

How to push a live pandas DataFrame and index it to fit into my Tkinter table?

I am trying to push my MQTT data to my Tkinter table, which I created using the pandastable module. I receive the data as a list, so I first created a CSV file, labelled it manually, and then pushed my list into that CSV file. The table therefore has two parts: first it loads the DataFrame converted from the CSV file, which acts as the history part, and then I need to push the most recent DataFrame (in the same format, with the CSV file's column index as the DataFrame column index) into the table while it is open. I also keep saving the recent DataFrames to the CSV file, so this cycle can repeat every time I open the table. The problem is that I can't figure out where I am going wrong.
This is my table script:
import tkinter as tk
import pandas as pd
from pandastable import Table, TableModel
from threading import Thread
import time
import datetime
import numpy as np
#import mqtt_cloud_rec
#import tkintermqtt
prevframe = pd.read_csv('mqttresult.csv')
class TestApp(tk.Frame):
    """Basic test frame for the table"""
    def __init__(self, parent=None):
        self.parent = parent
        tk.Frame.__init__(self)
        self.main = self.master
        self.main.geometry('800x600+200+100')
        self.main.title('Mqtt Result Table')
        f = tk.Frame(self.main)
        f.pack(fill=tk.BOTH, expand=1)
        #df = TableModel.getSampleData(rows=5)
        self.table = pt = Table(f, dataframe=prevframe, showtoolbar=True)
        pt.show()
        self.startbutton = tk.Button(self.main, text='START', command=self.start)
        self.startbutton.pack(side=tk.TOP, fill=tk.X)
        self.stopbutton = tk.Button(self.main, text='STOP', command=self.stop)
        self.stopbutton.pack(side=tk.TOP, fill=tk.X)
        # self.table.showPlotViewer()
        return
    def update(self, data):
        table = self.table
        #plotter = table.pf
        #opts = plotter.mplopts
        #plotter.setOption('linewidth',3)
        #plotter.setOption('kind','line')
        #opts.widgets['linewidth'].set(3)
        #opts.widgets['kind'].set('line')
        date_today = str(datetime.date.today())
        time_today = time.strftime("%H:%M:%S")
        datalist = [date_today, time_today] + self.data
        datalist1 = np.array(datalist)
        datalist2 = pd.DataFrame(data=datalist1, columns=['Date','Time','power state','Motor state','Mode','Voltage','Current','Power Factor','KW','KWH','total Runtime'])
        #self.table = Table(dataframe=datalist2, showtoolbar=True)
        self.dataframe.loc[len(self.dataframe)] = datalist2
        table.model.df = self.dataframe
        table.redraw()
        #table.multiplecollist=range(0,10)
        #table.plotSelected()
        time.sleep(.1)
        if self.stop == True:
            return
        return
    def start(self):
        self.stop = False
        t = Thread(target=self.update)
        t.start()
    def stop(self):
        self.stop = True
        return

app = TestApp()
#launch the app
app.mainloop()
Convert this to a dictionary instead of a DataFrame and I think it will work:
datalist=[date_today,time_today]+self.data
datalist1=np.array(datalist)
datalist2=pd.DataFrame(data=datalist1 ,columns=['Date','Time','power state','Motor state','Mode','Voltage','Current','Power Factor','KW','KWH','total Runtime'])
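A minimal sketch of that suggestion, assuming data is the list arriving from MQTT and the column names match the CSV header; the row is built as a plain dict and assigned through a Series so that .loc appends it as a single row:
import datetime
import time
import pandas as pd

COLUMNS = ['Date', 'Time', 'power state', 'Motor state', 'Mode', 'Voltage',
           'Current', 'Power Factor', 'KW', 'KWH', 'total Runtime']

def append_row(table, data):
    # Build one row keyed by column name instead of wrapping it in a new DataFrame.
    values = [str(datetime.date.today()), time.strftime("%H:%M:%S")] + data
    row = dict(zip(COLUMNS, values))

    df = table.model.df
    df.loc[len(df)] = pd.Series(row)  # append the row in place
    table.redraw()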
