How to create a simple csv on the fly in python? - python-3.x

I have an endpoint that takes a csv file.
Now, I want to write a test that makes a post request with this file.
I am trying to generate this csv file on the fly (rather than manually create and store it)
I tried this:
def csv_fixture(rows, type):
headers = None
if type == "merchant_upload":
headers = MerchantCSV.ordered_columns()
elif type == "invoice_upload":
headers = InvoiceCSV.ordered_columns()
assert headers is not None
rows = [headers] + rows
with open("file.csv", "w+") as f:
writer = csv.writer(f)
writer.writerows(rows)
yield f
my_file = csv_fixture(merchants, type="merchant_upload")
request = rf.post("/invoice_admin/upload_organisations/",
{"onboarding_file": my_file})
My endpoint does something like this:
if filename not in request.FILES:
raise Exception("Upload Failed: No file submitted.")
file = TextIOWrapper(
request.FILES[filename].file, encoding=request.encoding)
headers = peek_first_row(file)
missing = required_cols - set(headers)
if missing:
raise Exception(f"Columns missing in csv: {str(missing)})")
return csv.DictReader(file)
My endpoint works if I manually upload the file. However, if I try doing it programatically with the first snipper, I get an error:
def peek_first_row(file):
rows = csv.reader(file)
> headers = next(rows)
E StopIteration
app/invoice_admin/csv_parser.py:11: StopIteration
Please could someone guide me? I have looked at lots of tutorials, but I'm lost at this point.

This might help.
Ex:
def csv_fixture(rows, type):
headers = None
if type == "merchant_upload":
headers = MerchantCSV.ordered_columns()
elif type == "invoice_upload":
headers = InvoiceCSV.ordered_columns()
assert headers is not None
rows = [headers] + rows
with open("file.csv", "w+") as f:
writer = csv.writer(f)
writer.writerows(rows)
return open("file.csv", "rb")
my_file = csv_fixture(merchants, type="merchant_upload")
request = rf.post("/invoice_admin/upload_organisations/",
files={"onboarding_file": my_file})

Related

Method reads properly but the written text file only has 1 line. Is \n not working?

The goal is to extract specific data from a text file under a folder
then write that data into another file under different folder
The extraction part works, save to variables and can even print them
The rises when you try to write them to a file
The file is empty
Need to write in this format
{self.title};;;{self.author};;;{self.release_date};;;
{self.last_update_date};;;{self.language};;;{self.producer};;;{self.book_path}
# This class includes all the operations related to a book
class Operation:
"""
Need to include these class variables
book_title_list (List of all books titles such as “[title1, title2, title3, …]”)
book_info_dict = “{title1: obj1, title2:obj2, title3:obj3…….}”)
"""
book_folder_path = './data/books_data/'
book_info_path = './data/result_data/books.txt'
def extract_book_info(self):
directory_files = os.listdir(self.book_folder_path) # Stores the .txt files under books_data folder
try:
for i in directory_files:
with open(f'{self.book_folder_path}/{i}', 'r', encoding='utf8') as f:
f_contents = f.readlines()
f_line_free = list(map(lambda x: x.strip(), f_contents))
f_lists = f_line_free[10:22] # Slicing only the required elements of the list
"""
Extracting only the necessary part and storing them
under proper variables
"""
title = f_lists[0]
author = f_lists[2]
release_date = f_lists[4]
last_update_date = f_lists[5]
language = f_lists[7]
producer = f_lists[11]
"""
Extracting the desired values
"""
title_data = title[7:]
author_data = author[8:]
release_date_data = release_date[14:]
last_update_date_data = last_update_date[24:-1]
language_data = language[10:]
producer_data = producer[13:]
print(title_data)
with open(self.book_info_path, 'w', encoding="utf8") as wf:
wf.write(f'{title_data};;;{author_data};;;{release_date_data};;;'
f'{last_update_date_data};;;{language_data};;;{producer_data};;;{self.book_info_path}\n')
return True
except FileNotFoundError:
return False
except Exception:
return False

.csv to .arff function on Python

I'm trying to do a convertion function from csv to arff, right now I have this:
def csv2arff(csv_path, arff_path=None):
with open(csv_path, 'r') as fr:
attributes = []
if arff_path is None:
arff_path = csv_path[:-4] + '_prueba.arff' # *.arff -> *.csv
write_sw = False
with open(arff_path, 'w') as fw:
fw.write('#relation base_datos_modelo_3_limpia \n')
firstline = fr.readlines()[0].rstrip()
fw.write(firstline)
and that gives me:
#relation base_datos_modelo_3_limpia
DVJ_Valgus_KneeMedialDisplacement_D_discr,BMI,AgeGroup,ROM-PADF-KE_D,DVJ_Valgus_FPPA_D_discr,TrainFrequency,DVJ_Valgus_FPPA_ND_discr,Asym_SLCMJLanding-pVGRF(10percent)_discr,Asym-ROM-PHIR(≥8)_discr,Asym_TJ_Valgus_FPPA(10percent)_discr,TJ_Valgus_FPPA_ND_discr,Asym-ROM-PHF-KE(≥8)_discr,TJ_Valgus_FPPA_D_discr,Asym_SLCMJ-Height(10percent)_discr,Asym_YBTpl(10percent)_discr,Position,Asym-ROM-PADF-KE(≥8º)_discr,DVJ_Valgus_KneeMedialDisplacement_ND_discr,DVJ_Valgus_Knee-to-ankle-ratio_discr,Asym-ROM-PKF(≥8)_discr,Asym-ROM-PHABD(≥8)_discr,Asym-ROM-PHF-KF(≥8)_discr,Asym-ROM-PHER(≥8)_discr,AsymYBTanterior10percentdiscr,Asym-ROM-PHABD-HF(≥8)_discr,Asym-ROM-PHE(≥8)_discr,Asym(>4cm)-DVJ_Valgus_Knee;edialDisplacement_discr,Asym_SLCMJTakeOff-pVGRF(10percent)_discr,Asym-ROM-PHADD(≥8)_discr,Asym-YBTcomposite(10percent)_discr,Asym_SingleHop(10percent)_discr,Asym_YBTpm(10percent)_discr,Asym_DVJ_Valgus_FPPA(10percent)_discr,Asym_SLCMJ-pLFT(10percent)_discr,DominantLeg,Asym-ROM-PADF-KF(≥8)_discr,ROM-PHER_ND,CPRDmentalskills,POMStension,STAI-R,ROM-PHER_D,ROM-PHIR_D,ROM-PADF-KF_ND,ROM-PADF-KF_D,Age_at_PHV,ROM-PHIR_ND,CPRDtcohesion,Eperience,ROM-PHABD-HF_D,MaturityOffset,Weight,ROM-PHADD_ND,Height,ROM-PHADD_D,Age,POMSdepressio,ROM-PADF-KE_ND,POMSanger,YBTanterior_Dnorm,YBTanterior_NDnorm,POMSvigour,Soft-Tissue_injury_≥4days
So i want to put "#attribute" before each attribute and change the "," to "\n". But don't know how to do it, I tried to make a function to change the "," but didn't work, any idea?
Thank you guys.
Try the liac-arff library.
Here is an example for converting the UCI iris dataset from ARFF to CSV and then back to ARFF:
import csv
import arff
# arff -> csv
content = arff.load(open('./iris.arff', 'r'))
with open('./out.csv', 'w') as fp:
writer = csv.writer(fp)
header = []
for n, t in content['attributes']:
header.append(n)
writer.writerow(header)
writer.writerows(content['data'])
# csv -> arff
with open('./out.csv', 'r') as fp:
reader = csv.reader(fp)
header = None
data = []
for row in reader:
if header is None:
header = row
else:
data.append(row)
content = {}
content['relation'] = "from my csv file"
content['attributes'] = []
for n in header:
if n == "class":
content['attributes'].append((n, ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']))
else:
content['attributes'].append((n, 'NUMERIC'))
content['data'] = data
with open('./out.arff', 'w') as fp:
arff.dump(content, fp)
NB: For the last stage, we need to specify the nominal class values, which you could determine by scanning the data.

Trouble reading csvs saved in sharefile (citrix)

I wrote the following code to create dataframes from files saved in sharefile. It works perfectly for excel files, but fails for csv files with the error EmptyDataError: No columns to parse from file.
tblname = 'test'
fPth = r'Z:\Favorites\test10 (Group D - Custom EM&V)\8 PII\16 - Project Selection Plan\QC\Data\test.csv'
sht = 'Gross_Data'
shtStart = 0
fType = 'csv'
fitem = sfsession.get_io_version(fPth)
if fitem is None:
print(f'Could not create sharefile item for {fPth}')
else:
try:
if fType == 'csv':
df = pd.read_csv(fitem.io_data, header = shtStart)
elif fType == 'excel':
df = pd.read_excel(fitem.io_data, sheet_name = sht, header = shtStart)
else:
pass
print(f'Data import COMPLETE for {fPth}: {str(datetime.now())}')
except:
print(f'Data import FAILED for {fPth}')
logging.critical(f'Data import FAILED for {fPth}')
If I replace fitem.io_data with fPth in df = pd.read_csv, the code works, but I can't use that as a permanent solution. Any suggestions?
Also sfsession is a sharefile session and get_io_version(fPth) gets the token and downloads all the file properties include its data.
Thanks.
An adaptation of this solution worked for me:
StringIO and pandas read_csv
I added fitem.io_data.seek(0) before the df = ... line
Closing the question.

Read a CSV from Google Cloud Storage using Google Cloud Functions in Python script

I'm new in GCP and I'm trying to do a simple API with Cloud Functions. This API needs to read a CSV from Google Cloud Storage bucket and return a JSON. To do this, in my local I can run normally, open a file.
But in Cloud Functions, I received a blob from bucket, and don know how manipulate this, I'm receiving error
I try convert blob to Bytes and to string but i don't know exactly how do it
Code working in my local env:
data1 = '2019-08-20'
data1 = datetime.datetime.strptime(data1, '%Y-%m-%d')
data2 = '2019-11-21'
data2 = datetime.datetime.strptime(data2, '%Y-%m-%d')
with open("/home/thiago/mycsvexample.csv", "r") as fin:
#create a CSV dictionary reader object
print(type(fin))
csv_dreader = csv.DictReader(fin)
#iterate over all rows in CSV dict reader
for row in csv_dreader:
#check for invalid Date values
#convert date string to a date object
date = datetime.datetime.strptime(row['date'], '%Y-%m-%d')
#check if date falls within requested range
if date >= data1 and date <= data2:
total = total + float(row['total'])
print(total)
Code in Google Cloud Functions:
import csv, datetime
from google.cloud import storage
from io import BytesIO
def get_orders(request):
"""Responds to any HTTP request.
Args:
request (flask.Request): HTTP request object.
Returns:
The response text or any set of values that can be turned into a
Response object using
`make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
"""
request_json = request.get_json()
if request.args and 'token' in request.args:
if request.args['token'] == 'mytoken888888':
client = storage.Client()
bucket = client.get_bucket('mybucketgoogle.appspot.com')
blob = bucket.get_blob('mycsvfile.csv')
byte_stream = BytesIO()
blob.download_to_file(byte_stream)
byte_stream.seek(0)
file = byte_stream
#with open(BytesIO(blob), "r") as fin:
#create a CSV dictionary reader object
csv_dreader = csv.DictReader(file)
#iterate over all rows in CSV dict reader
for row in csv_dreader:
#check for invalid Date values
date = datetime.datetime.strptime(row['date'], '%Y-%m-%d')
#check if date falls within requested range
if date >= datetime.datetime.strptime(request.args['start_date']) and date <= datetime.datetime.strptime(request.args['end_date']):
total = total + float(row['total'])
dict = {'total_faturado' : total}
return dict
else:
return f'Passe parametros corretos'
else:
return f'Passe parametros corretos'
Error in Google Cloud Functions:
Traceback (most recent call last): File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 346, in run_http_function result = _function_handler.invoke_user_function(flask.request) File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function return call_user_function(request_or_event) File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 210, in call_user_function return self._user_function(request_or_event) File "/user_code/main.py", line 31, in get_orders_tramontina for row in csv_dreader: File "/opt/python3.7/lib/python3.7/csv.py", line 111, in __next__ self.fieldnames File "/opt/python3.7/lib/python3.7/csv.py", line 98, in fieldnames self._fieldnames = next(self.reader) _csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
I try do some other things but no sucess...
Someone can help me with this blob, to convert this or manipulate with the right way?
Thank you all
This is the code that worked for me:
from google.cloud import storage
import csv
client = storage.Client()
bucket = client.get_bucket('source')
blob = bucket.blob('file')
dest_file = '/tmp/file.csv'
blob.download_to_filename(dest_file)
dict = {}
total = 0
with open(dest_file) as fh:
# assuming your csv is del by comma
rd = csv.DictReader(fh, delimiter=',')
for row in rd:
date = datetime.datetime.strptime(row['date'], '%Y-%m-%d')
#check if date falls within requested range
if date >= datetime.datetime.strptime(request.args['start_date']) and date <= datetime.datetime.strptime(request.args['end_date']):
total = total + float(row['total'])
dict['total_faturado'] = total
I'm able to do this too using a library gcsfs
https://gcsfs.readthedocs.io/en/latest/
def get_orders_tramontina(request):
"""Responds to any HTTP request.
Args:
request (flask.Request): HTTP request object.
Returns:
The response text or any set of values that can be turned into a
Response object using
`make_response <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
"""
request_json = request.get_json()
if request.args and 'token' in request.args:
if request.args['token'] == 'mytoken':
fs = gcsfs.GCSFileSystem(project='myproject')
total = 0
with fs.open('mybucket.appspot.com/mycsv.csv', "r") as fin:
csv_dreader = csv.DictReader(fin)
#iterate over all rows in CSV dict reader
for row in csv_dreader:
#check for invalid Date values
date = datetime.datetime.strptime(row['date'], '%Y-%m-%d')
#check if date falls within requested range
if date >= datetime.datetime.strptime(request.args['start_date'], '%Y-%m-%d') and date <= datetime.datetime.strptime(request.args['end_date'], '%Y-%m-%d'):
total = total + float(row['total'])
dict = {'total_faturado' : total}
return json.dumps(dict)```
Try to download file as string, that way you can check for invalid data values, and eventually write that to a file.
change blob.download_to_file(byte_stream) to my_blob_str = blob.download_as_string()
I think your actual problem is byte_stream = BytesIO() since your output reads iterator should return strings, not bytes (did you open the file in text mode?)
It is expecting a string, but gets bytes. What is the purpose of byte_stream? If random, just remove it.

Python file write issue with Pandas

i wrote this python script to search for unseen mail in a mailbox, download xlsx attachment, make some modification on it and then post them to another service.
All is working perfect with just one issue:
In the original xlsx file there is a column named "zona" containing the italian two letter string for the province.
If this value is "NA" (the value of the province of NAPLES) when
saving the resultant xlsx files has blank cell instead of NA.
is NA a reserved word and if yes, there is a way to quote it?
import os,email,imaplib,socket,requests
import pandas as pd
mail_user = os.environ.get('MAIL_USER')
mail_password = os.environ.get('MAIL_PASS')
mail_server = os.environ.get('MAIL_SERVER')
detach_dir = '.'
url=<removed url>
if mail_user is None or mail_password is None or mail_server is None:
print ('VARIABILI DI AMBIENTE NON DEFINITE')
exit(1)
try:
with imaplib.IMAP4_SSL(mail_server) as m:
try:
m.login(mail_user,mail_password)
m.select("INBOX")
resp, items = m.search(None, "UNSEEN")
items = items[0].split()
for emailid in items:
resp, data = m.fetch(emailid, "(RFC822)")
email_body = data[0][1] # getting the mail content
mail = email.message_from_bytes(email_body) # parsing the mail content to get a mail object
if mail.get_content_maintype() != 'multipart':
continue
for part in mail.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
if filename.endswith('.xlsx'):
att_path = os.path.join(detach_dir, filename)
fp = open(att_path, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
xl = pd.ExcelFile(att_path)
df1 = xl.parse(sheet_name=0)
df1 = df1.replace({'\'':''}, regex=True)
df1.loc[df1['Prodotto'] == 'SP_TABLETA_SAMSUNG','Cod. ID.'] = 'X'
df1.loc[df1['Prodotto'] == 'AP_TLC','Cod. ID.'] = 'X'
df1.loc[df1['Prodotto'] == 'APDCMB00003','Cod. ID.'] = 'X'
df1.loc[df1['Prodotto'] == 'APDCMB03252','Cod. ID.'] = 'X'
writer = pd.ExcelWriter(att_path, engine='xlsxwriter')
df1.to_excel(writer, sheet_name='Foglio1', index=False)
writer.save()
uf = {'files': open(att_path, 'rb')}
http.client.HTTPConnection.debuglevel = 0
r = requests.post(url, files=uf)
print (r.text)
except imaplib.IMAP4_SSL.error as e:
print (e)
exit(1)
except imaplib.IMAP4.error:
print ("Errore di connessione al server")
exit(1)
It seems that Pandas is treating the NA value as a NaN and therefore, when you write to excel it writes this value as '' by default (see docs).
You can pass na_rep='NA' to the to_excel() function in order to write it out as a string;
df1.to_excel(writer, sheet_name='Foglio1', index=False, na_rep='NA')
But as a precaution keep an eye out as any other NaN values present in your df will also be written to the excel file as 'NA'.
Reading the docs link post by #Matt B. i found this solution:
df1 = xl.parse(sheet_name=0, keep_default_na=False, na_values=['_'])
If i understand well only _ are interpreted as "not avalaible"

Resources