Python Unicode warning when using ' - python-3.x

I am trying to run some code that will get data from a website(ques) and compare it to a CSV(ans) file to get a result.
def ans(ques):
ans = False
csvfile = open('database.csv', 'r')
reader = csv.reader(csvfile, delimiter=',')
for row in reader:
if ques == row[0]:
ans = row[1]
break
if ques == row[1]:
ans = row[0]
break
return(ans)
CSV file:
QUESTION,ANSWER
1+1=?,It's 2
When i run this code i get this:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to
Unicode - interpreting them as being unequal
However when i remove the ' it works as expected. Any help?

Related

Print the last occurrence of a word in a TXT in Python

I'm working on extracting data from a .txt file, and I want to pick up the last occurrence of a certain word in the whole file. In this case, I get three occurrences of the words D / DÑA and Tfno but I want only the last one of each one and print it.
def extract(in_filename):
if not os.path.exists(in_filename):
print("ERROR: Input file does not exist ❌ ")
sys.exit()
with open(in_filename, 'r') as file:
rows = file.readlines()
for row in rows:
if re.match('D/DÑA', row):
row_parse = row.strip()
print(row_parse)
elif re.match('Tfno', row):
row_parse = row.strip()
print(row_parse)
extract('apd29.txt')
The output is:
D/DÑA: PRUEBA PRUEBITA
Tfno: 666666666
D/DÑA: PRUEBA PRUEBITA
Tfno: 666666666
D/DÑA: PRUEBA PRUEBITA <-- I need this
Tfno: 666666666 <-- I need this
I expect the output:
D/DÑA: PRUEBA PRUEBITA
Tfno: 666666666
Use reversed
Ex:
def extract(in_filename):
if not os.path.exists(in_filename):
print("ERROR: Input file does not exist ❌ ")
sys.exit()
with open(in_filename, 'r') as file:
rows = file.readlines()
for row in reversed(rows): #!Update
if re.match(r'D/DÑA', row):
row_parse = row.strip()
print(row_parse)
break
extract('apd29.txt')
Assigning a variable outside of for loop would work in this situation.
result = "not found"
with open(in_filename, 'r') as file:
rows = file.readlines()
for row in rows:
if re.match('D/DÑA', row):
row_strip = row.strip()
row_parse = row_strip.rfind('D/DÑA')
result = row_parse
print(result)

Iterate through CSV up to a specific row

I have a current working script that iterates through rows in a CSV file currently, but I want to configure it to only read between a range, such as index 0 - 100, but am stuck on how to accomplish this properly.
Here is my current code:
for (name, subscriber) in zip(usernames, subscriber_ids):
#do stuff with name / subscriber in selenium driver
if len(elements) == 0:
# logs information
else:
#logs information
with open('file', encoding='utf-8-sig') as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
usernames = []
subscriber_ids = []
for row in readCSV:
username = row[0]
subscriberid = row[1]
usernames.append(username)
subscriber_ids.append(subscriberid)
What I want to do is just read from row 0 - 100 or start at index 100-200 etc. Any assistance would be appreciated.
try something like this
with open("datafile") as myfile:
head = [next(myfile) for x in range(N)]
print(head)

Python file write issue with Pandas

i wrote this python script to search for unseen mail in a mailbox, download xlsx attachment, make some modification on it and then post them to another service.
All is working perfect with just one issue:
In the original xlsx file there is a column named "zona" containing the italian two letter string for the province.
If this value is "NA" (the value of the province of NAPLES) when
saving the resultant xlsx files has blank cell instead of NA.
is NA a reserved word and if yes, there is a way to quote it?
import os,email,imaplib,socket,requests
import pandas as pd
mail_user = os.environ.get('MAIL_USER')
mail_password = os.environ.get('MAIL_PASS')
mail_server = os.environ.get('MAIL_SERVER')
detach_dir = '.'
url=<removed url>
if mail_user is None or mail_password is None or mail_server is None:
print ('VARIABILI DI AMBIENTE NON DEFINITE')
exit(1)
try:
with imaplib.IMAP4_SSL(mail_server) as m:
try:
m.login(mail_user,mail_password)
m.select("INBOX")
resp, items = m.search(None, "UNSEEN")
items = items[0].split()
for emailid in items:
resp, data = m.fetch(emailid, "(RFC822)")
email_body = data[0][1] # getting the mail content
mail = email.message_from_bytes(email_body) # parsing the mail content to get a mail object
if mail.get_content_maintype() != 'multipart':
continue
for part in mail.walk():
if part.get_content_maintype() == 'multipart':
continue
if part.get('Content-Disposition') is None:
continue
filename = part.get_filename()
if filename.endswith('.xlsx'):
att_path = os.path.join(detach_dir, filename)
fp = open(att_path, 'wb')
fp.write(part.get_payload(decode=True))
fp.close()
xl = pd.ExcelFile(att_path)
df1 = xl.parse(sheet_name=0)
df1 = df1.replace({'\'':''}, regex=True)
df1.loc[df1['Prodotto'] == 'SP_TABLETA_SAMSUNG','Cod. ID.'] = 'X'
df1.loc[df1['Prodotto'] == 'AP_TLC','Cod. ID.'] = 'X'
df1.loc[df1['Prodotto'] == 'APDCMB00003','Cod. ID.'] = 'X'
df1.loc[df1['Prodotto'] == 'APDCMB03252','Cod. ID.'] = 'X'
writer = pd.ExcelWriter(att_path, engine='xlsxwriter')
df1.to_excel(writer, sheet_name='Foglio1', index=False)
writer.save()
uf = {'files': open(att_path, 'rb')}
http.client.HTTPConnection.debuglevel = 0
r = requests.post(url, files=uf)
print (r.text)
except imaplib.IMAP4_SSL.error as e:
print (e)
exit(1)
except imaplib.IMAP4.error:
print ("Errore di connessione al server")
exit(1)
It seems that Pandas is treating the NA value as a NaN and therefore, when you write to excel it writes this value as '' by default (see docs).
You can pass na_rep='NA' to the to_excel() function in order to write it out as a string;
df1.to_excel(writer, sheet_name='Foglio1', index=False, na_rep='NA')
But as a precaution keep an eye out as any other NaN values present in your df will also be written to the excel file as 'NA'.
Reading the docs link post by #Matt B. i found this solution:
df1 = xl.parse(sheet_name=0, keep_default_na=False, na_values=['_'])
If i understand well only _ are interpreted as "not avalaible"

Using Python to delete rows in a csv file that contain certain chars

I have a csv file that I'm trying to clean up. I am trying to look at the first column and delete any rows that have anything other than chars for that row in the first column (I'm working on cleaning up rows where the first column has a ^ or . for now). It seems all my attempts either do nothing or nuke the whole csv file.
Interestingly enough, I have code that can identify the problem rows and it seems to work fine
def FindProblemRows():
with open('Data.csv') as csvDataFile:
ProblemRows = []
csvReader = csv.reader(csvDataFile)
data = [row for row in csv.reader(csvDataFile)]
length = len(data)
for i in range (0,length):
if data[i][0].find('^')!=-1 or data[i][0].find('.')!=-1:
ProblemRows.append(i)
return (ProblemRows)
Below I have my latest three failed attempts. Where am I going wrong and what should I change? Which of these comes closest?
'''
def Clean():
with open("Data.csv", "w", newline='') as f:
data = list(csv.reader(f))
writer = csv.writer(f)
Problems = FindProblemRows()
data = list(csv.reader(f))
length = len(data)
for row in data:
for i in Problems:
for j in range (0, length):
if row[j] == i:
writer.writerow(row)
Problems.remove(i)
def Clean():
Problems = FindProblemRows()
with open('Data.csv') as csvDataFile:
csvReader = csv.reader(csvDataFile)
data = [row for row in csv.reader(csvDataFile)]
length = len(data)
width = len(data[0])
with open("Data.csv","r") as csvFile:
csvReader = csv.reader( csvFile )
with open("CleansedData.csv","w") as csvResult:
csvWrite = csv.writer( csvResult )
for i in Problems:
for j in range (0, length):
if data[j] == i:
del data[j]
for j in range (0, length):
csvWrite.writerow(data[j])
'''
def Clean():
with open("Data.csv", 'r') as infile , open("CleansedData.csv", 'w') as outfile:
data = [row for row in infile]
for row in infile:
for column in row:
if "^" not in data[row][0]:
if "." not in data[row][0]:
outfile.write(data[row])
Update
Now I have:
def Clean():
df = pd.read_csv('Data.csv')
df = df['^' not in df.Symbol]
df = df['.' not in df.Symbol]
but I get KeyError: True
Shouldn't that work?
You should check whether the column Symbol contains any of the characters of interest. Method contains takes a regular expression:
bad_rows = df.Symbol.str.contains('[.^]')
df_clean = df[~bad_rows]

Saving list to a .csv file

I have made a code that opens a .csv file and takes a user input to filter it to a new list. What I am having trouble with is saving this new list to a .csv file properly.
This is my code:
#author: Joakim
from pprint import pprint
import csv
with open ('Change_in_Unemployment_2008-2014.csv') as csvfile:
readCSV = csv.reader(csvfile, delimiter=',')
next(readCSV) #Removing header
result = []
found = False
user_input = input("Please enter a full/partial NUTS code to filter by: ")
for row in readCSV:
if row[0].startswith(user_input):
result.append(row)
found = True
if found == False:
print("There are no registered NUTS codes containing your input.. Please try again")
if found == True:
print("\n Successfully found ", len(result), "entries!""\n")
pprint (result)
#store data in a new csv file
Stored_path = "C:\data_STORED.csv"
file = open(Stored_path, 'w')
writer = csv.writer(file)
writer.writerow(["NUTS CODE", " Municipality", " value"])
for i in range(len(result)):
new_row = result[i]
NUTS_CODE = new_row[0]
Municipality = new_row[1]
Value = new_row[2]
writer.writerow([NUTS_CODE, Municipality])
csvfile.close()
If one runs my code with an input of : PL, one gets this list:
[['PL11', 'odzkie', '2.2'],
['PL12', 'Mazowieckie', '1.2'],
['PL21', 'Maopolskie', '2.9'],
['PL22', 'Slaskie', '2'],
['PL31', 'Lubelskie', '1.1'],
['PL32', 'Podkarpackie', '5.8'],
['PL33', 'Swietokrzyskie', '2.6'],
['PL34', 'Podlaskie', '2.7'],
['PL41', 'Wielkopolskie', '1.6'],
['PL42', 'Zachodniopomorskie', '-1.1'],
['PL43', 'Lubuskie', '1.8'],
['PL51', 'Dolnoslaskie', '0'],
['PL52', 'Opolskie', '1.3'],
['PL61', 'Kujawsko-Pomorskie', '1.6'],
['PL62', 'Warminsko-Mazurskie', '2.4'],
['PL63', 'Pomorskie', '3.1']]'
Now I would like to store this neatly into a new .csv file, but when I use the code above, I only get a couple of values repeated throughout.
What is my error?

Resources