I'm trying to extract some legacy data from a Teradata server, but some of the records contain weird characters that don't register in Python, such as "U+ffffffc2".
Currently:
1. I'm using pyodbc to extract the data from Teradata.
2. I place the results into a numpy array (because when I put them directly into pandas, it interprets all of the columns as a single column of type string).
3. I turn the numpy array into a pandas dataframe to change things like Decimal("09809") and Date("2015,11,14") into [09809, "11,14,2015"].
4. I try to write it to a file, which is where this error occurs:
ValueError: character U+ffffffc2 is not in range [U+0000; U+10ffff]
I don't have access to edit this data, so from the client side, what can I do to skip or, preferably, remove the character before trying to write it to a file and hitting the error?
Currently, I have a try/except block to skip queries with erroneous data, but I have to fetch the data in chunks of at least 100 rows, so if I just skip a chunk, I lose 100 or more lines at a time. As I mentioned, I would prefer to keep the line and just remove the character.
Here's my code. (Feel free to point out any bad practices as well!)
#Python 3.4
#Python Teradata Extraction
#Created 01/28/16 by Maz Baig

#dependencies
import pyodbc
import numpy as np
import pandas as pd
import sys
import os
import psutil
from datetime import datetime

#create a global variable for the start time
start_time = datetime.now()
#create a global process variable to keep track of memory usage
process = psutil.Process(os.getpid())

def ResultIter(curs, arraysize):
    #Get the specified number of rows at a time
    while True:
        results = curs.fetchmany(arraysize)
        if not results:
            break
        yield results

def WriteResult(curs, file_path, full_count):
    rate = 100
    rows_extracted = 0
    for result in ResultIter(curs, rate):
        table_matrix = np.array(result)
        #Get the shape to make sure it's not a 1-D matrix
        rows, length = table_matrix.shape
        #if it is a 1-D matrix, add a row of nothing so pandas doesn't throw an error
        if rows < 2:
            dummyrow = np.zeros((1, length))
            dummyrow[:] = None
        df = pd.DataFrame(table_matrix)
        #give the user a status update
        rows_extracted = rows + rows_extracted
        StatusUpdate(rows_extracted, full_count)
        #open with the target encoding and write to the handle so chunks append
        with open(file_path, 'a', encoding='latin-1') as f:
            try:
                df.to_csv(f, sep='\u0001', header=False, index=False)
            except ValueError:
                print("This record was giving you issues")
                print(table_matrix)
    print('\n')
    if rows_extracted < full_count:
        print("Not all of the records were extracted")
        #print the run duration
        print("Duration: " + str(datetime.now() - start_time))
        sys.exit(3)

def StatusUpdate(rows_ex, full_count):
    print(" ::Rows Extracted:" + str(rows_ex) + " of " + str(full_count)
          + " | Memory Usage: " + str(process.memory_info().rss) + " bytes")
def main(args):
    #get username and password
    usr = args[1]
    pwd = args[2]
    #define the table
    view_name = args[3]
    table_name = args[4]
    run_date = args[5]
    #get the select statement as an input
    select_statement = args[6]
    if select_statement == '':
        select_statement = '*'
    #create the output filename from the table name and run date
    file_name = run_date + "_" + table_name + "_hist.dat"
    file_path = "/prod/data/cohl/rfnry/cohl_mort_loan_perfnc/temp/" + file_name
    if not os.path.exists(file_path):
        #create the connection
        print("Logging In")
        con_str = 'DRIVER={Teradata};DBCNAME=oneview;UID=' + usr + ';PWD=' + pwd + ';QUIETMODE=YES;'
        conn = pyodbc.connect(con_str)
        print("Logged In")
        #get the number of records in the table
        count_query = 'select count (*) from ' + view_name + '.' + table_name
        count_curs = conn.cursor()
        count_curs.execute(count_query)
        full_count = count_curs.fetchone()[0]
        #generate the query to retrieve all of the table data
        query = 'select ' + select_statement + ' from ' + view_name + '.' + table_name
        #create a cursor and execute the query
        curs = conn.cursor()
        curs.execute(query)
        #write the contents of the query into the file
        print("Writing Result Into File Now")
        WriteResult(curs, file_path, full_count)
        print("Table: " + table_name + " was successfully extracted")
        #print the script's run duration
        print("Duration: " + str(datetime.now() - start_time))
        sys.exit(0)
    else:
        print("AlreadyThere Exception\nThe file already exists at " + file_path + ". Please remove it before continuing\n")
        #print the script's run duration
        print("Duration: " + str(datetime.now() - start_time))
        sys.exit(2)

main(sys.argv)
Thanks,
Maz
If only 4-byte unicode points are giving the error, this may help.
One solution is to register a custom error handler using codecs.register_error, which filters out the error points, and then just decode:
import codecs

def error_handler(error):
    # drop the bad byte and skip the six stray hex characters after it
    return '', error.end + 6

codecs.register_error('nonunicode', error_handler)

b'abc\xffffffc2def'.decode(errors='nonunicode')
# gives you 'abcdef', which is exactly what you want
You may further improve your handler to catch more complicated errors; see https://docs.python.org/3/library/exceptions.html#UnicodeError and https://docs.python.org/3/library/codecs.html#codecs.register_error for details.
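For instance, a handler can simply drop whatever bytes the codec rejects and resume immediately after them. This is a minimal sketch (the name 'skipbad' is mine); it behaves much like the built-in 'ignore' handler, but gives you a place to log or substitute instead:

import codecs

def skip_bad(error):
    # error is a UnicodeDecodeError; return a replacement string and
    # the position at which decoding should resume
    return '', error.end

codecs.register_error('skipbad', skip_bad)

print(b'abc\xc2\xffdef'.decode('utf-8', errors='skipbad'))  # abcdef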
I want to check a YouTube video's views and keep track of them over time. I wrote a script that works great:
import requests
import re
import pandas as pd
from datetime import datetime
import time

def check_views(link):
    todays_date = datetime.now().strftime('%d-%m')
    now_time = datetime.now().strftime('%H:%M')
    #get the site
    r = requests.get(link)
    text = r.text
    tag = re.compile(r'\d+ views')
    views = re.findall(tag, text)[0]
    #get the digit number of views; findall returns a list, so take that item out
    cleaned_views = re.findall(r'\d+', views)[0]
    print(cleaned_views)
    #append to the df
    df.loc[len(df)] = [todays_date, now_time, int(cleaned_views)]
    #df = df.append([todays_date, now_time, int(cleaned_views)], axis=0)
    df.to_csv('views.csv')
    return df

df = pd.DataFrame(columns=['Date', 'Time', 'Views'])

while True:
    df = check_views('https://www.youtube.com/watch?v=gPHgRp70H8o&t=3s')
    time.sleep(1800)
But now I want to use this function for multiple links. I want a different CSV file for each link. So I made a dictionary:
link_dict = {'link1': 'https://www.youtube.com/watch?v=gPHgRp70H8o&t=3s',
             'link2': 'https://www.youtube.com/watch?v=ZPrAKuOBWzw'}
#this makes it easy for each csv file to be named for the corresponding link
The loop then becomes:
for key, value in link_dict.items():
    df = check_views(value)
That seems to work, passing the dict value (the link) into the function. Inside the function, I just made sure to load the correct CSV file at the beginning:
#Existing csv files
df=pd.read_csv(k+'.csv')
But then I get an error when I go to append a new row to the df ("cannot set a row with mismatched columns"). I don't get that, since it works just fine in the code written above. This is the part giving me the error:
df.loc[len(df)] = [todays_date, now_time, int(cleaned_views)]
What am I missing here? This dictionary method feels super messy (I only have two links I want to check, but rather than just duplicating the function I wanted to experiment more). Any tips? Thanks!
Figured it out! The problem was that I was saving the df as a CSV and then trying to read that CSV back later. When I saved it, I didn't use index=False with df.to_csv(), so there was an extra column! When I was just testing with the dictionary, I was reusing the in-memory df, and even though I was saving it to a CSV, the script kept using the df to do the actual adding of rows.
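In code, the fix looks something like this (a minimal sketch; append_views is a hypothetical helper, and it assumes the per-link CSV already exists with the right header):

import pandas as pd
from datetime import datetime

def append_views(key, views):
    # 'key' names the per-link CSV ('link1', 'link2', ...);
    # 'views' is the scraped count
    df = pd.read_csv(key + '.csv')  # columns: Date, Time, Views
    now = datetime.now()
    df.loc[len(df)] = [now.strftime('%d-%m'), now.strftime('%H:%M'), int(views)]
    #index=False prevents the stray index column on the next read
    df.to_csv(key + '.csv', index=False)
    return df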
Hi everyone, I am fairly new to using Python for data analysis, so apologies for silly questions:
IDE : PyCharm
What I have: A massive .xyz file (with 4 columns) that is a combination of several datasets. Each dataset can be identified by the third column, which runs from 10,000 to -10,000 (with 0 in between and 100 as the spacing) and then repeats, so every 201 rows is one dataset.
What I want to do: Split the massive file into its individual datasets (201 rows each) and save each file under a different name.
What I have done so far :
# Import packages
import os
import pandas as pd
import numpy as np  # For next steps
import math  # For next steps

# Check and change directory
path = 'C:/Clayton/lines/profiles_aufmod'
os.chdir(path)
print(os.getcwd())  # Correct path is printed

# split the xyz file into different files for each profile
main_xyz = 'bathy_SPO_1984_50x50_profile.xyz'
number_lines = sum(1 for row in open(main_xyz))
print(number_lines)  # 10854 is the output

rowsize = 201
for i in range(number_lines, rowsize):
    profile_raw_df = pd.read_csv(main_xyz, delimiter=',', header=None,
                                 nrows=rowsize, skiprows=i)
    out_xyz = 'Profile' + str(i) + '.xyz'
    profile_raw_df.to_csv(out_xyz, index=False, header=False, mode='a')
Problems I am facing:
The for loop at first produced output files (see the attached screenshot, "Proof of output"), but now it does not produce any output, and it is not rewriting the previous files either. The other mystery is that I am not getting an error either (see "Code executed without error").
What I tried to fix the issue:
I updated all the packages and restarted PyCharm.
I ran each line of code one by one, and everything works until the for loop.
While counting the number of rows in

number_lines = sum(1 for row in (open(main_xyz)))

you exhausted the iterator that loops over the lines of the file. You also never close the file, although that should not prevent pandas from reading the same file again.
A better idiom would be:

with open(main_xyz) as fh:
    number_lines = sum(1 for row in fh)
Your for loop as it stands does not do what you probably want. I guess you want:

for i in range(0, number_lines, rowsize):

so rowsize is the step size, not the end value of the for loop.
If you want to number the output files by dataset, keep a count of the dataset, like this:

data_set = 0
for i in range(0, number_lines, rowsize):
    data_set += 1
    ...
    out_xyz = f"Profile{data_set}.xyz"
    ...
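Putting the pieces together, the whole loop might look like this (a sketch using the filenames and the 201-row chunk size from the question):

import pandas as pd

main_xyz = 'bathy_SPO_1984_50x50_profile.xyz'
rowsize = 201

# count the lines without exhausting an iterator we need later
with open(main_xyz) as fh:
    number_lines = sum(1 for row in fh)

data_set = 0
for i in range(0, number_lines, rowsize):
    data_set += 1
    chunk = pd.read_csv(main_xyz, delimiter=',', header=None,
                        nrows=rowsize, skiprows=i)
    chunk.to_csv(f"Profile{data_set}.xyz", index=False, header=False)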
I'm trying to print values from a database and I'm getting this output:
[('CPDK0NHYX9JUSZUYASRVFNOMKH',), ('CPDK0KUEQULOAYXHSGUEZQGNFK',), ('CPDK0MOBWIG0T5Z76BUVXU5Y5N',), ('CPDK0FZE3LDHXEJRREMR0QZ0MH',)]
but would like to have this format:
'CPDK0NHYX9JUSZUYASRVFNOMKH'|'CPDK0KUEQULOAYXHSGUEZQGNFK'|'CPDK0MOBWIG0T5Z76BUVXU5Y5N'|'CPDK0FZE3LDHXEJRREMR0QZ0MH'
Python 3. Existing code:
from coinpayments import CoinPaymentsAPI
from datetime import datetime
from lib.connect import *
import argparse
import json
sql = 'SELECT txn_id FROM coinpayment_transactions WHERE status = 0 '
mycursor.execute(sql)
result = mycursor.fetchall()
mydb.close()
print(result)
What you are getting is a list of tuples, stored in the result object. If you want the output formatted the way you describe, do this:
#Paste this instead of print(result)
output = ''
for i in result:
    if output != '':
        output = output + '|' + "'" + i[0] + "'"
    else:
        output = output + "'" + i[0] + "'"
print(output)
A better way to do this kind of thing is with the string join() method and f-strings.
Here is your solution:
output = '|'.join([f"'{row[0]}'" for row in result])
print(output)
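For example, with a stand-in result list:

result = [('CPDK0NHYX9JUSZUYASRVFNOMKH',), ('CPDK0KUEQULOAYXHSGUEZQGNFK',)]
print('|'.join([f"'{row[0]}'" for row in result]))
# prints 'CPDK0NHYX9JUSZUYASRVFNOMKH'|'CPDK0KUEQULOAYXHSGUEZQGNFK'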
I have a piece of code. When I run it, it compiles but does not show any printed result. I want to print the values returned from this function. Can someone please guide me where I'm wrong?
def input_data(prefix):
    datafiles=os.listdir('/home/zeri/Desktop/check2')
    dictData={}
    for df in datafiles:
        if re.match(prefix,df) and
        os.path.isfile('/home/zeri/Desktop/check2'+'/'+df):
            hmax=locale.atof(df[3:])
            print hmax
            data=np.genfromtxt(df, delimiter=' ')
            dictData[hmax]=data
    return dictData,len(data[0])

int main():
    a=input_data('xyz')
    print a
Python is not C, so "int main()" does not work. Better to remove this line altogether, although you can define a function called "main".
But you probably mainly have an indentation issue. I tried to fix this in the code below.
import locale
import os
import re
import numpy as np

def input_data(prefix):
    datafiles = os.listdir('/home/zeri/Desktop/check2')
    dictData = {}
    for df in datafiles:
        if re.match(prefix, df) and os.path.isfile('/home/zeri/Desktop/check2' + '/' + df):
            hmax = locale.atof(df[3:])
            print hmax  # use "print(hmax)" if on Python 3
            data = np.genfromtxt(df, delimiter=' ')
            dictData[hmax] = data
    return dictData, len(data[0])

a = input_data('xyz')
print a  # use "print(a)" if on Python 3
By the way, I would not use regular expressions to filter files.
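For example, a prefix match without regular expressions might look like this (a sketch; matching_files is a made-up helper using the directory from the question):

import glob
import os

def matching_files(prefix, folder='/home/zeri/Desktop/check2'):
    # glob does the prefix matching; keep only regular files
    pattern = os.path.join(folder, prefix + '*')
    return [p for p in glob.glob(pattern) if os.path.isfile(p)]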
I have fairly large CSV files that I need to manipulate/amend line by line (each line may require different amending rules), then write out to another CSV with the proper formatting.
Currently, I have:
import csv
import multiprocessing

def read(buffer):
    pool = multiprocessing.Pool(4)
    with open("/path/to/file.csv", 'r') as f:
        while True:
            lines = pool.map(format_data, f.readlines(buffer))
            if not lines:
                break
            yield lines

def format_data(row):
    row = row.split(',')  # Because readlines() returns strings
    # Do formatting via list comprehension
    return row

def main():
    buf = 65535
    rows = read(buf)
    with open("/path/to/new.csv", 'w') as out:
        writer = csv.writer(out, lineterminator='\n')
        while rows:
            try:
                writer.writerows(next(rows))
            except StopIteration:
                break
Even though I'm using multiprocessing via map and preventing memory overload with a generator, it still takes well over 2 minutes to process 40,000 lines. It honestly shouldn't take that long. I've even generated a nested list from the generator outputs and tried writing the data as one large file in one go, instead of chunk by chunk, and it still takes as long. What am I doing wrong here?
I figured it out.
First, the issue was in my format_data() function: it was opening a database connection, and every time it ran it constructed that connection and closed it again, once per iteration.
I fixed it by replacing the database call with a basic dictionary mapping as a much faster lookup table, which also works fine across multiple workers.
So, my code looks like this:
import csv
import multiprocessing

def read(buffer):
    pool = multiprocessing.Pool(4)
    with open("/path/to/file.csv", 'r') as f:
        while True:
            lines = pool.map(format_data, f.readlines(buffer))
            if not lines:
                break
            yield lines

def format_data(row):
    row = row.split(',')  # Because readlines() returns strings
    # Do formatting via list comprehension AND a dictionary lookup
    # instead of a database connection
    return row

def main():
    rows = read(1024*1024)
    with open("/path/to/new.csv", 'w') as out:
        writer = csv.writer(out, lineterminator='\n')
        while rows:
            try:
                writer.writerows(next(rows))
            except StopIteration:
                break

if __name__ == '__main__':
    main()  # guard keeps multiprocessing happy on spawn-based platforms
I was able to parse a ~150 MB file in under 30 seconds. Hopefully there are some lessons here for others to learn from.
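The core of the fix, in sketch form (the LOOKUP contents and the field position are hypothetical; the point is one upfront query instead of one per row):

# populated once, e.g. from a single SELECT, before mapping begins
LOOKUP = {'old_code': 'new_code'}

def format_data(row):
    fields = row.rstrip('\n').split(',')
    # an O(1) dict lookup replaces a per-row database round trip
    fields[0] = LOOKUP.get(fields[0], fields[0])
    return fields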