Reading from csv file and printing as organized table - python-3.x

Picture stating what needs to be done
Picture showing what format it needs to be done
I've got an assignment due regarding reading from a csv file and printing it as a organized table.
I have so far achieved the following code which prints the code in a string fomat which will also be shown below.
import csv
table = ''
with open('geek-music.csv', 'r') as csvFile:
reader = csv.DictReader(csvFile, delimiter=',')
table = '<tr>{}</tr>'.format(''.join(['<td>{}</td>'.format(header) for header in reader.fieldnames]))
for row in reader:
table_row = '<tr>'
for fn in reader.fieldnames:
table_row += '<td>{}<\td>'.format(row[fn])
table_row += '<\tr>'
table += table_row
When using this code I receive the following:
[['Track', 'Artist', 'Album', 'Time'], ['Computer Love', 'Kraftwerk', 'Computer World', '7:15'], ['Paranoid Android', 'Radiohead', 'OK Computer', '6:27'], ['Computer Age', 'Neil Young', 'Trans', '5:24'], ['Digital', 'Joy Division', 'Still', '2:50'], ['Silver Machine', 'Hawkwind', 'Roadhawks', '4:39'], ['Start the Simulator', 'A-Ha', 'Foot of the Mountain', '5:11'], ['Internet Connection', 'M.I.A.', 'MAYA', '2:56'], ['Deep Blue', 'Arcade Fire', 'The Suburbs', '4:29'], ['I Will Derive!', 'MindofMatthew', 'You Tube', '3:17'], ['Lobachevsky', 'Tom Lehrer', 'You Tube', '3:04']]
After further editing of my code I have resulted with the following:
import csv
f = open("geek-music.csv",'r')
for aline in f:
values = aline.split(',')
# print(values[0])
Track = [values[0]]
Artist = [values[1]]
Album = [values[2]]
Time = [values[3]]
data = list(zip(Track, Artist, Album, Time))
for i, d in enumerate(data):
line = '|'.join(str(x).ljust(12) for x in d)
print(line)
if i == 0:
print('-' * len(line))
f.close()
This however results in the following:
Track |Artist |Album |Time
---------------------------------------------------
Computer Love|Kraftwerk |Computer World|7:15
------------------------------------------------------
Paranoid Android|Radiohead |OK Computer |6:27
-------------------------------------------------------
Computer Age|Neil Young |Trans |5:24
---------------------------------------------------
Digital |Joy Division|Still |2:50
---------------------------------------------------
Silver Machine|Hawkwind |Roadhawks |4:39
-----------------------------------------------------
Start the Simulator|A-Ha |Foot of the Mountain|5:11
------------------------------------------------------------------
Internet Connection|M.I.A. |MAYA |2:56
----------------------------------------------------------
Deep Blue |Arcade Fire |The Suburbs |4:29
---------------------------------------------------
I Will Derive!|MindofMatthew|You Tube |3:17
------------------------------------------------------
Lobachevsky |Tom Lehrer |You Tube |3:04
---------------------------------------------------
This is very close to what I am looking for except that it is still quite far from what is shown in the image above.

Related

Regex Error and Improvement Driving Licence Data Extraction

I am trying to extract the Name, License No., Date Of Issue and Validity from an Image I processed using Pytesseract. I am quite a lot confused with regex but still went through few documentations and codes over the web.
I got till here:
import pytesseract
import cv2
import re
import cv2
from PIL import Image
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
def driver_license(filename):
"""
This function will handle the core OCR processing of images.
"""
i = cv2.imread(filename)
newdata=pytesseract.image_to_osd(i)
angle = re.search('(?<=Rotate: )\d+', newdata).group(0)
angle = int(angle)
i = Image.open(filename)
if angle != 0:
#with Image.open("ro2.jpg") as i:
rot_angle = 360 - angle
i = i.rotate(rot_angle, expand="True")
i.save(filename)
i = cv2.imread(filename)
# Convert to gray
i = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)
# Apply dilation and erosion to remove some noise
kernel = np.ones((1, 1), np.uint8)
i = cv2.dilate(i, kernel, iterations=1)
i = cv2.erode(i, kernel, iterations=1)
txt = pytesseract.image_to_string(i)
print(txt)
text = []
data = {
'firstName': None,
'lastName': None,
'age': None,
'documentNumber': None
}
c = 0
print(txt)
#Splitting lines
lines = txt.split('\n')
for lin in lines:
c = c + 1
s = lin.strip()
s = s.replace('\n','')
if s:
s = s.rstrip()
s = s.lstrip()
text.append(s)
try:
if re.match(r".*Name|.*name|.*NAME", s):
name = re.sub('[^a-zA-Z]+', ' ', s)
name = name.replace('Name', '')
name = name.replace('name', '')
name = name.replace('NAME', '')
name = name.replace(':', '')
name = name.rstrip()
name = name.lstrip()
nmlt = name.split(" ")
data['firstName'] = " ".join(nmlt[:len(nmlt)-1])
data['lastName'] = nmlt[-1]
if re.search(r"[a-zA-Z][a-zA-Z]-\d{13}", s):
data['documentNumber'] = re.search(r'[a-zA-Z][a-zA-Z]-\d{13}', s)
data['documentNumber'] = data['documentNumber'].group().replace('-', '')
if not data['firstName']:
name = lines[c]
name = re.sub('[^a-zA-Z]+', ' ', name)
name = name.rstrip()
name = name.lstrip()
nmlt = name.split(" ")
data['firstName'] = " ".join(nmlt[:len(nmlt)-1])
data['lastName'] = nmlt[-1]
if re.search(r"[a-zA-Z][a-zA-Z]\d{2} \d{11}", s):
data['documentNumber'] = re.search(r'[a-zA-Z][a-zA-Z]\d{2} \d{11}', s)
data['documentNumber'] = data['documentNumber'].group().replace(' ', '')
if not data['firstName']:
name = lines[c]
name = re.sub('[^a-zA-Z]+', ' ', name)
name = name.rstrip()
name = name.lstrip()
nmlt = name.split(" ")
data['firstName'] = " ".join(nmlt[:len(nmlt)-1])
data['lastName'] = nmlt[-1]
if re.match(r".*DOB|.*dob|.*Dob", s):
yob = re.sub('[^0-9]+', ' ', s)
yob = re.search(r'\d\d\d\d', yob)
data['age'] = datetime.datetime.now().year - int(yob.group())
except:
pass
print(data)
I need to extract the Validity and Issue Date as well. But not getting anywhere near it. Also, I have seen using regex shortens the code like a lot so is there any better optimal way for it?
My input data is a string somewhat like this:
Transport Department Government of NCT of Delhi
Licence to Drive Vehicles Throughout India
Licence No. : DL-0820100052000 (P) R
N : PARMINDER PAL SINGH GILL
: SHRI DARSHAN SINGH GILL
DOB: 10/05/1966 BG: U
Address :
104 SHARDA APPTT WEST ENCLAVE
PITAMPURA DELHI 110034
Auth to Drive Date of Issue
M.CYL. 24/02/2010
LMV-NT 24/02/2010
(Holder's Sig natu re)
Issue Date : 20/05/2016
Validity(NT) : 19/05/2021 : c
Validity(T) : NA Issuing Authority
InvCarrNo : NA NWZ-I, WAZIRPUR
Or like this:
in
Transport Department Government of NCT of Delhi
Licence to Drive Vehicles Throughout India
2
Licence No. : DL-0320170595326 () WN
Name : AZAZ AHAMADSIDDIQUIE
s/w/D : SALAHUDDIN ALI
____... DOB: 26/12/1992 BG: O+
\ \ Address:
—.~J ~—; ROO NO-25 AMK BOYS HOSTEL, J.
— NAGAR, DELHI 110025
Auth to Drive Date of Issue
M.CYL. 12/12/2017
4 wt 4
Iseue Date: 12/12/2017 a
falidity(NT) < 2037
Validity(T) +: NA /
Inv CarrNo : NA te sntian sana
Note: In the second example you wouldn't get the validity, will optimise the OCR for later. Any proper guide which can help me with regex which is a bit simpler would be good.
You can use this pattern: (?<=KEY\s*:\s*)\b[^\n]+ and replace KEY with one of the issues of the date, License No. and others.
Also for this pattern, you need to use regex library.
Code:
import regex
text1 = """
Transport Department Government of NCT of Delhi
Licence to Drive Vehicles Throughout India
Licence No. : DL-0820100052000 (P) R
N : PARMINDER PAL SINGH GILL
: SHRI DARSHAN SINGH GILL
DOB: 10/05/1966 BG: U
Address :
104 SHARDA APPTT WEST ENCLAVE
PITAMPURA DELHI 110034
Auth to Drive Date of Issue
M.CYL. 24/02/2010
LMV-NT 24/02/2010
(Holder's Sig natu re)
Issue Date : 20/05/2016
Validity(NT) : 19/05/2021 : c
Validity(T) : NA Issuing Authority
InvCarrNo : NA NWZ-I, WAZIRPUR
"""
for key in ('Issue Date', 'Licence No\.', 'N', 'Validity\(NT\)'):
print(regex.findall(fr"(?<={key}\s*:\s*)\b[^\n]+", text1, regex.IGNORECASE))
Output:
['20/05/2016']
['DL-0820100052000 (P) R']
['PARMINDER PAL SINGH GILL']
['19/05/2021 : c']
You can also use re with a single regex based on alternation that will capture your keys and values:
import re
text = "Transport Department Government of NCT of Delhi\nLicence to Drive Vehicles Throughout India\n\nLicence No. : DL-0820100052000 (P) R\nN : PARMINDER PAL SINGH GILL\n\n: SHRI DARSHAN SINGH GILL\n\nDOB: 10/05/1966 BG: U\nAddress :\n\n104 SHARDA APPTT WEST ENCLAVE\nPITAMPURA DELHI 110034\n\n\n\nAuth to Drive Date of Issue\nM.CYL. 24/02/2010\nLMV-NT 24/02/2010\n\n(Holder's Sig natu re)\n\nIssue Date : 20/05/2016\nValidity(NT) : 19/05/2021 : c\nValidity(T) : NA Issuing Authority\nInvCarrNo : NA NWZ-I, WAZIRPUR"
search_phrases = ['Issue Date', 'Licence No.', 'N', 'Validity(NT)']
reg = r"\b({})\s*:\W*(.+)".format( "|".join(sorted(map(re.escape, search_phrases), key=len, reverse=True)) )
print(re.findall(reg, text, re.IGNORECASE))
Output of this short online Python demo:
[('Licence No.', 'DL-0820100052000 (P) R'), ('N', 'PARMINDER PAL SINGH GILL'), ('Issue Date', '20/05/2016'), ('Validity(NT)', '19/05/2021 : c')]
The regex is
\b(Validity\(NT\)|Licence\ No\.|Issue\ Date|N)\s*:\W*(.+)
See its online demo.
Details:
map(re.escape, search_phrases) - escapes all special chars in your search phrases to be used as literal texts in a regex (else, . will match any chars, ? won't match a ? char, etc.)
sorted(..., key=len, reverse=True) - sorts the search phrases by length in descending order (to get longer matches first)
"|".join(...) - creates an alternation pattern, a|b|c|...
r"\b({})\s*:\W*(.+)".format( ... ) - creates the final regex.
Regex details
\b - a word boundary (NOTE: replace with (?m)^ if your matches occur at the beginning of a line)
(Validity\(NT\)|Licence\ No\.|Issue\ Date|N) - Group 1: one of the search phrases
\s* - zero or more whitespaces
: - a colon
\W* - zero or more non-word chars
(.+) - (capturing) Group 2: one or more chars other than line break chars, as many as possible.

Is there a way to Keep track of all the bad records that are allowed while loading a ndjson file into Bigquery

I have a requirement where I need to keep track of all the bad records that were not feeded into bigquery after allowing max_bad_records. So I need them written in a File on storage for Future reference. I'm using BQ API for Python, Is there a way we can achieve this? I think if we are allowing max_bad_records we dont have the details of failed loads in BQ Load Job.
Thanks
Currently, there isn't a direct way of accessing and saving the bad records. However, you can access some job statistics including the reason why the record was skipped within BigQuery _job_statistics().
I have created an example, in order to demonstrate how the statistics will be shown. I have the following sample .csv file in a GCS bucket:
name,age
robert,25
felix,23
john,john
As you can see, the last row is a bad record, because I will import age as INT64 and there is a string in that row. In addition, I used the following code to upload it to BigQuery:
from google.cloud import bigquery
client = bigquery.Client()
table_ref = client.dataset('dataset').table('table_name')
job_config = bigquery.LoadJobConfig(
schema=[
bigquery.SchemaField("name", "STRING"),
bigquery.SchemaField("age", "INT64"),
]
)
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
job_config.skip_leading_rows = 1
job_config.max_bad_records = 5
#job_config.autodetect = True
# The source format defaults to CSV, so the line below is optional.
job_config.source_format = bigquery.SourceFormat.CSV
uri = "gs://path/file.csv"
load_job = client.load_table_from_uri(
uri, table_ref, job_config=job_config
) # API request
print("Starting job {}".format(load_job.job_id))
load_job.result() # Waits for table load to complete.
print("Job finished.")
destination_table = client.get_table(table_ref)
print("Loaded {} rows.".format(destination_table.num_rows))
#Below all the statistics that might be useful in your case
job_state = load_job.state
job_id = load_job.job_id
error_result = load_job.error_result
job_statistics = load_job._job_statistics()
badRecords = job_statistics['badRecords']
outputRows = job_statistics['outputRows']
inputFiles = job_statistics['inputFiles']
inputFileBytes = job_statistics['inputFileBytes']
outputBytes = job_statistics['outputBytes']
print("***************************** ")
print(" job_state: " + str(job_state))
print(" non fatal error: " + str(load_job.errors))
print(" error_result: " + str(error_result))
print(" job_id: " + str(job_id))
print(" badRecords: " + str(badRecords))
print(" outputRows: " + str(outputRows))
print(" inputFiles: " + str(inputFiles))
print(" inputFileBytes: " + str(inputFileBytes))
print(" outputBytes: " + str(outputBytes))
print(" ***************************** ")
print("------ load_job.errors ")
The output from the statistics :
*****************************
job_state: DONE
non fatal errors: [{u'reason': u'invalid', u'message': u"Error while reading data, error message: Could not parse 'john' as INT64 for field age (position 1) starting at location 23", u'location': u'gs://path/file.csv'}]
error_result: None
job_id: b2b63e39-a5fb-47df-b12b-41a835f5cf5a
badRecords: 1
outputRows: 2
inputFiles: 1
inputFileBytes: 33
outputBytes: 26
*****************************
As it is shown above, the erros field returns the non fatal errors, which includes the bad records. In other words, it retrieves individual errors generated by the job. Whereas, the error_result returns the error information as the job as a whole.
I believe these statistics might help you analyse your bad records. Lastly, you can output them into a file, using write(), such as:
with open("errors.txt", "x") as f:
f.write(load_job.errors)
f.close()

How to handle blank line,junk line and \n while converting an input file to csv file

Below is the sample data in input file. I need to process this file and turn it into a csv file. With some help, I was able to convert it to csv file. However not fully converted to csv since I am not able to handle \n, junk line(2nd line) and blank line(4th line). Also, i need help to filter transaction_type i.e., avoid "rewrite" transaction_type
{"transaction_type": "new", "policynum": 4994949}
44uu094u4
{"transaction_type": "renewal", "policynum": 3848848,"reason": "Impressed with \n the Service"}
{"transaction_type": "cancel", "policynum": 49494949, "cancel_table":[{"cancel_cd": "AU"}, {"cancel_cd": "AA"}]}
{"transaction_type": "rewrite", "policynum": 5634549}
Below is the code
import ast
import csv
with open('test_policy', 'r') as in_f, open('test_policy.csv', 'w') as out_f:
data = in_f.readlines()
writer = csv.DictWriter(
out_f,
fieldnames=[
'transaction_type', 'policynum', 'cancel_cd','reason'],lineterminator='\n',
extrasaction='ignore')
writer.writeheader()
for row in data:
dict_row = ast.literal_eval(row)
if 'cancel_table' in dict_row:
cancel_table = dict_row['cancel_table']
cancel_cd= []
for cancel_row in cancel_table:
cancel_cd.append(cancel_row['cancel_cd'])
dict_row['cancel_cd'] = ','.join(cancel_cd)
writer.writerow(dict_row)
Below is my output not considering the junk line,blank line and transaction type "rewrite".
transaction_type,policynum,cancel_cd,reason
new,4994949,,
renewal,3848848,,"Impressed with
the Service"
cancel,49494949,"AU,AA",
Expected output
transaction_type,policynum,cancel_cd,reason
new,4994949,,
renewal,3848848,,"Impressed with the Service"
cancel,49494949,"AU,AA",
Hmm I try to fix them but I do not know how CSV file work, but my small knoll age will suggest you to run this code before to convert the file.
txt = {"transaction_type": "renewal",
"policynum": 3848848,
"reason": "Impressed with \n the Service"}
newTxt = {}
for i,j in txt.items():
# local var (temporar)
lastX = ""
correctJ = ""
# check if in J is ascii white space "\n" and get it out
if "\n" in f"b'{j}'":
j = j.replace("\n", "")
# for grammar purpose check if
# J have at least one space
if " " in str(j):
# if yes check it closer (one by one)
for x in ([j[y:y+1] for y in range(0, len(j), 1)]):
# if 2 spaces are consecutive pass the last one
if x == " " and lastX == " ":
pass
# if not update correctJ with new values
else:
correctJ += x
# remember what was the last value checked
lastX = x
# at the end make J to be the correctJ (just in case J has not grammar errors)
j = correctJ
# add the corrections to a new dictionary
newTxt[i]=j
# show the resoult
print(f"txt = {txt}\nnewTxt = {newTxt}")
Termina:
txt = {'transaction_type': 'renewal', 'policynum': 3848848, 'reason': 'Impressed with \n the Service'}
newTxt = {'transaction_type': 'renewal', 'policynum': 3848848, 'reason': 'Impressed with the Service'}
Process finished with exit code 0

How to swap string and save

I am having problems saving the file i modifyed basicly i need to replace in original file string called DTC_5814_removing and switch_data and save it as a seperate file how would i do that, so here is what program basicly does, it opens eeprom file, then searches for a string between two strings and groups it, then counts the data and by that given data searches for other string that is between two strings and modyfies data,basicly the code works i have a question how is the best way to save that as a seperate file, filesave function currently has no functin
here is the code:
import re
#checking the structures counting
file = open ("eeprom", "rb") .read().hex()
filesave = open("eepromMOD", "wb")
DTC_data = re.search("ffff30(.*)100077", file)
DTC_data_final = print (DTC_data.group(1))
#finds string between two strings in 2nd line of eeprom file
switch_data = re.search("010607(.*)313132", file)
switch_data_final = print (switch_data.group(1))
#finds string betwenn two strings in 3rd line of eeprom file
DTC_data_lenght = (len(DTC_data.group(1)))
#lenght of the whole DTC_data group
DTC_312D = re.search("ffff30(.*)312d", file)
DTC_3036 = re.search("ffff30(.*)3036", file)
DTC_5814 = re.search("ffff30(.*)5814", file)
#searching for DTC 312D
DTC_312D_lenght = (len(DTC_312D.group(1))+4)
DTC_312D_lenght_start =(len(DTC_312D.group(1)))
DTC_3036_lenght = (len(DTC_3036.group(1))+4)
DTC_3036_lenght_start =(len(DTC_3036.group(1)))
DTC_5814_lenght = (len(DTC_5814.group(1))+4)
DTC_5814_lenght_start =(len(DTC_5814.group(1)))
#confirming the lenght of the DTC table
if DTC_312D_lenght <= DTC_data_lenght and DTC_312D_lenght%4==0 :
#If dtc lenght shorter than whole table and devidable by 4
print("Starting DTC removal")
#Printing for good count
switch_data_lenght = (len(switch_data.group(1)))
#Counting switch data table
DTC_312D_removing = switch_data.group(1)[:DTC_312D_lenght_start] + "0000" + switch_data.group(1)[DTC_312D_lenght:]
#Read from data group (data[:define start] + "mod to wish value" + data[define end]
print(DTC_312D_removing)
else:
print("DTC non existant or incorrect")
if DTC_3036_lenght <= DTC_data_lenght and DTC_3036_lenght%4==0 :
#If dtc lenght shorter than whole table and devidable by 4
print("Starting DTC removal")
#Printing for good count
switch_data_lenght = (len(switch_data.group(1)))
#Counting switch data table
DTC_3036_removing = DTC_312D_removing[:DTC_3036_lenght_start] + "0000" + switch_data.group(1)[DTC_3036_lenght:]
#Read from data group (data[:define start] + "mod to wish value" + data[define end]
print(DTC_3036_removing)
else:
print("DTC non existant or incorrect")
if DTC_5814_lenght <= DTC_data_lenght and DTC_5814_lenght%4==0 :
#If dtc lenght shorter than whole table and devidable by 4
print("Starting DTC removal")
#Printing for good count
switch_data_lenght = (len(switch_data.group(1)))
#Counting switch data table
DTC_5814_removing = DTC_3036_removing[:DTC_5814_lenght_start] + "0000" + switch_data.group(1)[DTC_5814_lenght:]
#Read from data group (data[:define start] + "mod to wish value" + data[define end]
print(DTC_5814_removing)
else:
print("DTC non existant or incorrect")
Solved with
File_W = file.replace(switch_data.group(1), DTC_5814_removing)
File_WH = binascii.unhexlify(File_W)
filesave.write(File_WH)
filesave.close()

Unknown column added in user input form

I have a simple data entry form that writes the inputs to a csv file. Everything seems to be working ok, except that there are extra columns being added to the file in the process somewhere, seems to be during the user input phase. Here is the code:
import pandas as pd
#adds all spreadsheets into one list
Batteries= ["MAT0001.csv","MAT0002.csv", "MAT0003.csv", "MAT0004.csv",
"MAT0005.csv", "MAT0006.csv", "MAT0007.csv", "MAT0008.csv"]
#User selects battery to log
choice = (int(input("Which battery? (1-8):")))
def choosebattery(c):
done = False
while not done:
if(c in range(1,9)):
return Batteries[c]
done = True
else:
print('Sorry, selection must be between 1-8')
cfile = choosebattery(choice)
cbat = pd.read_csv(cfile)
#Collect Cycle input
print ("Enter Current Cycle")
response = None
while response not in {"Y", "N", "y", "n"}:
response = input("Please enter Y or N: ")
cy = response
#Charger input
print ("Enter Current Charger")
response = None
while response not in {"SC-G", "QS", "Bosca", "off", "other"}:
response = input("Please enter one: 'SC-G', 'QS', 'Bosca', 'off', 'other'")
if response == "other":
explain = input("Please explain")
ch = response + ":" + explain
else:
ch = response
#Location
print ("Enter Current Location")
response = None
while response not in {"Rack 1", "Rack 2", "Rack 3", "Rack 4", "EV001", "EV002", "EV003", "EV004", "Floor", "other"}:
response = input("Please enter one: 'Rack 1 - 4', 'EV001 - 004', 'Floor' or 'other'")
if response == "other":
explain = input("Please explain")
lo = response + ":" + explain
else:
lo = response
#Voltage
done = False
while not done:
choice = (float(input("Enter Current Voltage:")))
modchoice = choice * 10
if(modchoice in range(500,700)):
vo = choice
done = True
else:
print('Sorry, selection must be between 50 and 70')
#add inputs to current battery dataframe
log = pd.DataFrame([[cy,ch,lo,vo]],columns=["Cycle", "Charger", "Location", "Voltage"])
clog = pd.concat([cbat,log], axis=0)
clog.to_csv(cfile, index = False)
pd.read_csv(cfile)
And I receive:
Out[18]:
Charger Cycle Location Unnamed: 0 Voltage
0 off n Floor NaN 50.0
Where is the "Unnamed" column coming from?
There's an 'unnamed' column coming from your csv. The reason most likely is that the lines in your input csv files end with a comma (i.e. your separator), so pandas interprets that as an additional (nameless) column. If that's the case, check whether your lines end with your separator. For example, if your files are separated by commas:
Column1,Column2,Column3,
val_11, val12, val12,
...
Into:
Column1,Column2,Column3
val_11, val12, val12
...
Alternatively, try specifying the index column explicitly as in this answer. I believe some of the confusion stems from pandas concat reordering your columns .

Resources