Python script is locked when accessing SQLite database in loop - linux

please watch through the code of my parser. It grabs some statistics from web pages accessing them in a loop and puts specified records in SQLite3 database.
Everything goes right until line 87 (the SQL statement), where the process consumes all CPU resources and effectively gets blocked.
File "./parser.py", line 86, in
while (j < i):
Database file in the beginning of the code is created with correct structure, so the problem is in loops. Inner block of main loop for season in season_list: works just fine. Here is the whole code of my script:
#!/usr/bin/env python
from bs4 import BeautifulStoneSoup
from urllib2 import urlopen
import re
import sqlite3
from time import gmtime, strftime
# Print start time
print "We started at ", strftime("%Y-%m-%d %H:%M:%S", gmtime())
# Create DB
print "Trying to create DB"
con = sqlite3.connect('england.db')
cur = con.cursor()
sql = """\
CREATE TABLE english_premier_league (
id_match INTEGER PRIMARY KEY AUTOINCREMENT,
season TEXT,
tour INTEGER,
date TEXT,
home TEXT,
visitor TEXT,
home_score INTEGER,
visitor_score INTEGER
);
"""
try:
cur.executescript(sql)
except sqlite3.DatabaseError as err:
print "Error creating database: ", err
else:
print "Succesfully created your database..."
con.commit()
cur.close()
con.close()
# list of variables
postfix = 2011
threshold = 1999
season_list = []
while postfix >= threshold:
end = (postfix + 1) % 2000
if (end >= 10):
season = str(postfix) + str(end)
else:
season = str(postfix) + str(0) + str(end)
season_list.append(season)
postfix -= 1
print season_list
# main loop
for season in season_list:
href = 'http://www.stat-football.com/en/a/eng.php?b=10&d='+season+'&c=51'
print href
xml = urlopen(href).read()
xmlSoup = BeautifulStoneSoup(xml)
tablet = xmlSoup.find(attrs={"class" : "bd5"})
#Access DB
con = sqlite3.connect('england.db')
cur = con.cursor()
#Parse site
tour = tablet.findAll(attrs = { "class" : re.compile(r"^(s3|cc s3)$") })
date = tablet.findAll(text = re.compile(r"(0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[012])\.(19|20)\d\d"))
home = tablet.findAll(attrs = {"class" : "nw"})
guest = tablet.findAll(attrs = {"class" : "s1"})
score = tablet.findAll(attrs = {"class" : "nw pr15"})
#
def parse_string(sequence):
result=[]
for unit in sequence:
text = ''.join(unit.findAll(text=True))
result.append(text.strip())
return result
tour_list=parse_string(tour)
home_list=parse_string(home)
guest_list=parse_string(guest)
score_list=parse_string(score)
#Loop over found records to put them into sqlite3 DB
i = len(tour_list)
j = 0
while (j < i):
sql_add = 'INSERT INTO english_premier_league (season, tour, date, home, visitor, home_score, visitor_score) VALUES (?, ?, ?, ?, ?, ?, ?)'
match = (season, int(tour_list[j]), date[j], home_list[j], guest_list[j], int(score_list[j][0:1]), int(score_list[j][2:3]))
try:
cur.executemany(sql_add, match)
except sqlite3.DatabaseError as err:
print "Error matching the record: ", err
else:
con.commit()
part = float(j)/float(i)*100
if (part%10 == 0):
print (int(part)), "%"
j += 1
cur.close()
con.close()
Also it may be useful to look at the end of strace output:
getcwd("/home/vitaly/football_forecast/epl", 512) = 35
stat("/home/vitaly/football_forecast/epl/england.db",
{st_mode=S_IFREG|0644, st_size=24576, ...}) = 0
open("/home/vitaly/football_forecast/epl/england.db", O_RDWR|O_CREAT,
0644) = 3 fcntl(3, F_GETFD) = 0 fcntl(3,
F_SETFD, FD_CLOEXEC) = 0 fstat(3, {st_mode=S_IFREG|0644,
st_size=24576, ...}) = 0 lseek(3, 0, SEEK_SET) = 0
read(3, "SQLite format 3\0\4\0\1\1\0# \0\0\1~\0\0\0\30"..., 100) =
100
I'm running Python 2.7 on Ubuntu 12.04. Thanks a lot.

Replace cur.executemany(sql_add, match) with cur.execute(sql_add, match). executemany() is used for performing the same operation multiple times over an iterable of values. For example, if you had this:
match = [ (season1, tour1, date1, home1, visitor1, home_score1, visitor_score1),
(season2, tour2, date2, home2, visitor2, home_score2, visitor_score2),
(season3, tour3, date3, home3, visitor3, home_score3, visitor_score3) ]
cur.executemany(sql_add, match)
... it would be appropriate, since the cursor could iterate over the tuples in match and perform the insert operation on each of them.

Related

Сompare two faces using python3 module face_recognition?

Sorry for my bad English.
I am trying to compare two faces using python3 module 'face_recognition'
here is an example of calculating euclidean distance in python
pdist([vector1, vector2], 'euclidean')
I want to calculate euclidean distance only in SQL query, because all faces(theirs vectors) will be stored in my database, but I do not know how to do this with a SQL query.
Information:
MariaDB version: 10.5.11
Python: 3.9.2
#!/usr/bin/env python3
"""Store a face-encoding vector in MySQL, then load a second one to compare."""
import cv2
import face_recognition
import mysql.connector as mysql


def get_image_hash(image):
    """Return a 1-tuple containing the string form of the first face encoding."""
    # Read the picture from disk.
    img = face_recognition.load_image_file(image)
    # Grayscale conversion kept disabled, as in the original:
    # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Only the first detected face is used.
    vector = face_recognition.face_encodings(img)[0]
    return (str(vector),)


# Open DB
conn = mysql.connect(
    host='127.0.0.1',
    user='user',
    passwd='password',
)
cur = conn.cursor()
cur.execute("SHOW DATABASES")
# Check if db 'test' already exist.  The whole result set is consumed
# on purpose (the cursor is unbuffered).
db_found = False
for db in cur:
    if 'test' in db:
        db_found = True
if not db_found:
    cur.execute("CREATE DATABASE IF NOT EXISTS test;")
    conn.commit()
cur.execute("USE test;")
cur.execute("""CREATE TABLE IF NOT EXISTS faces(id_face BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT, face_hash TEXT)""")

# Add face(array) in DB
new_image = get_image_hash('test.jpg')
cur.execute('''INSERT INTO faces (face_hash) VALUES(%s)''', new_image)
conn.commit()

# Upload a picture for search
find_me_image = get_image_hash('findme.jpg')
# print('d: ', find_me_image[0])

# How should i compare these two arrays in my SQL query to find a similar face?
cur.execute("SELECT * FROM faces WHERE ..... ;")
cur.close()
print('find_me_image: ', str(find_me_image))
print('new_image: ', str(new_image))
Result:
Find_me_image: ('[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808066 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.1824664 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.00201617 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.01165112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.13573587 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.01903715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.17672779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.06180557 -0.01749492\n -0.023851 0.11586148]',)
New_image: ('[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808066 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.1824664 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.00201617 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.01165112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.13573587 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.01903715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.17672779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.06180557 -0.01749492\n -0.023851 0.11586148]',)
New:
#!/usr/bin/env python3
import cv2
import json
import face_recognition
import mysql.connector as mysql
# DB
# Connect to the local MySQL/MariaDB server.
# NOTE(review): the credentials are empty here - fill them in before running.
conn = mysql.connect(
host = 'localhost',
user = '',
passwd = ''
)
def load(str_data):
    """Parse the printed form of a NumPy vector back into a list of
    per-line lists of floats; non-numeric fragments are skipped."""
    stripped = str_data.replace("[", "").replace("]", "")
    rows = []
    for line in stripped.split("\n"):
        row = []
        for token in line.split(" "):
            try:
                row.append(float(token))
            except ValueError:
                # Blank or non-numeric fragment - ignore it.
                pass
        rows.append(row)
    return rows
def distance(model, test):
    """Sum of per-row Euclidean distances between two equally shaped
    lists of float rows."""
    total = 0
    for row_idx, row in enumerate(model):
        sq_sum = 0
        for col, value in enumerate(row):
            sq_sum += (value - test[row_idx][col]) ** 2
        total += sq_sum ** 0.5
    return total
def get_image_hash(image):
    """Encode the first face found in *image* and return the string form
    of the vector wrapped in a 1-tuple (the DB API expects a sequence)."""
    img = face_recognition.load_image_file(image)
    # Grayscale conversion intentionally left disabled:
    # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    encoding = face_recognition.face_encodings(img)[0]
    # We can save only iterable object
    return (str(encoding),)
# Buffered cursor: result sets are fetched eagerly, so we can issue
# follow-up statements without draining each result by hand.
cur = conn.cursor(buffered=True)
cur.execute("SHOW DATABASES")
# Check if db 'test' already exist
db_found = False
for listed in cur:
    if 'test' in listed:
        db_found = True
if not db_found:
    cur.execute("CREATE DATABASE IF NOT EXISTS test;")
    conn.commit()
cur.execute("USE test;")
cur.execute("""CREATE TABLE IF NOT EXISTS faces(id_face BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT, face_hash TEXT)""")

# Add face in DB
new_image = get_image_hash('test.jpg')
print('new_image debug: ', new_image)
cur.execute('''INSERT INTO faces (face_hash) VALUES(%s)''', new_image)
conn.commit()

# Find added face
find_me_image = get_image_hash('findme.jpg')
print('debug find_me_image: ', find_me_image)

# Get data from DB
cur.execute("SELECT * FROM faces;")
face_data = cur.fetchall()

# Compare the probe vector against every stored row.
for row in face_data:
    print('1: ', load(find_me_image[0]))
    print('2: ', load(row[1]))
    # row[1] == face_hash column
    compare_result = distance(load(find_me_image[0]), load(row[1]))
    # print('Result: ', compare_result)

# Got error
'''
Traceback (most recent call last):
File "/home/user/Desktop/parser_steam/image_recognition/test/./test.py", line 102, in <module>
compare_result = distance(load(find_me_image[0]), load(x[1]))
File "/home/user/Desktop/parser_steam/image_recognition/test/./test.py", line 35, in distance
dist_line += (element - test[i][j]) ** 2
IndexError: list index out of range
'''
cur.close()
Error:
Here is what you need!
import json
def load(str_data):
    """Rebuild the list-of-rows float matrix from a printed NumPy vector.

    Fragments that do not parse as floats (empty strings produced by the
    split) are silently discarded.
    """
    text = str_data.replace("[", "").replace("]", "")
    result = []
    for i, line in enumerate(text.split("\n")):
        result.append([])
        for piece in line.split(" "):
            try:
                result[i].append(float(piece))
            except ValueError:
                pass
    return result
def distance(model, test):
    """Accumulate the Euclidean norm of each row difference between
    *model* and *test* (same shapes assumed)."""
    acc = 0
    for i, row in enumerate(model):
        squares = 0
        for j, value in enumerate(row):
            squares += (value - test[i][j]) ** 2
        acc += squares ** 0.5
    return acc
if __name__ == "__main__":
Find_me_image = '[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808066 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.1824664 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.00201617 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.01165112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.13573587 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.01903715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.17672779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.06180557 -0.01749492\n -0.023851 0.11586148]'
New_image = '[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808064 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.18246 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.0020117 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.0116112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.1357387 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.0193715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.1762779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.0618057 -0.01749492\n -0.023851 0.1158648]'
print(distance(
load(Find_me_image),
load(New_image)
))
You first need to convert your data using the load function. Then calculate the distance using the distance function.
As your data are the same, I modified the New_image data to test the function.

how to fetch data stored in sqlite3 database and assign it to the variables in tkinter python

In the code below I saved value1 and value2 to the sqlite3 database and txt_ in folder named data.
What I am trying to achieve here is that when I rerun the programme and open the file, txt_ file should be open in the text area with the lines I added when I saved it. And when I click add button, value1 and value2 should be updated and newly created line should be in the next line.
Let me know if my method is correct, if not then please tell me the better one.
CODE:
from tkinter import *
from tkinter import messagebox
import sqlite3
import os
# Main application window, fixed at 400x400 pixels.
root = Tk()
root.geometry('400x400')
# Holds the record id typed by the user; read by open_() to pick a file/row.
var_e = StringVar(None)
def create_my_db():
    """Create the SQLite file 'my db.db' and its myLogs table if missing.

    myLogs holds one (int_value, float_value) pair per saved session.
    """
    conn = sqlite3.connect(database=r'my db.db')
    cur = conn.cursor()
    cur.execute("""CREATE TABLE IF NOT EXISTS "myLogs"
(
"int_value" INTEGER,
"float_value" REAL
)
""")
    conn.commit()
    # BUG FIX: the connection was never closed, leaking a handle every
    # time the program started; close it once the schema is committed.
    conn.close()

create_my_db()
def add_lbl():
    """Bump both counters and append a matching line to the text widget."""
    global value1, value2
    value1 += 1
    value2 += 1
    txt_.insert(END, f'This is line {value1} which has value of {value2}\n')
def save():
    """Persist the current value1/value2 counters as a new row in myLogs."""
    conn = sqlite3.connect(database=r'my db.db')
    cur = conn.cursor()
    cur.execute("""INSERT INTO myLogs VALUES (?,?)""",
                (value1, value2))
    conn.commit()
    # BUG FIX: close the connection instead of leaking one per save.
    conn.close()
    # labels to check if the values are stored in the database
    values_lbl.config(text=f'value 1 is [ {value1} ] & value 2 is [ {value2} ]')
def save_txt():
    """Dump the text widget contents to data/<value1>.txt."""
    # IMPROVEMENT: context manager guarantees the file handle is closed
    # even if the write raises.
    with open(f'data/{value1}.txt', 'w') as file_txt:
        file_txt.write(txt_.get(1.0, END))
    messagebox.showinfo('SAVED', 'Data saved to the database.')
def open_():
    """Load data/<entry value>.txt into the text widget, if such a file exists."""
    for txt_file in os.listdir("data/"):
        if txt_file.split('.')[0] == f'{var_e.get()}':
            # IMPROVEMENT: context manager closes the file even on error.
            with open(f"data/{txt_file}", "r") as file_:
                for i in file_:
                    txt_.insert(END, i)
# Session counters shown in the label and stored in myLogs on Save.
value1 = 0
value2 = 0.9
values_lbl = Label(root, text=f'value 1 is [ {value1} ] & value 2 is [ {value2} ]')
values_lbl.pack()
# Row of controls: Add | <entry> | Open | Save.
btn_frame = Frame(root)
btn_frame.pack()
btn_add = Button(btn_frame, text='Add', command=add_lbl)
btn_add.pack(side=LEFT)
e = Entry(btn_frame, textvariable=var_e)
e.pack(side=LEFT)
btn_open = Button(btn_frame, text='Open', command=open_)
# Save triggers both the DB insert and the text-file dump.
btn_save = Button(btn_frame, text='Save', command=lambda:[save(), save_txt()])
btn_open.pack(side=LEFT)
btn_save.pack(side=LEFT)
# Main editing area filling the rest of the window.
txt_ = Text(root)
txt_.pack(fill=BOTH, expand=True)
root.mainloop()
When I posted this question I didn't know how to run the query to update value1 and value2, that's why I didn't mention the query in open_() function. Now I came to know how that query should have been done. So in the below code I added the query to open_() function. Now the complete programme runs fine.
def open_():
    """Reload a saved session: the matching text file into the widget and
    the matching counters from the myLogs table."""
    global value1, value2
    txt_.delete(1.0, END)
    for txt_file in os.listdir("data/"):
        if txt_file.split('.')[0] == f'{var_e.get()}':
            # IMPROVEMENT: context manager closes the file even on error.
            with open(f"data/{txt_file}", "r") as file_:
                for i in file_:
                    txt_.insert(END, i)
    conn = sqlite3.connect(database=r'my db.db')
    cur = conn.cursor()
    cur.execute("""SELECT * FROM myLogs WHERE int_value=?""", (var_e.get(),))
    row = cur.fetchone()
    if row is None:
        messagebox.showerror("ERROR", 'Invalid input.')
    else:
        value1 = row[0]
        value2 = row[1]
    # BUG FIX: nothing was modified, so the commit was pointless; what was
    # actually missing is closing the connection to avoid leaking it.
    conn.close()

TypeError: 'NoneType' object is not iterable (Python3 with Oracle 19c)

Python 3.6.3 /
Oracle 19c
The following script runs fine till it hits upc_id, = cur.fetchone(). Could someone explain, please what may cause it? If I run the query in database, I get the result back (see below). Is there a way to see exactly what Oracle runs, after variable substitution? I suspect single quotes are not in place for bind variables, but how can I confirm?
import datetime
import cx_Oracle
import db

# A SKU is STYLE_COLOR_SIZE, optionally with a trailing _DIMENSION part.
line_item_sku = 'Y35FLPQ_034Y_M'
x = line_item_sku.split("_")
print (x)
print ("Split-list len: "+ str(len(x)))
if len(x) == 3:
    sku_with_dimension = False
elif len(x) == 4:
    sku_with_dimension = True
print ("SKU with dimension: " + str(sku_with_dimension))
style_id = x[0]
color_id = x[1]
size_id = x[2]
if sku_with_dimension:
    dimension_id = x[3]
print ("Style: "+style_id)
print ("Color: "+color_id)
print ("Size: "+size_id)

conn = db.connect('Galo')
print ("Connected to: " + conn.version)
cur = conn.cursor()
print ("Assigned return value")

# Pick the query variant that matches the SKU shape; bind variables are
# positional (:1..:4), so quoting is handled by the driver.
if sku_with_dimension:
    sql = ("""
select upc_id
from sku
where business_unit_id = '81'
and style_id = :1
and color_id = :2
and identifier_id = 'EA'
and size_id = :3
and dimension_id = :4
""")
    cur.execute(sql, (style_id, color_id, size_id, dimension_id))
else:
    sql = ("""
select upc_id
from sku
where business_unit_id = '81'
and style_id = :1
and color_id = :2
and identifier_id = 'EA'
and size_id = :3
""")
    cur.execute(sql, (style_id, color_id, size_id))
print ("Determined which query to run")

# BUG FIX: fetchone() returns None when the query matches no row, and
# unpacking None raises "TypeError: 'NoneType' object is not iterable".
# Guard before unpacking.
row = cur.fetchone()
if row is None:
    print ("No UPC found for " + line_item_sku)
else:
    upc_id, = row
    print (upc_id)
db.disconnect(conn, cur)
Here is the output
'Y35FLPQ', '034Y', 'M']
Split-list len: 3
SKU with dimension: False
Style: Y35FLPQ
Color: 034Y
Size: M
Connected to: 19.0.0.0.0
Assigned return value
Determined which query to run
Traceback (most recent call last):
File "c:/Python/code/test.py", line 66, in <module>
upc_id, = cur.fetchone()
TypeError: 'NoneType' object is not iterable
If I run the query in database, I receive a result back:

Google Cloud Spanner Merge SQL Equivalent process in Python using Google API's

How to perform a Merge SQL as stated below in Google Cloud Spanner using Google API's?
MERGE INTO TABLE2 B
USING (SELECT COL1, COL2, SUM(TOTAL_CNT)
FROM TABLE1 GROUP BY COL1, COL2) A
ON (B.COL1=A.COL1 AND B.COL2 = A.COL2)
WHEN MATCHED THEN
UPDATE SET B.TOTAL_CNT = B.TOTAL_CNT + A.TOTAL_CNT)
WHEN NOT MATCHED THEN
INSERT (COL1, COL2, TOTAL_CNT)
VALUES (A.COL1.A.COL2,A.TOTAL_CNT)
I would say that you can use similar SQL clauses such as union and intersect to achieve your goal, this post elaborates on the goal. I think your approximation in your response using joins is also good.
Whenever you have to perform merge SQL, it needs to be broken down to 2 steps.
The first step is to do a left join with the target table to get the values you want; with that result set, we then perform a batch insert_or_update. This saves a lot of look-ups and is more efficient. I've made the batch insert_or_update multithreaded so that you can trigger more threads and the process finishes quicker. If you don't need to be that fancy, you can write it as in-line code.
'''
import threading
import pandas as pd
import datetime
import time
from merge_ins_upd_using_df import merge_ins_upd_using_df
from google.cloud import spanner

# Instantiate a client.
spanner_client = spanner.Client()
# Your Cloud Spanner instance ID.
instance_id = 'spanner-instance'
# Get a Cloud Spanner instance by ID.
instance = spanner_client.instance(instance_id)
# Your Cloud Spanner database ID.
database_id = 'database-id'

# Threading bookkeeping.
max_thread_cnt = 30
threadLimiter = threading.BoundedSemaphore(max_thread_cnt)
thread_list = []
thread_count = 0
thread_cnt_before = 0
thread_counter = 0

# Left-join the aggregated source onto the target so each row already
# carries the merged TOTAL_CNT.
sql_stmt = """ (SELECT A.COL1, A.COL2, SUM(A.TOTAL_CNT + COALESCE(B.TOTAL_CNT,0)) AS TOTAL_CNT
FROM (SELECT COL1, COL2, SUM(TOTAL_CNT) AS TOTAL_CNT
FROM TABLE1 GROUP BY COL1, COL2) A
LEFT JOIN TABLE2 B on (A.COL1 = B.COL1 AND A.COL2 = B.COL2) """

spanner_client = spanner.Client()
instance = spanner_client.instance(instance_id )
database = instance.database(database_id)
with database.snapshot() as snapshot:
    results = snapshot.execute_sql(sql_stmt)
    df = pd.DataFrame(results)
df.columns = ['COL1', 'COL2', 'TOTAL_CNT']

process_cnt = 10  # set this count based on the number of columns/index updates so that it wont go beyond 20,000 mutations limit
rec_cnt = df.shape[0]
print('Total Rec Count: ' + str(rec_cnt))
total_rec_processed = 0
from_index = 0
to_index = 0
dest_table = 'TABLE2'

### Build the threads
while True:
    from_index = to_index
    to_index = to_index + process_cnt
    thread_counter = thread_counter + 1
    if to_index > rec_cnt:
        to_index = rec_cnt
    df1 = df[from_index:to_index]
    thread_count += 1
    t = threading.Thread(target=merge_ins_upd_using_df,
                         args=(instance_id, database_id, df1, thread_counter, dest_table))
    thread_list.append(t)
    total_rec_processed = total_rec_processed + process_cnt
    # print("Threads Added: " + str(thread_count) + "Proc Count:" + str(total_rec_processed ))
    if total_rec_processed >= rec_cnt:
        break

begin = datetime.datetime.now()
print("Thread Kick-off has Started : " + str(begin))
print ("Thread Count before :" + str(threading.active_count()))
thread_cnt_before = threading.active_count()

# Starts threads
for thread in thread_list:
    # Throttle: wait until a slot frees up below the cap before starting.
    while threading.active_count() >= max_thread_cnt:
        time.sleep(.05)
    thread.start()
print ("Thread Count after :" + str(threading.active_count()))
print("All Threads have been kicked off : " + str(datetime.datetime.now()))

if thread_count > 0:
    # Wait for the worker threads to drain back to the starting count.
    while threading.active_count() > thread_cnt_before:
        time.sleep(2)
    end = datetime.datetime.now()
    diff = end-begin
    print("Total time for completion in minutes : " + str(diff.total_seconds()/60))
####### function - merge_ins_upd_using_df
class merge_ins_upd_using_df:
    """Thread target that batch insert_or_update's a DataFrame slice into Spanner.

    Written as a class whose constructor does all the work, so that
    threading.Thread(target=merge_ins_upd_using_df, args=...) executes the
    upsert when the thread calls the target.
    """
    def __init__(self, cs_instance, cs_database, df, thread_counter, dest_table):
        self.cs_instance = cs_instance
        self.cs_database = cs_database
        self.thread_counter = thread_counter
        self.df = df
        self.dest_table = dest_table
        from google.cloud import spanner
        import datetime
        started = datetime.datetime.now()
        spanner_client = spanner.Client()
        instance = spanner_client.instance(cs_instance)
        database = instance.database(cs_database)
        # One Spanner batch per slice: insert new keys, update existing ones.
        with database.batch() as batch:
            batch.insert_or_update(
                table=dest_table, columns=df.columns,
                values=df.values.tolist())
        finished = datetime.datetime.now()
        elapsed = finished - started
        ### add logic to handle exceptions

Read records from CSV file and print report

I have been working on a program for a week now, but have been unable to get it to work according to the guidelines.
In this program (payroll.py), I have to open the CSV data file (employees.csv), read the records in the file, and produce a payroll report using the functions in payroll.py. The output should be printed, not written to a separate output file, and should end up looking like this:
LastName FirstName Hours RegHours OTHours RegPay OTPay GrossPay Deductions NetPay
Hightower Michael 42.0 40.0 2.0 400.00 30.00 430.00 107.07 322.93
Jackson Samuel 53.0 40.0 13.0 506.00 246.68 752.67 187.42 565.25
Jones Catherine 35.0 35.0 0.00 680.05 0.00 680.05 169.33 510.72
The payroll program works just fine on its own (without calling the CSV file), but when I try to call the file (using "from csv import reader"), one of two things happens:
1) I can call the first three columns (last name, first name, and hours), but I am unable to "insert" the additional columns (I get an index error because, of course, those columns don't exist in the original CSV file), or
2) The program only pulls up one entire record, which happens to be the last record in the CSV file.
Any guidance on how to accomplish this would be greatly appreciated. Thank you.
Here is the code for payroll.py:
def main():
    """Drive one interactive payroll run: prompt, compute, then report."""
    # Gather the employee's identity and raw pay inputs.
    employeeFirstName, employeeLastName = employeeFullName()
    employeePayRate, employeeHoursWorked = employeePay()
    # Split hours into regular/overtime (the second call recomputes
    # overtime exactly as the original did).
    employeeRegularHours, employeeOvertimeHours = calculateRegularHours(employeeHoursWorked)
    employeeOvertimeHours = calculateOvertimeHours(employeeHoursWorked)
    employeeTotalHours = calculateTotalHours(employeeRegularHours, employeeOvertimeHours)
    # Pay components.
    regularPayAmount = calculateRegularPay(employeePayRate, employeeRegularHours)
    overtimePayAmount = calculateOvertimePay(employeePayRate, employeeOvertimeHours)
    grossPayAmount = calculateGrossPay(regularPayAmount, overtimePayAmount)
    # Withholdings.
    federalTaxWithheld = calculateFederalTax(grossPayAmount)
    stateTaxWithheld = calculateStateTax(grossPayAmount)
    medicareTaxWithheld = calculateMedicareTax(grossPayAmount)
    socSecTaxWithheld = calculateSocSecTax(grossPayAmount)
    totalTaxesWithheld = calculateTotalTaxes(federalTaxWithheld, stateTaxWithheld, medicareTaxWithheld, socSecTaxWithheld)
    netPayAmount = calculateNetPay(grossPayAmount, totalTaxesWithheld)
    payrollSummaryReport(employeeFirstName, employeeLastName, employeePayRate, employeeRegularHours, employeeOvertimeHours, employeeTotalHours, regularPayAmount, overtimePayAmount, grossPayAmount, federalTaxWithheld, stateTaxWithheld, medicareTaxWithheld, socSecTaxWithheld, totalTaxesWithheld, netPayAmount)
def employeeFullName():
    """Prompt for and return (first_name, last_name)."""
    first = str(input("Enter the employee's first name: "))
    last = str(input("Enter the employee's last name: "))
    return first, last

def employeePay():
    """Prompt for and return (hourly_pay_rate, hours_worked) as floats."""
    rate = float(input("Enter the employee's hourly pay rate: "))
    hours = float(input("Enter the employee's hours worked: "))
    return rate, hours
def calculateRegularHours(employeeHoursWorked):
    """Return (regular_hours, overtime_hours); regular is capped at 40."""
    if employeeHoursWorked < 40:
        return employeeHoursWorked, 0
    return 40, employeeHoursWorked - 40

def calculateOvertimeHours(employeeHoursWorked):
    """Return hours beyond the 40-hour regular week (0 if none)."""
    return employeeHoursWorked - 40 if employeeHoursWorked > 40 else 0
def calculateTotalHours(employeeRegularHours, employeeOvertimeHours):
    """Total of regular and overtime hours."""
    return employeeRegularHours + employeeOvertimeHours

def calculateRegularPay(employeePayRate, employeeHoursWorked):
    """Straight-time pay: hourly rate times (regular) hours.

    NOTE(review): callers pass the *regular* hours here despite the
    parameter's name.
    """
    return employeePayRate * employeeHoursWorked

def calculateOvertimePay(employeePayRate, employeeOvertimeHours):
    """Overtime pay at time-and-a-half."""
    return (employeePayRate * employeeOvertimeHours) * 1.5

def calculateGrossPay(regularPayAmount, overtimePayAmount):
    """Gross pay before any deductions."""
    return regularPayAmount + overtimePayAmount
def calculateFederalTax(grossPayAmount):
    """Federal withholding at a flat 12.4% of gross pay."""
    return grossPayAmount * 0.124

def calculateStateTax(grossPayAmount):
    """State withholding at a flat 4.9% of gross pay."""
    return grossPayAmount * 0.049

def calculateMedicareTax(grossPayAmount):
    """Medicare withholding at a flat 1.4% of gross pay."""
    return grossPayAmount * 0.014

def calculateSocSecTax(grossPayAmount):
    """Social Security withholding at a flat 6.2% of gross pay."""
    return grossPayAmount * 0.062

def calculateTotalTaxes(federalTaxWithheld, stateTaxWithheld, medicareTaxWithheld, socSecTaxWithheld):
    """Sum of all four withheld amounts."""
    return federalTaxWithheld + stateTaxWithheld + medicareTaxWithheld + socSecTaxWithheld

def calculateNetPay(grossPayAmount, totalTaxesWithheld):
    """Take-home pay after all deductions."""
    return grossPayAmount - totalTaxesWithheld
def payrollSummaryReport(employeeFirstName, employeeLastName, employeePayRate, employeeRegularHours, employeeOvertimeHours, employeeTotalHours, regularPayAmount, overtimePayAmount, grossPayAmount, federalTaxWithheld, stateTaxWithheld, medicareTaxWithheld, socSecTaxWithheld, totalTaxesWithheld, netPayAmount):
    """Print the one-employee payroll summary table to stdout.

    Several parameters (pay rate, the individual tax amounts) are accepted
    for interface compatibility but not shown in the table.
    """
    print()
    print("\t\t\t\t\t\tPayroll Summary Report")
    print()
    print("%-12s%-12s%-8s%-10s%-10s%-12s%-10s%-11s%-13s%-10s" % ("LastName", "FirstName", "Hours", "RegHours", "OTHours", "RegPay", "OTPay", "GrossPay", "Deductions", "NetPay"))
    print("%-12s%-12s%-8.2f%-10.2f%-10.2f$%-11.2f$%-9.2f$%-10.2f$%-12.2f$%-10.2f" % (employeeLastName, employeeFirstName, employeeTotalHours, employeeRegularHours, employeeOvertimeHours, regularPayAmount, overtimePayAmount, grossPayAmount, totalTaxesWithheld, netPayAmount))

# IMPROVEMENT: guard the entry point so importing this module (e.g. to
# reuse the calculation functions) does not trigger the interactive run.
if __name__ == "__main__":
    main()
The CSV file (employees.csv) I need to use looks like this:
First,Last,Hours,Pay
Matthew,Hightower,42,10
Samuel,Jackson,53,12.65
Catherine,Jones,35,19.43
Charlton,Heston,52,10
Karen,Black,40,12
Sid,Caesar,38,15
George,Kennedy,25,35
Linda,Blair,42,18.6
Beverly,Garland,63,10
Jerry,Stiller,52,15
Efrem,Zimbalist,34,16
Linda,Harrison,24,14
Erik,Estrada,41,15.5
Myrna,Loy,40,14.23
You can treat your .csv file as a regular one. No need for reader. Here is a function that might deal with your file:
def get_data(fname):
    """Read the employee CSV and return {row_index: {field: value, ...}}.

    Template: each record dict is meant to hold the parsed columns plus
    the values produced by the payroll helper functions; only "fname",
    "lname" and "gross" are wired up here, the rest is left as an
    exercise (marked with comments below).
    """
    result = {}  # return value
    with open(fname, 'r') as f:
        # Skip the header row, then number the remaining records from 0.
        for i, line in enumerate(f.readlines()[1:]):
            result[i] = {}
            tmp = line.split(",")  # list of values from file
            # access file values by their index, e.g.
            # tmp[0] -> first name
            # tmp[1] -> last name
            # tmp[2] -> hours
            # tmp[3] -> pay rate
            # do calculations using your functions (calculateOvertimePay,
            # calculateTotalHours, etc.) and store the results in dictionary
            # e.g:
            result[i]["fname"] = tmp[0]
            result[i]["lname"] = tmp[1]
            # ...
            # do calculations for report
            # ...
            # result[i]["regular"] = calc...(....)
            # result[i]["overtime"] = calc...(....)
            result[i]["gross"] = calculateGrossPay(result[i]["regular"], result[i]["overtime"])
    return result
There are several thing your might want to do with your payrollSummaryReport(...) function to improve it:
replace your huge argument list with dict, or list
tinker it a bit to fit your requirements
Your might do your improvements in this way:
def payrollSummaryReport(vals):
    """Print the payroll table with one row per record in *vals*.

    Template: the row print still contains a placeholder string that must
    be replaced by the remaining fields.
    """
    print()
    print("\t\t\t\t\t\tPayroll Summary Report")
    print()
    print("%-12s%-12s%-8s%-10s%-10s%-12s%-10s%-11s%-13s%-10s" %
          ("LastName", "FirstName", "Hours", "RegHours", "OTHours", "RegPay", "OTPay", "GrossPay", "Deductions", "NetPay"))
    for i in vals:
        print("%-12s%-12s%-8.2f%-10.2f%-10.2f$%-11.2f$%-9.2f$%-10.2f$%-12.2f$%-10.2f" %
              (vals[i]["fname"], vals[i]["lname"], vals[i]["gross"], ''' repeat for all fields '''))

Resources