Google Cloud Spanner Merge SQL equivalent process in Python using Google APIs - google-cloud-spanner

How can I perform a MERGE SQL statement like the one below in Google Cloud Spanner using the Google APIs?
MERGE INTO TABLE2 B
USING (SELECT COL1, COL2, SUM(TOTAL_CNT)
         FROM TABLE1 GROUP BY COL1, COL2) A
   ON (B.COL1 = A.COL1 AND B.COL2 = A.COL2)
 WHEN MATCHED THEN
      UPDATE SET B.TOTAL_CNT = B.TOTAL_CNT + A.TOTAL_CNT
 WHEN NOT MATCHED THEN
      INSERT (COL1, COL2, TOTAL_CNT)
      VALUES (A.COL1, A.COL2, A.TOTAL_CNT)

You can use similar SQL clauses such as UNION and INTERSECT to achieve your goal; this post elaborates on that approach. I think the approximation using joins in your response is also good.

Whenever you have to perform a merge in Cloud Spanner, it needs to be broken down into two steps.
The first step is to LEFT JOIN the source rows with the target table to compute the values you want; with that result set you then perform a batch insert_or_update. This avoids a lot of individual look-ups and is more efficient. I've made the batch insert_or_update multithreaded so that you can trigger more threads and the process finishes quicker. If you don't need to be that fancy, you can run it as in-line code (a minimal single-threaded sketch follows the code below).
import threading
import pandas as pd
import datetime
import time
from merge_ins_upd_using_df import merge_ins_upd_using_df
from google.cloud import spanner

# Instantiate a client.
spanner_client = spanner.Client()
# Your Cloud Spanner instance ID.
instance_id = 'spanner-instance'
# Get a Cloud Spanner instance by ID.
instance = spanner_client.instance(instance_id)
# Your Cloud Spanner database ID.
database_id = 'database-id'

max_thread_cnt = 30
threadLimiter = threading.BoundedSemaphore(max_thread_cnt)
thread_list = []
thread_count = 0
thread_cnt_before = 0
thread_counter = 0

# Left join the aggregated source rows against the target table so each
# result row already carries the combined total for TABLE2.
sql_stmt = """ SELECT A.COL1, A.COL2, SUM(A.TOTAL_CNT + COALESCE(B.TOTAL_CNT, 0)) AS TOTAL_CNT
                 FROM (SELECT COL1, COL2, SUM(TOTAL_CNT) AS TOTAL_CNT
                         FROM TABLE1 GROUP BY COL1, COL2) A
                 LEFT JOIN TABLE2 B ON (A.COL1 = B.COL1 AND A.COL2 = B.COL2)
                GROUP BY A.COL1, A.COL2 """

database = instance.database(database_id)
with database.snapshot() as snapshot:
    results = snapshot.execute_sql(sql_stmt)
    df = pd.DataFrame(results)
df.columns = ['COL1', 'COL2', 'TOTAL_CNT']

process_cnt = 10  # rows per thread; keep rows * (columns + indexes) below the 20,000 mutations-per-commit limit
rec_cnt = df.shape[0]
print('Total Rec Count: ' + str(rec_cnt))
total_rec_processed = 0
from_index = 0
to_index = 0
dest_table = 'TABLE2'
### Build the threads
while True:
    from_index = to_index
    to_index = to_index + process_cnt
    thread_counter = thread_counter + 1
    if to_index > rec_cnt:
        to_index = rec_cnt
    df1 = df[from_index:to_index]
    thread_count += 1
    t = threading.Thread(target=merge_ins_upd_using_df,
                         args=(instance_id, database_id, df1, thread_counter, dest_table))
    thread_list.append(t)
    total_rec_processed = total_rec_processed + process_cnt
    # print("Threads Added: " + str(thread_count) + " Proc Count: " + str(total_rec_processed))
    if total_rec_processed >= rec_cnt:
        break
begin = datetime.datetime.now()
print("Thread kick-off has started : " + str(begin))
print("Thread count before : " + str(threading.active_count()))
thread_cnt_before = threading.active_count()

# Start the threads, holding back whenever the active count hits the cap.
for thread in thread_list:
    while threading.active_count() >= max_thread_cnt:
        time.sleep(.05)
    thread.start()

print("Thread count after : " + str(threading.active_count()))
print("All threads have been kicked off : " + str(datetime.datetime.now()))

# Wait for the workers to drain back to the original thread count.
if thread_count > 0:
    while threading.active_count() > thread_cnt_before:
        time.sleep(2)

end = datetime.datetime.now()
diff = end - begin
print("Total time for completion in minutes : " + str(diff.total_seconds() / 60))
####### merge_ins_upd_using_df.py - the worker used as the thread target
def merge_ins_upd_using_df(cs_instance, cs_database, df, thread_counter, dest_table):
    from google.cloud import spanner
    import datetime

    begin = datetime.datetime.now()
    spanner_client = spanner.Client()
    instance = spanner_client.instance(cs_instance)
    database = instance.database(cs_database)
    # insert_or_update writes the whole chunk in one batch: rows whose key
    # already exists in dest_table are updated, the rest are inserted.
    with database.batch() as batch:
        batch.insert_or_update(
            table=dest_table,
            columns=df.columns.tolist(),
            values=df.values.tolist())
    end = datetime.datetime.now()
    diff = end - begin
    ### add logic to handle exceptions
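
If you don't need the threading (for example, when the joined result set is small), the same merge collapses into a single pass: run the LEFT JOIN query once and hand the whole result set to one insert_or_update batch. Below is a minimal in-line sketch of that variant; it assumes the same hypothetical TABLE1/TABLE2 layout, that (COL1, COL2) is TABLE2's primary key, and that the row count stays comfortably under the 20,000-mutations-per-commit limit.
from google.cloud import spanner

def merge_table1_into_table2(instance_id, database_id):
    # Single-threaded merge: aggregate TABLE1, add the existing TABLE2 totals,
    # then upsert the combined rows back into TABLE2 in one batch.
    spanner_client = spanner.Client()
    database = spanner_client.instance(instance_id).database(database_id)

    sql_stmt = """SELECT A.COL1, A.COL2, A.TOTAL_CNT + COALESCE(B.TOTAL_CNT, 0) AS TOTAL_CNT
                    FROM (SELECT COL1, COL2, SUM(TOTAL_CNT) AS TOTAL_CNT
                            FROM TABLE1 GROUP BY COL1, COL2) A
                    LEFT JOIN TABLE2 B ON A.COL1 = B.COL1 AND A.COL2 = B.COL2"""

    with database.snapshot() as snapshot:
        rows = list(snapshot.execute_sql(sql_stmt))

    # insert_or_update plays the role of both MERGE arms: existing
    # (COL1, COL2) keys are updated, new keys are inserted.
    if rows:
        with database.batch() as batch:
            batch.insert_or_update(
                table='TABLE2',
                columns=['COL1', 'COL2', 'TOTAL_CNT'],
                values=rows)

merge_table1_into_table2('spanner-instance', 'database-id')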

Related

How to dynamically create kafka producers

First of all, I am taking baby steps in Python and Kafka. Let's say I have listA = [item1, item2, item3], and every item of listA is a producer on a topic. What I want is to dynamically add/remove items to listA and have them immediately become producers; every item should also run on its own thread, since they should be independent.
So basically I am trying to scale the application.
So far I have hard-coded every producer item and run each one in its own terminal.
Each item looks like this:
from pykafka import KafkaClient
import json
from datetime import datetime
import uuid
import time
input_file = open('./data/item1.json')
json_array = json.load(input_file)
coordinates = json_array['features'][0]['geometry']['coordinates']
# Generate uuid
def generate_uuid():
    return uuid.uuid4()

# Kafka producer
client = KafkaClient(hosts="localhost:9092")
topic = client.topics['test_kafka2']
producer = topic.get_sync_producer()

# Generate all coordinates
def generate_coordinates(coordinates):
    # new_coordinates = []
    i = 0
    while i < len(coordinates):
        data = {}
        data['class'] = 201
        data['key'] = str(data['class']) + '_' + str(generate_uuid())
        data['time_stamp'] = str(datetime.utcnow())
        data['longitude'] = coordinates[i][0]
        data['latitude'] = coordinates[i][1]
        message = json.dumps(data)
        producer.produce(message.encode('ascii'))
        time.sleep(1)
        # If the item reaches the last coordinate, walk back the other way
        if i == len(coordinates) - 1:
            coordinates = coordinates[::-1]
            i = 0
        else:
            i += 1
    # return new_coordinates

generate_coordinates(coordinates)
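
There is no accepted answer recorded here, but one way to approach the dynamic part (a rough sketch, not a definitive implementation) is to keep a dictionary of worker threads keyed by item name, start one thread per item, and signal it to stop when the item is removed. The pykafka calls are the ones already used in the question; the stop_event mechanism, the workers dict, and the './data/<item>.json' naming convention are assumptions for illustration.
import json
import threading
import time
from pykafka import KafkaClient

client = KafkaClient(hosts="localhost:9092")
topic = client.topics['test_kafka2']

workers = {}  # item name -> (thread, stop_event)

def run_item(item, stop_event):
    # One producer per item; loop until the item is removed.
    producer = topic.get_sync_producer()
    with open('./data/%s.json' % item) as f:  # assumed naming convention
        coordinates = json.load(f)['features'][0]['geometry']['coordinates']
    i = 0
    while not stop_event.is_set():
        data = {'item': item,
                'longitude': coordinates[i][0],
                'latitude': coordinates[i][1]}
        producer.produce(json.dumps(data).encode('ascii'))
        time.sleep(1)
        i = (i + 1) % len(coordinates)

def add_item(item):
    stop_event = threading.Event()
    t = threading.Thread(target=run_item, args=(item, stop_event))
    t.daemon = True
    workers[item] = (t, stop_event)
    t.start()

def remove_item(item):
    t, stop_event = workers.pop(item)
    stop_event.set()
    t.join()

for item in ['item1', 'item2', 'item3']:
    add_item(item)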

caching pyspark dataframes leads to no performance gain

I am trying to make my scripts more efficient.
At the moment, I have 10 scripts - they all read data in, process it and output it.
They all read from the same main DB tables though and just do different things with the data.
So I have consolidated them into one script, with the idea that I only read the data once rather than 10 times.
Shouldn't that result in faster execution? Because it doesn't.
Below is an example of the structure I am using.
Any help would be amazing
Thanks
'''
TABLE DEFINITION AND CACHING
'''
spark_session = create_session('usage CF')
usage_logs = spark_session.sql("Select * from db.table where dt = " + yday_date ).cache()
user_logs = spark_session.sql("Select * from db2.table2 where dt = " + yday_date ).cache()
usercat = spark_session.sql("Select * from db3.table3 where dt = " + yday_date ).cache()
radius_logs = spark_session.sql("Select * from db.table4 where dt = " + yday_date )
radius = radius_logs.select('emsisdn2', 'sessionid2', 'custavp1').cache()
'''
usage CF
'''
usage = usage_logs.select('field1', 'field2', 'field3')
conditions = [usage.sid == radius.sessionid2]
df3 = usage.join(radius, conditions, how='left')
df4 = df3.groupBy('field1', 'field2').agg(sqlfunc.sum('field3').alias('bytesdl'))
usage = df4.createOrReplaceTempView('usage')
usage_table_output = spark_session.sql(' insert overwrite table outputdb.outputtbl partition(dt = ' + yday_date + ') select "usage" as type, * from usage ')
'''
user CF
'''
user = usage_logs.filter((usage_logs.vslsessid == '0')).select('field1', 'field2', 'field3', 'field4')
conditionsx = [user.sessionid == radius.sessionid2]
user_joined = user.join(radius, conditionsx, how='left')
user_output = user_joined.groupBy('field1', 'field2', 'field3').agg(sqlfunc.sum('field4').alias('bytesdl'))
user = user_output.createOrReplaceTempView('user')
user_table_output = spark_session.sql(' insert overwrite table outputdb.outputtbl2 partition(dt = ' + yday_date + ') select "user" as type, * from user')
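
No answer is recorded here, but one detail worth checking: cache() is lazy, so nothing is stored until an action materializes the DataFrame, and a cache only pays off when the very same DataFrame object is reused by later jobs. A minimal sketch of forcing materialization up front (the session builder and the yday_date placeholder are assumptions; the table names come from the question):
from pyspark.sql import SparkSession

spark_session = SparkSession.builder.appName('usage CF').getOrCreate()
yday_date = '20240101'  # placeholder for however yday_date is built in the original scripts

# cache() is lazy: it only marks the plan for caching. count() is an action,
# so it forces each shared input to be read once and stored; the downstream
# sections then hit the cache instead of re-reading the DB tables.
usage_logs = spark_session.sql("select * from db.table where dt = " + yday_date).cache()
usage_logs.count()

radius = (spark_session.sql("select * from db.table4 where dt = " + yday_date)
          .select('emsisdn2', 'sessionid2', 'custavp1')
          .cache())
radius.count()

# Every later section must reuse these exact DataFrame objects (usage_logs,
# radius); calling spark_session.sql(...) again builds a new plan and
# bypasses the cache entirely.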

Bigquery CSV file load fail

google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run a Python script that writes query results to a CSV file and then loads that CSV into BigQuery, but I am getting this error. Can anyone explain this error to me?
import csv
#Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded=0
#Instantiates a client
bigquery_client = bigquery.Client()
#Runs a query from BigQuery
def runBigQueryQuery(query, filename, idNeeded):
    if idNeeded == 1:
        i = 1
        query_job = bigquery_client.query(query)
        results = query_job.result()
        with open(filename, 'w', newline='') as f:  # Create CSV file
            write = csv.writer(f, dialect='excel', lineterminator='\n')
            try:
                for row in results:
                    fields = [row.EventId, row.ScheduleId, row.Date, row.TimeFrom,
                              row.Description, row.TimeTo, row.ResourceId, row.EmployeeId,
                              row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag,
                              row.MemberAmount, row.NonMemberAmount, row.Attendance]
                    print(','.join(str(field) for field in fields))
                    write.writerow([i] + fields)  # write row to CSV with a running id
                    i = i + 1
            except AttributeError as error:
                print('An error occurred: {0}'.format(error))
    else:
        query_job = bigquery_client.query(query)
        results = query_job.result()
        with open(filename, 'w', newline='') as f:  # Create CSV file
            write = csv.writer(f, dialect='excel', lineterminator='\n')
            try:
                for row in results:
                    fields = [row.EventId, row.ScheduleId, row.Date, row.TimeFrom,
                              row.Description, row.TimeTo, row.ResourceId, row.EmployeeId,
                              row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag,
                              row.MemberAmount, row.NonMemberAmount, row.Attendance]
                    print(','.join(str(field) for field in fields))
                    write.writerow(fields)  # write row to CSV
            except AttributeError as error:
                print('An error occurred: {0}'.format(error))
    return
#Creates a dataset in BigQuery
def createDataset(datasetname):
    dataset_ref = bigquery_client.dataset(datasetname)
    dataset = Dataset(dataset_ref)
    dataset.location = 'US'
    dataset = bigquery_client.create_dataset(dataset)
    return

def getDataset(datasetname):
    dataset = bigquery_client.dataset(datasetname)
    return dataset

def createTable(tablename, global_dataset_ref):
    schema = [
        #Enter Schema here.
        # SchemaField('url', 'STRING', mode='required'),
        # SchemaField('views', 'INTEGER', mode='required')
    ]
    table_ref = global_dataset_ref.table(tablename)
    table = Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table)
    assert table.table_id == tablename
    return

def getTable(tablename, global_dataset_ref):
    table_ref = global_dataset_ref.table(tablename)
    table = bigquery_client.get_table(table_ref)
    # print(table.table_id)
    print(table.schema)
    # print(table.description)
    # print(table.num_rows)
    return table

def getTableSchema(tablename, global_dataset_ref):
    table_ref = global_dataset_ref.table(tablename)
    table = bigquery_client.get_table(table_ref)
    schema = table.schema
    return schema

def loadDataFromCSV(tablename, global_dataset_ref, filename):
    schema = getTableSchema(tablename, global_dataset_ref)
    table_ref = global_dataset_ref.table(tablename)
    load_config = LoadJobConfig()
    load_config.source_format = bigquery.SourceFormat.CSV
    load_config.schema = schema
    load_config.autodetect = True
    load_config.allow_quoted_newlines = True
    with open(filename, 'rb') as readable:
        job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config)
    job.result()
    print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id))
    return
# Testing
if __name__ == '__main__':
    datasetname = 'Data_Layer'
    tablename = 'Events'
    sqlquery = '''SELECT
null as EventId,
sc.scheduleid AS ScheduleId,
NULL AS Description,
sc.scheduledatefrom AS Date,
sc.timestart AS TimeFrom,
sc.timeduration AS TimeTo,
r.resourceid AS ResourceId,
sp.employeeid AS EmployeeId,
NULL AS MovementTypeId,
r.configheight AS Capacity,
CASE
WHEN st.schedulestatus IN (1, 3) THEN '1'
ELSE '0'
END CanBook,
CASE
WHEN sv.nonmembermayenroll = TRUE THEN '1'
ELSE '0'
END NonMemberFlag,
COALESCE(ProgramPrice.pricemember,
ServicePrice.pricemember,
0) AS MemberAmount,
COALESCE(ProgramPrice.pricenonmember,
ServicePrice.pricenonmember,
0) AS NonMemberAmount,
'N/A' AS Attendance
FROM
AloomaTest.SCSESSIONS s
LEFT JOIN
AloomaTest.SCSESSION_PROVIDERS sp
ON
sp.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSESSION_RESOURCES sr
ON
sr.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSCHEDULES sc
ON
sc.scheduleid = s.scheduleid
LEFT JOIN
AloomaTest._SCSCHEDULESTATUS ST
ON
ST.schedulestatus = sc.schedulestatus
LEFT JOIN
AloomaTest.SCRESOURCES r
ON
r.resourceid = sr.resourceid
LEFT JOIN
AloomaTest.SCSERVICES sv
ON
sv.serviceid = sc.serviceid
LEFT JOIN
AloomaTest.SCPROGREG_SEMCOURSES semc
ON
semc.serviceid = sc.serviceid
AND semc.semesterid = sc.semesterid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ProgramPrice
ON
ProgramPrice.scheduleid = sc.scheduleid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ServicePrice
ON
ServicePrice.semcourseid = semc.semcourseid
WHERE
COALESCE(ProgramPrice.feetypeid,
0) = 0
AND COALESCE(ServicePrice.feetypeid,
0)= 0
and sc.scheduleid in(31207,
25936,
5761094,
832794,
9825,
17912)
'''
    #createDataset(datasetname) #Successfully tested this code 2018-09-24
    global_dataset_ref = getDataset(datasetname) #Successfully tested this code 2018-09-24
    #createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
    getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
    runBigQueryQuery(sqlquery, filename, idNeeded) #Successfully tested this code 2018-09-24
    loadDataFromCSV(tablename, global_dataset_ref, filename) #Successfully tested this code 2018-09-24
Sample data:
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A
The error message indicates that there is only one row in your CSV; you might be missing newlines while writing it.
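The 400 message also points to an "error stream" with the row-level detail. A hedged sketch of surfacing it via the job's errors property in google-cloud-bigquery (the dataset, table, and file names are the ones used in the question; the explicit schema is dropped here and autodetect is relied on, which is a simplification):
from google.cloud import bigquery
from google.cloud.bigquery import LoadJobConfig
from google.api_core.exceptions import BadRequest

bigquery_client = bigquery.Client()
table_ref = bigquery_client.dataset('Data_Layer').table('Events')

load_config = LoadJobConfig()
load_config.source_format = bigquery.SourceFormat.CSV
load_config.autodetect = True
load_config.allow_quoted_newlines = True

with open('events.csv', 'rb') as readable:
    job = bigquery_client.load_table_from_file(
        readable, table_ref, location='US', job_config=load_config)

try:
    job.result()
except BadRequest:
    # job.errors carries the per-row messages from the error stream the
    # 400 response refers to; they usually name the offending line and column.
    for err in job.errors or []:
        print(err)
    raise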

Select parquet based on partition date

I have some heavy logs on my cluster, and I've written all of them to Parquet with the following partition scheme:
PARTITION_YEAR=2017/PARTITION_MONTH=07/PARTITION_DAY=12
For example, if I want to select all my logs between 2017/07/12 and 2017/08/10, is there a way to do it efficiently? Or do I have to loop over all the days and read the partitions one by one?
Thanks,
You can use glob patterns when loading files in PySpark:
input_path = "PARTITION_YEAR=2017/PARTITION_MONTH=0{7/PARTITION_DAY={1[2-9],[2-3]*},8/PARTITION_DAY={0[1-9],10}}"
df = spark.read.parquet(input_path)
You can also generate a comma-separated list of paths:
input_path = ",".join(["PARTITION_YEAR=2017/PARTITION_MONTH=07/PARTITION_DAY=%02d" % x for x in range(12, 32)]) \
             + "," + ",".join(["PARTITION_YEAR=2017/PARTITION_MONTH=08/PARTITION_DAY=%02d" % x for x in range(1, 11)])
or using dates:
import datetime as dt
d1 = dt.date(2017,7,12)
d2 = dt.date(2017,8,10)
date_list = [d1 + dt.timedelta(days=x) for x in range(0, (d2 - d1).days + 1)]
input_path = ",".join(["PARTITION_YEAR=2017/PARTITION_MONTH=%02d/PARTITION_DAY=%02d" % (d.month, d.day) for d in date_list])

Python script is locked when accessing SQLite database in loop

Please look through the code of my parser below. It grabs some statistics from web pages, accessing them in a loop, and puts the specified records into an SQLite3 database.
Everything goes fine until line 87 (the SQL statement), where the process consumes all CPU resources and effectively gets blocked.
File "./parser.py", line 86, in
while (j < i):
The database file is created at the beginning of the code with the correct structure, so the problem is in the loops. The inner block of the main loop for season in season_list: works just fine. Here is the whole code of my script:
#!/usr/bin/env python
from bs4 import BeautifulStoneSoup
from urllib2 import urlopen
import re
import sqlite3
from time import gmtime, strftime

# Print start time
print "We started at ", strftime("%Y-%m-%d %H:%M:%S", gmtime())

# Create DB
print "Trying to create DB"
con = sqlite3.connect('england.db')
cur = con.cursor()
sql = """\
CREATE TABLE english_premier_league (
    id_match INTEGER PRIMARY KEY AUTOINCREMENT,
    season TEXT,
    tour INTEGER,
    date TEXT,
    home TEXT,
    visitor TEXT,
    home_score INTEGER,
    visitor_score INTEGER
);
"""
try:
    cur.executescript(sql)
except sqlite3.DatabaseError as err:
    print "Error creating database: ", err
else:
    print "Successfully created your database..."
    con.commit()
cur.close()
con.close()
# list of variables
postfix = 2011
threshold = 1999
season_list = []
while postfix >= threshold:
    end = (postfix + 1) % 2000
    if end >= 10:
        season = str(postfix) + str(end)
    else:
        season = str(postfix) + str(0) + str(end)
    season_list.append(season)
    postfix -= 1
print season_list
# main loop
for season in season_list:
    href = 'http://www.stat-football.com/en/a/eng.php?b=10&d=' + season + '&c=51'
    print href
    xml = urlopen(href).read()
    xmlSoup = BeautifulStoneSoup(xml)
    tablet = xmlSoup.find(attrs={"class": "bd5"})

    # Access DB
    con = sqlite3.connect('england.db')
    cur = con.cursor()

    # Parse site
    tour = tablet.findAll(attrs={"class": re.compile(r"^(s3|cc s3)$")})
    date = tablet.findAll(text=re.compile(r"(0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[012])\.(19|20)\d\d"))
    home = tablet.findAll(attrs={"class": "nw"})
    guest = tablet.findAll(attrs={"class": "s1"})
    score = tablet.findAll(attrs={"class": "nw pr15"})

    def parse_string(sequence):
        result = []
        for unit in sequence:
            text = ''.join(unit.findAll(text=True))
            result.append(text.strip())
        return result

    tour_list = parse_string(tour)
    home_list = parse_string(home)
    guest_list = parse_string(guest)
    score_list = parse_string(score)

    # Loop over found records to put them into the sqlite3 DB
    i = len(tour_list)
    j = 0
    while (j < i):
        sql_add = 'INSERT INTO english_premier_league (season, tour, date, home, visitor, home_score, visitor_score) VALUES (?, ?, ?, ?, ?, ?, ?)'
        match = (season, int(tour_list[j]), date[j], home_list[j], guest_list[j], int(score_list[j][0:1]), int(score_list[j][2:3]))
        try:
            cur.executemany(sql_add, match)
        except sqlite3.DatabaseError as err:
            print "Error matching the record: ", err
        else:
            con.commit()
        part = float(j) / float(i) * 100
        if (part % 10 == 0):
            print (int(part)), "%"
        j += 1
    cur.close()
    con.close()
Also, it may be useful to look at the end of the strace output:
getcwd("/home/vitaly/football_forecast/epl", 512) = 35
stat("/home/vitaly/football_forecast/epl/england.db", {st_mode=S_IFREG|0644, st_size=24576, ...}) = 0
open("/home/vitaly/football_forecast/epl/england.db", O_RDWR|O_CREAT, 0644) = 3
fcntl(3, F_GETFD) = 0
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
fstat(3, {st_mode=S_IFREG|0644, st_size=24576, ...}) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "SQLite format 3\0\4\0\1\1\0# \0\0\1~\0\0\0\30"..., 100) = 100
I'm running Python 2.7 on Ubuntu 12.04. Thanks a lot.
Replace cur.executemany(sql_add, match) with cur.execute(sql_add, match). executemany() is used for performing the same operation multiple times over an iterable of values. For example, if you had this:
match = [(season1, tour1, date1, home1, visitor1, home_score1, visitor_score1),
         (season2, tour2, date2, home2, visitor2, home_score2, visitor_score2),
         (season3, tour3, date3, home3, visitor3, home_score3, visitor_score3)]
cur.executemany(sql_add, match)
... it would be appropriate, since the cursor could iterate over the tuples in match and perform the insert operation on each of them.
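If you do want to keep executemany(), the alternative (a sketch reusing the variables from the question's main loop: season, tour_list, date, home_list, guest_list, score_list, cur and con) is to build the full list of row tuples for a season first and issue a single call, committing once at the end:
sql_add = ('INSERT INTO english_premier_league '
           '(season, tour, date, home, visitor, home_score, visitor_score) '
           'VALUES (?, ?, ?, ?, ?, ?, ?)')

# Build one tuple per match, then let executemany() iterate over the list.
matches = [(season, int(tour_list[j]), date[j], home_list[j], guest_list[j],
            int(score_list[j][0:1]), int(score_list[j][2:3]))
           for j in range(len(tour_list))]

try:
    cur.executemany(sql_add, matches)
except sqlite3.DatabaseError as err:
    print "Error inserting records: ", err
else:
    con.commit()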
