Google Cloud Spanner Merge SQL Equivalent process in Python using Google API's - google-cloud-spanner
How to perform a Merge SQL as stated below in Google Cloud Spanner using Google API's?
MERGE INTO TABLE2 B
USING (SELECT COL1, COL2, SUM(TOTAL_CNT)
FROM TABLE1 GROUP BY COL1, COL2) A
ON (B.COL1=A.COL1 AND B.COL2 = A.COL2)
WHEN MATCHED THEN
UPDATE SET B.TOTAL_CNT = B.TOTAL_CNT + A.TOTAL_CNT
WHEN NOT MATCHED THEN
INSERT (COL1, COL2, TOTAL_CNT)
VALUES (A.COL1, A.COL2, A.TOTAL_CNT)
I would say that you can use similar SQL clauses such as union and intersect to achieve your goal, this post elaborates on the goal. I think your approximation in your response using joins is also good.
Whenever you have to perform a merge SQL, it needs to be broken down into 2 steps.
The first step is to do a left join with the target table to get the values you want, and with that result set perform a batch insert_or_update. This saves a lot of look-ups and is more efficient. I've made the batch insert_or_update multithreaded so that you can trigger more threads and the process will finish quicker. If you don't need to be that fancy, you can keep it as in-line code.
'''
import threading
import pandas as pd
import datetime
import time
from merge_ins_upd_using_df import merge_ins_upd_using_df
from google.cloud import spanner
# ---------------------------------------------------------------------------
# Driver script: emulates a SQL MERGE into TABLE2.
#
# 1. Aggregate TABLE1 and LEFT JOIN the result to TABLE2, adding any existing
#    TABLE2.TOTAL_CNT (COALESCEd to 0 for unmatched rows).  The result set is
#    therefore the final desired state of every affected (COL1, COL2) row.
# 2. Slice the result into small DataFrames and hand each slice to a worker
#    thread that performs a Spanner batch insert_or_update (the "upsert" that
#    replaces MERGE's matched/not-matched branches).
# ---------------------------------------------------------------------------

# Instantiate a client and resolve the instance/database handles once.
spanner_client = spanner.Client()

# Your Cloud Spanner instance ID.
instance_id = 'spanner-instance'

# Get a Cloud Spanner instance by ID.
instance = spanner_client.instance(instance_id)

# Your Cloud Spanner database ID.
database_id = 'database-id'

# Upper bound on concurrently running threads (enforced below by polling
# threading.active_count() before each start()).
max_thread_cnt = 30

thread_list = []        # created-but-not-started worker threads
thread_count = 0        # number of workers created
thread_cnt_before = 0   # active_count() snapshot taken before kick-off
thread_counter = 0      # per-worker sequence number passed to the worker

# MERGE source query.
# FIX: the original statement began with a stray '(' and had no outer
# GROUP BY, which makes SUM() over non-aggregated A.COL1/A.COL2 invalid SQL.
sql_stmt = """SELECT A.COL1, A.COL2,
                     SUM(A.TOTAL_CNT + COALESCE(B.TOTAL_CNT, 0)) AS TOTAL_CNT
                FROM (SELECT COL1, COL2, SUM(TOTAL_CNT) AS TOTAL_CNT
                        FROM TABLE1
                       GROUP BY COL1, COL2) A
                LEFT JOIN TABLE2 B ON (A.COL1 = B.COL1 AND A.COL2 = B.COL2)
               GROUP BY A.COL1, A.COL2"""

database = instance.database(database_id)

# Snapshot read: we only need a consistent view of the source data.
with database.snapshot() as snapshot:
    results = snapshot.execute_sql(sql_stmt)
    df = pd.DataFrame(results)
    df.columns = ['COL1', 'COL2', 'TOTAL_CNT']

# Rows per worker; size this so rows * (columns + indexes touched) stays
# under Spanner's 20,000-mutations-per-commit limit.
process_cnt = 10
rec_cnt = df.shape[0]
print('Total Rec Count: ' + str(rec_cnt))

total_rec_processed = 0
from_index = 0
to_index = 0
dest_table = 'TABLE2'

### Build the threads — one per slice of process_cnt rows.
# (Unlike the original `while True` loop, this creates no worker at all
# when the result set is empty.)
while total_rec_processed < rec_cnt:
    from_index = to_index
    to_index = min(to_index + process_cnt, rec_cnt)
    thread_counter += 1
    df1 = df[from_index:to_index]
    thread_count += 1
    t = threading.Thread(target=merge_ins_upd_using_df,
                         args=(instance_id, database_id, df1,
                               thread_counter, dest_table))
    thread_list.append(t)
    total_rec_processed += process_cnt

begin = datetime.datetime.now()
print("Thread Kick-off has Started : " + str(begin))
print("Thread Count before :" + str(threading.active_count()))
thread_cnt_before = threading.active_count()

# Start workers, throttling so no more than max_thread_cnt run at once.
for thread in thread_list:
    while threading.active_count() >= max_thread_cnt:
        time.sleep(.05)
    thread.start()

print("Thread Count after :" + str(threading.active_count()))
print("All Threads have been kicked off : " + str(datetime.datetime.now()))

# Wait until the active thread count drains back to the pre-kick-off level,
# i.e. every worker has finished.
if thread_count > 0:
    while threading.active_count() > thread_cnt_before:
        time.sleep(2)

end = datetime.datetime.now()
diff = end - begin
print("Total time for completion in minutes : " + str(diff.total_seconds() / 60))
####### function - merge_ins_upd_using_df
class merge_ins_upd_using_df:
    """Worker that upserts one DataFrame slice into a Spanner table.

    The class is used directly as a ``threading.Thread`` target, so simply
    constructing it performs the whole unit of work: open the database,
    commit one batch of INSERT_OR_UPDATE mutations, and record the elapsed
    time.
    """

    def __init__(self, cs_instance, cs_database, df, thread_counter, dest_table):
        # Keep the call arguments around for any later inspection/logging.
        self.cs_instance = cs_instance
        self.cs_database = cs_database
        self.thread_counter = thread_counter
        self.df = df
        self.dest_table = dest_table

        # Local imports so the worker is self-contained when run in a thread.
        from google.cloud import spanner
        import datetime

        start_ts = datetime.datetime.now()

        client = spanner.Client()
        db = client.instance(cs_instance).database(cs_database)

        # One batch == one atomic commit of INSERT_OR_UPDATE mutations
        # (insert when the key is new, update when it already exists).
        with db.batch() as batch:
            batch.insert_or_update(
                table=dest_table,
                columns=df.columns,
                values=df.values.tolist())

        elapsed = datetime.datetime.now() - start_ts
        ### add logic to handle exceptions
Related
How to dynamically create kafka producers
first I am doing baby steps in python and kafka, So let's say I have a listA=[item1, item2, item3] and every item of listA is a producer on a topic. Now what I want is to dynamically add/remove items to listA and became immediately producers also every item should run on it's own thread as they should be independent. So basically I am trying to scale the application. so far I tried to hard code every producer item and run it in its own terminal each Item from pykafka import KafkaClient import json from datetime import datetime import uuid import time input_file = open('./data/item1.json') json_array = json.load(input_file) coordinates = json_array['features'][0]['geometry']['coordinates'] # Generate uuid def generate_uuid(): return uuid.uuid4() # Kafaka producer client = KafkaClient(hosts="localhost:9092") topic = client.topics['test_kafka2'] producer = topic.get_sync_producer() # Generate all coordinates def generate_coordinates(coordinates): # new_coordinates = [] i = 0 while i < len(coordinates): data = {} data['class'] = 201 data['key'] = str(data['class']) + '_' + str(generate_uuid()) data['time_stamp'] = str(datetime.utcnow()) data['longitude'] = coordinates[i][0] data['latitude'] = coordinates[i][1] message = json.dumps(data) producer.produce(message.encode('ascii')) time.sleep(1) # If item reaches last coordinaates if i == len(coordinates)-1: coordinates = coordinates[::-1] i = 0 else: i += 1 # return new_coordinates generate_coordinates(coordinates)
caching pyspark dataframes leads to no performance gain
I am trying to make my scripts more efficient. At the moment, I have 10 scripts - they all read data in, process it and output it. They all read from the same main DB tables though and just do different things with the data. So I have consolidated to one script, with the idea that I only read data once, rather than 10 times. Should that not result in a faster execution? Because it doesn't. Below is an example of the structure I am using Any help would be amazing Thanks ''' TABLE DEFINITION AND CACHING ''' spark_session = create_session('usage CF') usage_logs = spark_session.sql("Select * from db.table where dt = " + yday_date ).cache() user_logs = spark_session.sql("Select * from db2.table2 where dt = " + yday_date ).cache() usercat = spark_session.sql("Select * from db3.table3 where dt = " + yday_date ).cache() radius_logs = spark_session.sql("Select * from db.table4 where dt = " + yday_date ) radius = radius_logs.select('emsisdn2', 'sessionid2', 'custavp1').cache() ''' usage CF ''' usage = usage_logs.select('field1', 'field2', 'field3') conditions = [usage.sid == radius.sessionid2] df3 = usage.join(radius, conditions, how='left') df4 = df3.groupBy('field1', 'field2').agg(sqlfunc.sum('field3').alias('bytesdl')) usage = df4.createOrReplaceTempView('usage') usage_table_output = spark_session.sql(' insert overwrite table outputdb.outputtbl partition(dt = ' + yday_date + ') select "usage" as type, * from usage ') ''' user CF ''' user = usage_logs.filter((usage_logs.vslsessid == '0')).select('field1', 'field2', 'field3', 'field4') conditionsx = [user.sessionid == radius.sessionid2] user_joined = user.join(radius, conditionsx, how='left') user_output = user_joined.groupBy('field1', 'field2', 'field3').agg(sqlfunc.sum('field4').alias('bytesdl')) user = user_output.createOrReplaceTempView('user') user_table_output = spark_session.sql(' insert overwrite table outputdb.outputtbl2 partition(dt = ' + yday_date + ') select "user" as type, * from user')
Bigquery CSV file load fail
google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details. I am trying to run Python script that loads the data into csv but getting this error. can anyone explain me this error import csv #Imports the Google Cloud BigQuery client library from google.cloud import bigquery from google.cloud.bigquery import Dataset from google.cloud.bigquery import Table from google.cloud.bigquery import LoadJobConfig from google.cloud.bigquery import SchemaField filename = 'events.csv' idNeeded=0 #Instantiates a client bigquery_client = bigquery.Client() #Runs a query from BigQuery def runBigQueryQuery( query, filename, idNeeded ): if idNeeded == 1: i = 1 query_job = bigquery_client.query(query) results = query_job.result() with open (filename, 'w', newline='') as f: #Create CSV file write = csv.writer(f,dialect='excel',lineterminator='\n') try: for row in results: print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(row.EventId, row.ScheduleId, row.Date, row.TimeFrom, row.Description, row.TimeTo, row.ResourceId, row.EmployeeId, row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag, row.MemberAmount, row.NonMemberAmount, row.Attendance)) write.writerow([i,row.EventId, row.ScheduleId, row.Date, row.TimeFrom, row.Description, row.TimeTo, row.ResourceId, row.EmployeeId, row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag, row.MemberAmount, row.NonMemberAmount, row.Attendance]) #write Rows to CSV i = i+1 except AttributeError as error: print('An error occured: {0}'.format(error)) else: query_job = bigquery_client.query(query) results = query_job.result() with open (filename, 'w', newline='') as f: #Create CSV file write = csv.writer(f,dialect='excel',lineterminator='\n') try: for row in results: print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format( row.EventId, row.ScheduleId, row.Date, row.TimeFrom, 
row.Description, row.TimeTo, row.ResourceId, row.EmployeeId, row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag, row.MemberAmount, row.NonMemberAmount, row.Attendance)) write.writerow([row.EventId, row.ScheduleId, row.Date, row.TimeFrom, row.Description, row.TimeTo, row.ResourceId, row.EmployeeId, row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag, row.MemberAmount, row.NonMemberAmount, row.Attendance]) #write Rows to CSV except AttributeError as error: print('An error occured: {0}'.format(error)) return #Creates a dataset in BigQuery def createDataset(datasetname): dataset_ref = bigquery_client.dataset(datasetname) dataset = Dataset(dataset_ref) dataset.location = 'US' dataset = bigquery_client.create_dataset(dataset) return def getDataset(datasetname): dataset = bigquery_client.dataset(datasetname) return dataset def createTable(tablename, global_dataset_ref): schema = [ #Enter Schema here. # SchemaField('url', 'STRING', mode='required'), # SchemaField('views', 'INTEGER', mode='required') ] table_ref = global_dataset_ref.table(tablename) table = Table(table_ref, schema=schema) table = bigquery_client.create_table(table) assert table.table_id == tablename return def getTable(tablename, global_dataset_ref): table_ref = global_dataset_ref.table(tablename) table = bigquery_client.get_table(table_ref) # print(table.table_id) print(table.schema) # print(table.description) # print(table.num_rows) return table def getTableSchema(tablename, global_dataset_ref): table_ref = global_dataset_ref.table(tablename) table = bigquery_client.get_table(table_ref) schema = table.schema return schema def loadDataFromCSV(tablename, global_dataset_ref, filename): schema = getTableSchema(tablename, global_dataset_ref) table_ref = global_dataset_ref.table(tablename) load_config = LoadJobConfig() load_config.source_format = bigquery.SourceFormat.CSV load_config.schema = schema load_config.autodetect = True load_config.allow_quoted_newlines = True with open 
(filename, 'rb') as readable: job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config) job.result() print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id)) return # Testing if __name__ == '__main__': datasetname = 'Data_Layer' tablename = 'Events' sqlquery = '''SELECT null as EventId, sc.scheduleid AS ScheduleId, NULL AS Description, sc.scheduledatefrom AS Date, sc.timestart AS TimeFrom, sc.timeduration AS TimeTo, r.resourceid AS ResourceId, sp.employeeid AS EmployeeId, NULL AS MovementTypeId, r.configheight AS Capacity, CASE WHEN st.schedulestatus IN (1, 3) THEN '1' ELSE '0' END CanBook, CASE WHEN sv.nonmembermayenroll = TRUE THEN '1' ELSE '0' END NonMemberFlag, COALESCE(ProgramPrice.pricemember, ServicePrice.pricemember, 0) AS MemberAmount, COALESCE(ProgramPrice.pricenonmember, ServicePrice.pricenonmember, 0) AS NonMemberAmount, 'N/A' AS Attendance FROM AloomaTest.SCSESSIONS s LEFT JOIN AloomaTest.SCSESSION_PROVIDERS sp ON sp.sessionid = s.sessionid LEFT JOIN AloomaTest.SCSESSION_RESOURCES sr ON sr.sessionid = s.sessionid LEFT JOIN AloomaTest.SCSCHEDULES sc ON sc.scheduleid = s.scheduleid LEFT JOIN AloomaTest._SCSCHEDULESTATUS ST ON ST.schedulestatus = sc.schedulestatus LEFT JOIN AloomaTest.SCRESOURCES r ON r.resourceid = sr.resourceid LEFT JOIN AloomaTest.SCSERVICES sv ON sv.serviceid = sc.serviceid LEFT JOIN AloomaTest.SCPROGREG_SEMCOURSES semc ON semc.serviceid = sc.serviceid AND semc.semesterid = sc.semesterid LEFT JOIN AloomaTest.SCPROGREG_PRICES ProgramPrice ON ProgramPrice.scheduleid = sc.scheduleid LEFT JOIN AloomaTest.SCPROGREG_PRICES ServicePrice ON ServicePrice.semcourseid = semc.semcourseid WHERE COALESCE(ProgramPrice.feetypeid, 0) = 0 AND COALESCE(ServicePrice.feetypeid, 0)= 0 and sc.scheduleid in(31207, 25936, 5761094, 832794, 9825, 17912) ''' #createDataset(datasetname) #Successfully tested this code 2018-09-24 global_dataset_ref = getDataset(datasetname) 
#Successfully tested this code 2018-09-24 #createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24 getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24 runBigQueryQuery(sqlquery,filename,idNeeded) #Successfully tested this code 2018-09-24 loadDataFromCSV(tablename, global_dataset_ref,filename) #Successfully tested this code 2018-09-24 sample data ,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A ,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A ,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A ,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A ,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A ,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A ,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A ,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A
The error message indicates that there is only 1 row in your CSV, you might be missing new lines while making it.
Select parquet based on partition date
I've some heavy logs on my cluster, I've parqueted all of them with the following partition schema: PARTITION_YEAR=2017/PARTITION_MONTH=07/PARTITION_DAY=12 For example, if I want to select all my log between 2017/07/12 and 2017/08/10 is there a way to do it effectively ? Or Do I have to loop over all days to read the partitions one by one ? Thanks,
You can use some regular expressions when loading files in pyspark : input_path = "PARTITION_YEAR=2017/PARTITION_MONTH=0{7/PARTITION_DAY={1[2-9],[2-3]*},8/PARTITION_DAY={0[1-9],10}}" df = spark.read.parquet(input_path) You can also generate a list of comma separated paths: input_path = ",".join(["PARTITION_YEAR=2017/PARTITION_MONTH=07/PARTITION_DAY=" + str(x) for x in range(12, 32)]) \ + ",".join(["PARTITION_YEAR=2017/PARTITION_MONTH=08/PARTITION_DAY=" + str(x) for x in range(1, 11)]) or using dates: import datetime as dt d1 = dt.date(2017,7,12) d2 = dt.date(2017,8,10) date_list = [d1 + dt.timedelta(days=x) for x in range(0, (d2 - d1).days + 1)] input_path = ",".join(["PARTITION_YEAR=2017/PARTITION_MONTH=%02d/PARTITION_DAY=%02d" % (d.month, d.day) for d in date_list])
Python script is locked when accessing SQLite database in loop
please watch through the code of my parser. It grabs some statistics from web pages accessing them in a loop and puts specified records in SQLite3 database. Everything is going right until the line 87 (the SQL statement), where the process consumes all CPU resources and in fact get blocked. File "./parser.py", line 86, in while (j < i): Database file in the beginning of the code is created with correct structure, so the problem is in loops. Inner block of main loop for season in season_list: works just fine. Here is the whole code of my script: #!/usr/bin/env python from bs4 import BeautifulStoneSoup from urllib2 import urlopen import re import sqlite3 from time import gmtime, strftime # Print start time print "We started at ", strftime("%Y-%m-%d %H:%M:%S", gmtime()) # Create DB print "Trying to create DB" con = sqlite3.connect('england.db') cur = con.cursor() sql = """\ CREATE TABLE english_premier_league ( id_match INTEGER PRIMARY KEY AUTOINCREMENT, season TEXT, tour INTEGER, date TEXT, home TEXT, visitor TEXT, home_score INTEGER, visitor_score INTEGER ); """ try: cur.executescript(sql) except sqlite3.DatabaseError as err: print "Error creating database: ", err else: print "Succesfully created your database..." 
con.commit() cur.close() con.close() # list of variables postfix = 2011 threshold = 1999 season_list = [] while postfix >= threshold: end = (postfix + 1) % 2000 if (end >= 10): season = str(postfix) + str(end) else: season = str(postfix) + str(0) + str(end) season_list.append(season) postfix -= 1 print season_list # main loop for season in season_list: href = 'http://www.stat-football.com/en/a/eng.php?b=10&d='+season+'&c=51' print href xml = urlopen(href).read() xmlSoup = BeautifulStoneSoup(xml) tablet = xmlSoup.find(attrs={"class" : "bd5"}) #Access DB con = sqlite3.connect('england.db') cur = con.cursor() #Parse site tour = tablet.findAll(attrs = { "class" : re.compile(r"^(s3|cc s3)$") }) date = tablet.findAll(text = re.compile(r"(0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[012])\.(19|20)\d\d")) home = tablet.findAll(attrs = {"class" : "nw"}) guest = tablet.findAll(attrs = {"class" : "s1"}) score = tablet.findAll(attrs = {"class" : "nw pr15"}) # def parse_string(sequence): result=[] for unit in sequence: text = ''.join(unit.findAll(text=True)) result.append(text.strip()) return result tour_list=parse_string(tour) home_list=parse_string(home) guest_list=parse_string(guest) score_list=parse_string(score) #Loop over found records to put them into sqlite3 DB i = len(tour_list) j = 0 while (j < i): sql_add = 'INSERT INTO english_premier_league (season, tour, date, home, visitor, home_score, visitor_score) VALUES (?, ?, ?, ?, ?, ?, ?)' match = (season, int(tour_list[j]), date[j], home_list[j], guest_list[j], int(score_list[j][0:1]), int(score_list[j][2:3])) try: cur.executemany(sql_add, match) except sqlite3.DatabaseError as err: print "Error matching the record: ", err else: con.commit() part = float(j)/float(i)*100 if (part%10 == 0): print (int(part)), "%" j += 1 cur.close() con.close() Also it may be useful to look at the end of strace output: getcwd("/home/vitaly/football_forecast/epl", 512) = 35 stat("/home/vitaly/football_forecast/epl/england.db", {st_mode=S_IFREG|0644, 
st_size=24576, ...}) = 0 open("/home/vitaly/football_forecast/epl/england.db", O_RDWR|O_CREAT, 0644) = 3 fcntl(3, F_GETFD) = 0 fcntl(3, F_SETFD, FD_CLOEXEC) = 0 fstat(3, {st_mode=S_IFREG|0644, st_size=24576, ...}) = 0 lseek(3, 0, SEEK_SET) = 0 read(3, "SQLite format 3\0\4\0\1\1\0# \0\0\1~\0\0\0\30"..., 100) = 100 I'm running Python 2.7 on Ubuntu 12.04. Thanks a lot.
Replace cur.executemany(sql_add, match) with cur.execute(sql_add, match). executemany() is used for performing the same operation multiple times over an iterable of values. For example, if you had this: match = [ (season1, tour1, date1, home1, visitor1, home_score1, visitor_score1), (season2, tour2, date2, home2, visitor2, home_score2, visitor_score2), (season3, tour3, date3, home3, visitor3, home_score3, visitor_score3) ] cur.executemany(sql_add, match) ... it would be appropriate, since the cursor could iterate over the tuples in match and perform the insert operation on each of them.