pyspark modify class attributes using spark.sql.rdd.foreach() - apache-spark
The main task is to connect to Hive and read data using a Spark RDD.
I have tried the code below. Connecting and reading both succeed, but when I try to modify the value of self.jobUserProfile it fails. I print this value in three positions (marked #1, #2 and #3). In the first position the value is valid, but in the second and third positions the dict is empty. It seems the modification is never assigned to the class attribute.
I have also tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterating over the result, but when the data volume is large the performance may degrade.
When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target value is empty in all three positions.
I'm new to Spark. Any advice would be helpful. Thanks in advance.
from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def __init__(self):
        self.jobUserProfile = collections.defaultdict(dict)

    def readLoginFunction(self, e):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        self.jobUserProfile[e[0]] = dic
        print(self.jobUserProfile)  #1

    def readLogin(self, spark):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.foreach(lambda x: self.readLoginFunction(x))
        print(self.jobUserProfile)  #2

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    operateHive = OperateHive()
    operateHive.readLogin(spark)
    print(operateHive.jobUserProfile)  #3
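(A minimal sketch of a driver-side alternative, not part of the original post, assuming spark is the session returned by connectHive: toLocalIterator() streams rows back to the driver one partition at a time, so the dict is built in the driver process without loading everything at once.)

import collections

jobUserProfile = collections.defaultdict(dict)
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
for row in response.toLocalIterator():   # rows are fetched lazily, partition by partition
    jobUserProfile[row['userid']] = {'userid': row['userid'], 'logtime': row['logtime']}
print(jobUserProfile)                    # populated, because the loop runs on the driver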
Finally the code below works.
from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile, devAppProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.top(1)[0][0]

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
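A leaner variant of the same idea (a sketch only, assuming the same spark session and table as above) maps each row to a (userid, dict) pair and lets collectAsMap() assemble the result on the driver, instead of shipping the whole growing defaultdict back for every row:

def to_pair(e):
    # runs on the executors: one (key, value) pair per row
    return e[0], {'userid': e[0], 'logtime': e[1]}

response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
jobUserProfile = response.rdd.map(to_pair).collectAsMap()   # dict assembled on the driver
print(jobUserProfile)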
But when I remove devAppProfile, the code looks like below:
from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    operateHive.readLogin(spark, jobUserProfile, devAppProfile)
This rdd.map() doesn't work: the print(jobUserProfile) inside readLoginFunction never prints anything.
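(A tiny self-contained sketch of that laziness, with made-up data and assuming an existing SparkSession spark: map() only records the transformation; the function runs once an action is called.)

def tag(x):
    print('running', x)              # side effect inside the task
    return x

rdd = spark.sparkContext.parallelize([1, 2, 3])
mapped = rdd.map(tag)                # lazy: nothing is printed here
mapped.count()                       # the action triggers the tasks; in local mode the
                                     # prints appear now (on a cluster, in the executor logs)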
Then I change the code as below, and it works again.
from analysis.common.db.hive.connectHive import *
import collections

class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.collect()[-1]

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
The problem in this post is about closures, but I can't work out why the three versions in the answer behave differently.
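A minimal, self-contained sketch of that closure behaviour (hypothetical names, local mode): the dict captured by the lambda is pickled and sent to the Python workers, so mutations land on worker-side copies; the only data that ever reaches the driver is whatever an action returns.

import collections
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[2]').appName('closure-demo').getOrCreate()
sc = spark.sparkContext

profile = collections.defaultdict(dict)

def fill(x, acc):
    acc[x] = {'value': x}            # mutates the worker-side copy of acc
    return acc

sc.parallelize(range(3)).foreach(lambda x: fill(x, profile))
print(profile)                       # still empty: foreach sends nothing back

copies = sc.parallelize(range(3)).map(lambda x: fill(x, profile)).collect()
print(profile)                       # still empty on the driver
print(copies[-1])                    # a worker-side copy returned by the action

That is essentially what the top(1) and collect() versions above rely on: the action is what carries a copy of the mutated dict back to the driver.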
Related
Use dictionary instead of list of dictionary to reduce program complexity
Bigquery CSV file load fail
How to call outside function in filter for RDD data
How to search for a sub string within a string using Pyspark
Why my spark streaming app does not show any output