pyspark modify class attributes using spark.sql.rdd.foreach() - apache-spark

The main task is to connect to Hive and read data with a Spark RDD.
I have tried the code below. Connecting and reading both succeed, but modifying the value of self.jobUserProfile fails. I print this value in three positions (marked #1, #2 and #3). In the first position the value is valid, but in the second and third positions the dict is empty. It seems the modification is never assigned to the class attribute.
I have also tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterating over the result, but when the data volume gets large the performance may suffer.
When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target value is empty in all three positions.
I'm a newbie in Spark. Any advice would be helpful. Thanks in advance.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def __init__(self):
        self.jobUserProfile = collections.defaultdict(dict)
    def readLoginFunction(self, e):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        self.jobUserProfile[e[0]] = dic
        print(self.jobUserProfile)  #1
    def readLogin(self, spark):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.foreach(lambda x: self.readLoginFunction(x))
        print(self.jobUserProfile)  #2
if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    operateHive = OperateHive()
    operateHive.readLogin(spark)
    print(operateHive.jobUserProfile)  #3

Finally the code below works.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile, devAppProfile
    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.top(1)[0][0]
if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
But when I remove devAppProfile, the code looks like below:
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile
    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    operateHive.readLogin(spark, jobUserProfile, devAppProfile)
This version does nothing: rdd.map() is lazy, and since no action is ever called on the mapped RDD, the print(jobUserProfile) inside readLoginFunction never runs.
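A quick way to see that laziness, as a minimal sketch (assuming spark here is a SparkSession; the demo is independent of the Hive code above):
demo = spark.sparkContext.parallelize([1, 2, 3])
mapped = demo.map(lambda x: print(x) or x)  # nothing is printed yet: map only records the transformation
mapped.count()  # the action triggers the tasks; the prints show up in the executor/worker output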
Then I changed the code as below, and it works again.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile
    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.collect()[-1]
if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
The problem in this post is about closures. But I can't work out why the three versions in the answer behave differently.
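A minimal sketch of what is going on, using a local SparkSession and an in-memory DataFrame instead of the Hive table (the connectHive setup is not reproduced here): functions passed to foreach/map are pickled and run inside executor tasks against a copy of the closure, so mutating self.jobUserProfile there never touches the driver's object. The versions that "work" do so only because top()/collect() ship the dictionaries built inside the tasks back to the driver as return values, not because the driver-side dict was modified.
import collections
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[2]").appName("closure-demo").getOrCreate()
rows = spark.createDataFrame([(1, '2021-01-01'), (2, '2021-01-02')], ['userid', 'logtime'])
profile = collections.defaultdict(dict)
def build(e):
    # Runs in an executor task against a pickled copy of `profile`, not the driver's dict.
    profile[e[0]] = {'userid': e[0], 'logtime': e[1]}
rows.rdd.foreach(build)
print(profile)  # still empty on the driver
# Driver-side alternative: bring the (already limited) rows back and build the dict locally.
for e in rows.collect():
    profile[e[0]] = {'userid': e[0], 'logtime': e[1]}
print(profile)  # populated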

Related

Use a dictionary instead of a list of dictionaries to reduce program complexity

I'm trying to validate consistency between DynamoDB tables. I used a list of dictionaries to store the DynamoDB table items, which takes a long time to execute.
I'm new to Python; any help converting the list of dictionaries into a dictionary, to reduce the program's complexity, would be appreciated.
#!/usr/bin/python
import sys
import boto3
import argparse
def table_consistency_check(table, column_name):
    paginator = dynamoClient.get_paginator('scan')
    modified_accounts = []
    params = {
        'TableName': table
    }
    page_iterator = paginator.paginate(**params)
    for page in page_iterator:
        for item in page['Items']:
            account = item['account_name']['S']
            license_key = item[column_name]['S']
            credentials = {
                'account_name': account,
                column_name: license_key
            }
            modified_accounts.append(credentials)
    return modified_accounts
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Find all accounts with different license key and account key hash')
    parser.add_argument('-r', '--region', nargs='?', type=str, default='us-west-2')
    try:
        args = parser.parse_args()
    except:
        exit_code = int(str(sys.exc_info()[1]))
    accounts_table = 'accounts_table'
    Credentail_table = 'credential_table'
    dynamoClient = boto3.client('dynamodb', region_name=args.region)
    account1 = table_consistency_check(accounts_table, 'license_key')
    account2 = table_consistency_check(Credentail_table, 'access_key_hash')
    output = []
    for acct_item in account1:
        for creds_item in account2:
            if acct_item['account_name'] == creds_item['account_name']:
                if creds_item['access_key_hash'].startswith('ORIGINAL_KEY_'):
                    val = creds_item['access_key_hash']
                    length = len('ORIGINAL_KEY_')
                    str = val[length:]
                    if acct_item['license_key'] != str:
                        output.append(creds_item['account_name'])
                        print('Duplicate record found')
                        print('Account Name : ' + acct_item['account_name'] + ', License Key : ' + acct_item['license_key'] + ', Access Key Hash : ' + creds_item['access_key_hash'])
    if not output:
        print('the tables are consistent, No duplicate item found')
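A minimal sketch of the conversion being asked about, reusing account1 and account2 from the script above and assuming account_name is unique within each table: key each record by its account name once, then look accounts up directly instead of the nested loop.
def to_dict_by_account(items, column_name):
    # items is the list of dicts built by table_consistency_check(); key it by account name for O(1) lookups
    return {item['account_name']: item[column_name] for item in items}
licenses = to_dict_by_account(account1, 'license_key')
hashes = to_dict_by_account(account2, 'access_key_hash')
prefix = 'ORIGINAL_KEY_'
for account_name, access_key_hash in hashes.items():
    if account_name in licenses and access_key_hash.startswith(prefix):
        if licenses[account_name] != access_key_hash[len(prefix):]:
            print('Duplicate record found for ' + account_name)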

Bigquery CSV file load fail

google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run a Python script that loads the data via CSV, but I am getting this error. Can anyone explain this error to me?
import csv
# Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded = 0
# Instantiates a client
bigquery_client = bigquery.Client()
# Runs a query in BigQuery and writes the result rows to a CSV file
def runBigQueryQuery(query, filename, idNeeded):
    if idNeeded == 1:
        i = 1
        query_job = bigquery_client.query(query)
        results = query_job.result()
        with open(filename, 'w', newline='') as f:  # Create CSV file
            write = csv.writer(f, dialect='excel', lineterminator='\n')
            try:
                for row in results:
                    print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(
                        row.EventId, row.ScheduleId, row.Date, row.TimeFrom, row.Description,
                        row.TimeTo, row.ResourceId, row.EmployeeId, row.MovementTypeId,
                        row.Capacity, row.CanBook, row.NonMemberFlag, row.MemberAmount,
                        row.NonMemberAmount, row.Attendance))
                    write.writerow([i, row.EventId, row.ScheduleId, row.Date, row.TimeFrom,
                                    row.Description, row.TimeTo, row.ResourceId, row.EmployeeId,
                                    row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag,
                                    row.MemberAmount, row.NonMemberAmount, row.Attendance])  # write rows to CSV
                    i = i + 1
            except AttributeError as error:
                print('An error occurred: {0}'.format(error))
    else:
        query_job = bigquery_client.query(query)
        results = query_job.result()
        with open(filename, 'w', newline='') as f:  # Create CSV file
            write = csv.writer(f, dialect='excel', lineterminator='\n')
            try:
                for row in results:
                    print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(
                        row.EventId, row.ScheduleId, row.Date, row.TimeFrom, row.Description,
                        row.TimeTo, row.ResourceId, row.EmployeeId, row.MovementTypeId,
                        row.Capacity, row.CanBook, row.NonMemberFlag, row.MemberAmount,
                        row.NonMemberAmount, row.Attendance))
                    write.writerow([row.EventId, row.ScheduleId, row.Date, row.TimeFrom,
                                    row.Description, row.TimeTo, row.ResourceId, row.EmployeeId,
                                    row.MovementTypeId, row.Capacity, row.CanBook, row.NonMemberFlag,
                                    row.MemberAmount, row.NonMemberAmount, row.Attendance])  # write rows to CSV
            except AttributeError as error:
                print('An error occurred: {0}'.format(error))
    return
# Creates a dataset in BigQuery
def createDataset(datasetname):
    dataset_ref = bigquery_client.dataset(datasetname)
    dataset = Dataset(dataset_ref)
    dataset.location = 'US'
    dataset = bigquery_client.create_dataset(dataset)
    return
def getDataset(datasetname):
    dataset = bigquery_client.dataset(datasetname)
    return dataset
def createTable(tablename, global_dataset_ref):
    schema = [
        # Enter schema here, e.g.:
        # SchemaField('url', 'STRING', mode='required'),
        # SchemaField('views', 'INTEGER', mode='required')
    ]
    table_ref = global_dataset_ref.table(tablename)
    table = Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table)
    assert table.table_id == tablename
    return
def getTable(tablename, global_dataset_ref):
    table_ref = global_dataset_ref.table(tablename)
    table = bigquery_client.get_table(table_ref)
    # print(table.table_id)
    print(table.schema)
    # print(table.description)
    # print(table.num_rows)
    return table
def getTableSchema(tablename, global_dataset_ref):
    table_ref = global_dataset_ref.table(tablename)
    table = bigquery_client.get_table(table_ref)
    schema = table.schema
    return schema
def loadDataFromCSV(tablename, global_dataset_ref, filename):
    schema = getTableSchema(tablename, global_dataset_ref)
    table_ref = global_dataset_ref.table(tablename)
    load_config = LoadJobConfig()
    load_config.source_format = bigquery.SourceFormat.CSV
    load_config.schema = schema
    load_config.autodetect = True
    load_config.allow_quoted_newlines = True
    with open(filename, 'rb') as readable:
        job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config)
    job.result()
    print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id))
    return
# Testing
if __name__ == '__main__':
    datasetname = 'Data_Layer'
    tablename = 'Events'
    sqlquery = '''SELECT
null as EventId,
sc.scheduleid AS ScheduleId,
NULL AS Description,
sc.scheduledatefrom AS Date,
sc.timestart AS TimeFrom,
sc.timeduration AS TimeTo,
r.resourceid AS ResourceId,
sp.employeeid AS EmployeeId,
NULL AS MovementTypeId,
r.configheight AS Capacity,
CASE
WHEN st.schedulestatus IN (1, 3) THEN '1'
ELSE '0'
END CanBook,
CASE
WHEN sv.nonmembermayenroll = TRUE THEN '1'
ELSE '0'
END NonMemberFlag,
COALESCE(ProgramPrice.pricemember,
ServicePrice.pricemember,
0) AS MemberAmount,
COALESCE(ProgramPrice.pricenonmember,
ServicePrice.pricenonmember,
0) AS NonMemberAmount,
'N/A' AS Attendance
FROM
AloomaTest.SCSESSIONS s
LEFT JOIN
AloomaTest.SCSESSION_PROVIDERS sp
ON
sp.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSESSION_RESOURCES sr
ON
sr.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSCHEDULES sc
ON
sc.scheduleid = s.scheduleid
LEFT JOIN
AloomaTest._SCSCHEDULESTATUS ST
ON
ST.schedulestatus = sc.schedulestatus
LEFT JOIN
AloomaTest.SCRESOURCES r
ON
r.resourceid = sr.resourceid
LEFT JOIN
AloomaTest.SCSERVICES sv
ON
sv.serviceid = sc.serviceid
LEFT JOIN
AloomaTest.SCPROGREG_SEMCOURSES semc
ON
semc.serviceid = sc.serviceid
AND semc.semesterid = sc.semesterid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ProgramPrice
ON
ProgramPrice.scheduleid = sc.scheduleid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ServicePrice
ON
ServicePrice.semcourseid = semc.semcourseid
WHERE
COALESCE(ProgramPrice.feetypeid,
0) = 0
AND COALESCE(ServicePrice.feetypeid,
0)= 0
and sc.scheduleid in(31207,
25936,
5761094,
832794,
9825,
17912)
'''
    #createDataset(datasetname)  # Successfully tested this code 2018-09-24
    global_dataset_ref = getDataset(datasetname)  # Successfully tested this code 2018-09-24
    #createTable(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24
    getTable(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24
    runBigQueryQuery(sqlquery, filename, idNeeded)  # Successfully tested this code 2018-09-24
    loadDataFromCSV(tablename, global_dataset_ref, filename)  # Successfully tested this code 2018-09-24
Sample data:
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A
The error message indicates that there is only one row in your CSV; you are probably missing newlines while writing it.
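To see exactly which rows BigQuery is complaining about, a minimal sketch assuming the job handle from loadDataFromCSV() is kept (for example returned instead of discarded): the BadRequest exception only carries the summary line, while job.errors holds the per-row "error stream" the message refers to.
from google.api_core.exceptions import BadRequest
try:
    job.result()  # the load job created in loadDataFromCSV()
except BadRequest:
    for err in (job.errors or []):  # the "error stream": one dict per problem found in the CSV
        print(err.get('reason'), err.get('location'), err.get('message'))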

How to call outside function in filter for RDD data

I know that for a DataFrame we could use a UDF, but I am currently handling RDD data. Other than using a top-level function, how can I define a method that filter can use across all the methods in the class?
def date_filer_help(date1, date2):
    date1_arr = date1.split("-")
    date2_arr = date2.split("-")
    for i in range(len(date1_arr)):
        if int(date1_arr[i]) < int(date2_arr[i]):
            return True
        elif int(date1_arr[i]) > int(date2_arr[i]):
            return False
    return True
def date_filter(prev_date, date, end_date):
    return date_filer_help(prev_date, date) and date_filer_help(date, end_date)
rdd = sc.textFile(action_file).map(lambda x: x.split(','))\
    .filter(lambda x: date_filter("0000-00-00", x[0], "2016-06-30"))
I want to write date_filter as a static method in this class or another class; otherwise I need to redefine the two helpers above in each method again and again. But it does not work when I run on a cluster. How should I do it?
Do you mean this:
import datetime
from pyspark import SparkContext
class DataFilter(object):
    def __init__(self):
        self.sc = SparkContext()
    @staticmethod
    def date_filer_help(date1, date2):
        return date1 <= date2
    @staticmethod
    def date_filter(prev_date, date, end_date):
        return DataFilter.date_filer_help(prev_date, date) and DataFilter.date_filer_help(date, end_date)
    def run(self):
        # ten consecutive dates starting 2016-06-25, paired with their index
        rdd = self.sc.parallelize(
            list(map(lambda x: ((datetime.date(2016, 6, 25) + datetime.timedelta(x)).strftime('%Y-%m-%d'), x), range(10))))
        # static methods carry no reference to self (and hence no SparkContext) into the closure
        result = rdd.filter(lambda x: DataFilter.date_filter("0000-00-00", x[0], "2016-06-30"))
        return result.collect()
if __name__ == '__main__':
    print(DataFilter().run())

How to search for a sub string within a string using Pyspark

The attached image contains a sample of the data.
For example, if a sentence contains "John" and "drives", it means John has a car and drives to work. I'm attaching the code I'm using to do this. However, the code doesn't work correctly and is too complicated. I would appreciate your help.
%pyspark
import re
import nltk
from pyspark.sql import Row
rdd = sc.textFile("./sample.txt")
col = rdd.map(lambda line: line.split('\t'))
# remove header
header = col.first()  # extract header
col = col.filter(lambda line: line != header)
def convertToRow(line):
    return Row(Name=line[0], Text=line[1])
# call the function on each row, then convert to dataframe
df = col.map(convertToRow).toDF()
from pyspark.sql.functions import udf
def splitParagraphIntoSentences(paragraph):
    sentences = nltk.tokenize.sent_tokenize(paragraph)
    return sentences
def tokenize(text):
    text = text.lower().replace('\n', '')
    text = re.sub(',', '', text)
    tokens = text.split()
    if len(tokens) > 1:
        tokens = splitParagraphIntoSentences(text)
    return tokens
tokenize = udf(lambda text: tokenize(text))
data = df.select('Name', tokenize(df.Text).alias("Text"))
def how(name, paragraph):
    drive = ['drives']
    walks = ['walks']
    comingwith = ['coming with']
    for s in paragraph:
        s = s.split()
        if ((any(s[i:i+len(drive)] == drive for i in xrange(len(s)-len(drive)+1))) and (any(s[i:i+len(name)] == name for i in xrange(len(s)-len(name)+1)))):
            return "Drives"
        elif ((any(s[i:i+len(walks)] == walks for i in xrange(len(s)-len(walks)+1))) and (any(s[i:i+len(name)] == name for i in xrange(len(s)-len(name)+1)))):
            return "Walks"
        elif ((any(s[i:i+len(comingwith)] == comingwith for i in xrange(len(s)-len(comingwith)+1))) and (any(s[i:i+len(name)] == name for i in xrange(len(s)-len(name)+1)))):
            return "Coming with"
def checkYesNo(name, paragraph):
    drive = ['drives']
    walks = ['walks']
    comingwith = ['coming with']
    for s in paragraph:
        s = s.split()
        if ((any(s[i:i+len(comingwith)] == comingwith for i in xrange(len(s)-len(comingwith)+1))) or (any(s[i:i+len(walks)] == walks for i in xrange(len(s)-len(walks)+1)))):
            return "No"
        else:
            return "Yes"
how = udf(lambda name, paragraph: how(name, paragraph))
checkYesNo = udf(lambda name, paragraph: checkYesNo(name, paragraph))
final_df = data.select('Name', checkYesNo(data.Name, data.Text), how(data.Name, data.Text))
I'd do it like this:
import os
import socket
class SparkUtil(object):
    @staticmethod
    def get_spark_context(host, venv, framework_name, parts):
        os.environ['PYSPARK_PYTHON'] = "{0}/bin/python".format(venv)
        from pyspark import SparkConf, SparkContext
        ip = socket.gethostbyname(socket.gethostname())
        sparkConf = (SparkConf()
                     .setMaster(host)
                     .setAppName(framework_name))
        return SparkContext(conf=sparkConf)
input_txt = [
    ["John", "John usually drives to work. He usually gets up early and drinks coffee. Mary usually joining him."],
    ["Sam", "As opposed to John, Sam doesn't like to drive. Sam usually walks there."],
    ["Mary", "Mary doesn't have driving license. Mary usually coming with John which picks her up from home."]
]
def has_car(text):
    return "drives" in text
def get_method(text):
    method = None
    for m in ["drives", "walks", "coming with"]:
        if m in text:
            method = m
            break
    return method
def process_row(row):
    return [row[0], has_car(row[1]), get_method(row[1])]
sc = SparkUtil.get_spark_context(host="local[2]",
                                 venv="../starshome/venv",
                                 framework_name="app",
                                 parts=2)
print(sc.parallelize(input_txt).map(process_row).collect())
You can probably ignore the SparkUtil class. I'm not using a notebook; this is just a straight-up Spark app.
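For the DataFrame route the question mentions, a minimal sketch using built-in column functions instead of UDFs, against the question's df with columns Name and Text (the keyword list is an assumption):
from pyspark.sql import functions as F
keywords = ["drives", "walks", "coming with"]
method_col = F.coalesce(*[F.when(F.col("Text").contains(k), F.lit(k)) for k in keywords])
result = df.select(
    "Name",
    F.col("Text").contains("drives").alias("has_car"),  # simple substring test
    method_col.alias("method"))  # first keyword found in the text, else null
result.show(truncate=False)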

Why my spark streaming app does not show any output

This is a follow-up to my earlier Stack Overflow question, for which I did not get a response.
I have tried writing the code below, which does not throw any error but also does not show any output.
My goal is to evaluate the DStream objects against a historical data RDD. I could not find any PySpark example like this (checking a streaming RDD against a static RDD created beforehand). I'd appreciate your response. Thanks.
"""
Created on Thu May 05 16:23:15 2016
#author: bghosh
"""
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext,functions as func,Row
sc = SparkContext("local[2]", "realtimeApp")
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc,10)
files = ssc.textFileStream("hdfs://RealTimeInputFolder/")
########Lets get the data from the db which is relavant for streaming ###
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
dataurl = "jdbc:sqlserver://devserver:1433"
db = "devDB"
table = "stream_helper"
credential = "dev_credential"
########basic data for evaluation purpose ########
#base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data.registerTempTable("base_data")
######
files_count = files.flatMap(lambda file: file.split( ))
#pattern = '(TranAmount=Decimal.{2})(.[0-9]*.[0-9]*)(\\S+ )(TranDescription=u.)([a-zA-z\\s]+)([\\S\\s]+ )(dSc=u.)([A-Z]{2}.[0-9]+)'
tranfiles = "wasb://vanspark01#vanspark01.blob.core.windows.net/RealTimeInputFolder01/"
def getSqlContextInstance(sparkContext):
if ('sqlContextSingletonInstance' not in globals()):
globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
return globals()['sqlContextSingletonInstance']
def preparse(logline):
#match = re.search(pattern,logline)
pre = logline.split(",")
return(
Row(
Customer_id = pre[-1],
trantype = pre[-4],
amount = float(pre[-5]))
)
def parse():
parsed_tran = ssc.textFileStream(tranfiles).map(preparse)
#success = parsed_tran.filter(lambda s: s[1] == 1).map(lambda x:x[0])
#fail = parsed_tran.filter(lambda s:s[1] == 0).map(lambda x:x[0])
"""if fail.count() > 0:
print "no of non parsed file : %d",fail.count()
"""
return parsed_tran#success
def check_historic(rdd):
#checking with the historical table #
try:
streamSqlcontext = getSqlContextInstance(rdd)
stream_df = streamSqlcontext.createDataFrame(rdd)
stream_df.registerTempTable("stream_df")
result_data_frame = streamSqlcontext.sql("select * from stream_df LEFT OUTER JOIN base_data on stream_df.Customer_id= base_data.Customer_id" )
result_data_frame.show()
except:
pass
#return result_data_frame.rdd
success = parse()
success.foreachRDD(check_historic)
ssc.start()
ssc.awaitTermination()
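One likely reason nothing is ever shown is the bare except: pass in check_historic, which silently swallows every failure in the join. A minimal sketch of a more talkative handler, reusing the sqlContext that registered base_data above:
import traceback
def check_historic(rdd):
    # foreachRDD handlers run on the driver once per micro-batch
    if rdd.isEmpty():
        print("empty batch - nothing was read from the input folder")
        return
    try:
        stream_df = sqlContext.createDataFrame(rdd)
        stream_df.registerTempTable("stream_df")
        sqlContext.sql("select * from stream_df LEFT OUTER JOIN base_data "
                       "on stream_df.Customer_id = base_data.Customer_id").show()
    except Exception:
        traceback.print_exc()  # surface the real error instead of hiding it behind pass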
