Getting "TypeError: cannot pickle '_thread.RLock' object" when using Broadcast Variable in Spark Cluster - apache-spark

The Spark cluster has one master and four slaves. I have a small table in the database, so I read the table on the master node and set it as a broadcast variable like this:
def checkTableExists(tablename, connection):
    try:
        query = 'SELECT * FROM {0}'.format(str(tablename))
        print(query)
        connection.cursor.execute(query)
        return True
    except odb.DatabaseError as e:
        x = e.args[0]
        if x.code == 942:  ## Only catch ORA-00942: table or view does not exist error
            return False
        else:
            raise e

def read_cheque_paper(spark, connection):
    if create_tables.checkTableExists(small_table, connection) == True:
        query_cheque = '''select * from {}'''.format(small_table)
        df = connection.read_sql(query_cheque)
        broadcastStates = spark.sparkContext.broadcast(df)
    else:
        df = spark.createDataFrame([], StructType([]))
        broadcastStates = spark.sparkContext.broadcast(df)
    return broadcastStates

def trx_features(dataframe, broadcast_df):
    df_small = broadcast_df.value
    if df_small.empty:
        df = dataframe.withColumn('field_check', lit(None).cast(StringType()))
    else:
        df = calculate_field_check(dataframe, df_small)
    df.select('field_check').where('field_check==1').show()
    return df

def distribute_data(spark):
    connection = cx_Oracle.connect(
        config.username,
        config.password,
        config.dsn,
        encoding=config.encoding)
    if gf.flag_trx:
        source_table_name, table_name = gf.create_query('trx')
        broadcast_df = read_cheque_paper(spark, connection)
        predicates = list_predicates(connection, source_table_name, date_field)
        dataframe = gf.read_spark(spark, table_name, predicates, connection)
        df_trx = trx_features(dataframe, broadcast_df)
        write_spark(dist_table_name_trx, df_trx)
    return

if __name__ == '__main__':
    master = "spark://IP master:7077"
    conf = SparkConf() \
        .setAppName('ETL') \
        .setMaster(master) \
        .set("spark.jars", "ojdbc8.jar")
    sc = SparkContext.getOrCreate(conf=conf)
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession
    spark.sparkContext.setLogLevel('ERROR')
    start_time = datetime.datetime.now().timestamp()
    distribute_data(spark)
    end_time = datetime.datetime.now().timestamp()
    duration = end_time - start_time
When this program runs in local mode, I get the result without any errors. But when I run it on the cluster, it stops at df_small = broadcast_df.value with this error:
TypeError: cannot pickle '_thread.RLock' object
Would you please guide me on what is wrong with how the broadcast variable is defined?
Any help is really appreciated.
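One hedged guess at the cause, sketched below: spark.sparkContext.broadcast() pickles its argument, and a Spark DataFrame cannot be pickled because it wraps JVM/py4j handles (which hold an RLock). The empty-table branch broadcasts exactly such an object via spark.createDataFrame([], StructType([])), and depending on what connection.read_sql returns, the other branch may too. Broadcasting plain Python data, such as a pandas DataFrame, avoids the error. A minimal sketch, assuming read_sql returns a pandas DataFrame (which the later .empty check suggests):

import pandas as pd

def read_cheque_paper(spark, connection):
    # Broadcast only picklable, driver-side data: a pandas DataFrame in both branches.
    if create_tables.checkTableExists(small_table, connection):
        df = connection.read_sql('select * from {}'.format(small_table))
    else:
        df = pd.DataFrame()  # empty pandas frame instead of an empty Spark DataFrame
    return spark.sparkContext.broadcast(df)

trx_features would then work unchanged, since broadcast_df.value is a pandas DataFrame in both branches and the .empty check stays valid.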

Related

pyspark modify class attributes using spark.sql.rdd.foreach()

The main task is to connect to Hive and read data using a Spark RDD.
I have tried the code below. Connection and reading are both successful, but when I want to modify the value of self.jobUserProfile, I fail. Then I print this value in three positions (marked #1, #2 and #3). In the first position the value is valid, but in the second and third positions the dict is empty. It seems that the modification has not been assigned to the class attribute.
I have tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterating over this result, but when the data volume is too large the performance may decline.
When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target values in all three positions are empty.
I'm a newbie in Spark. Any advice would be helpful. Thanks in advance.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def __init__(self):
        self.jobUserProfile = collections.defaultdict(dict)

    def readLoginFunction(self, e):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        self.jobUserProfile[e[0]] = dic
        print(self.jobUserProfile)  #1

    def readLogin(self, spark):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.foreach(lambda x: self.readLoginFunction(x))
        print(self.jobUserProfile)  #2

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    operateHive = OperateHive()
    operateHive.readLogin(spark)
    print(operateHive.jobUserProfile)  #3
Finally the code below works.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile, devAppProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.top(1)[0][0]

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
But when I remove devAppProfile, the code becomes the following:
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    operateHive.readLogin(spark, jobUserProfile, devAppProfile)
Here the rdd.map() never executes, so nothing is printed by print(jobUserProfile).
Then I changed the code as below, and it works again.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
    def readLoginFunction(self, e, jobUserProfile, devAppProfile):
        dic = collections.defaultdict()
        dic['userid'] = e[0]
        dic['logtime'] = e[1]
        jobUserProfile[e[0]] = dic
        devAppProfile[e[0]] = dic
        print(jobUserProfile)
        return jobUserProfile

    def readLogin(self, spark, jobUserProfile, devAppProfile):
        response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
        rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
        return rdd1.collect()[-1]

if __name__ == '__main__':
    spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
    jobUserProfile = collections.defaultdict(dict)
    devAppProfile = collections.defaultdict(dict)
    operateHive = OperateHive()
    jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
    print(jobUserProfile)
The problem in this post is about closures, but I can't work out why the three versions behave differently.
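A hedged explanation, with a sketch below (the table name comes from the question; everything else is illustrative): RDD transformations such as map() are lazy and run on executors against serialized copies of whatever the closure captures, so mutating self.jobUserProfile or a captured defaultdict inside foreach()/map() never updates the driver's object. Only an action (top(), collect(), ...) forces execution and ships results back to the driver, which is why the versions ending in top() or collect() print data while the action-less version prints nothing. If the goal is simply to build the dict on the driver, collecting the small (limit 10) result and looping locally avoids the closure issue entirely:

rows = spark.sql(
    'select userid, logtime from hive.dwd_log_login_i_d limit 10'
).collect()  # action: the rows are returned to the driver

jobUserProfile = {
    r['userid']: {'userid': r['userid'], 'logtime': r['logtime']}
    for r in rows  # plain driver-side Python, no executor closure involved
}
print(jobUserProfile)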

How to efficiently write pandas melt and join in order to run inside containers without causing SystemOOM exceptions?

I have code running inside a Docker container on a Kubernetes cluster. The issue is that, regardless of the memory limit set, the code sometimes just fails, i.e. the Airflow task running the code fails.
I tried checking memory and CPU utilization. Both are under their limits and the pod never restarts. Note: my pod runs only one container, which is Airflow's worker. It should be noted that CPU utilization tends towards 1 and then the task fails.
Since the dataset is transformed correctly, I am not posting the sample data. I am looking for efficiency in terms of resources.
My code is:
# Below code is required at the start of each script
from athena.api import get_pandas_df
import pandas as pd
last_correct_constant = 11
def remove_unwanted_cols(df, col_name):
    unwanted_cols = []
    for _col in df.columns:
        if _col.startswith("unnamed"):
            if int(_col.split("unnamed__")[-1]) > last_correct_constant:
                unwanted_cols.append(_col)
        else:
            if not _col.startswith(col_name):
                unwanted_cols.append(_col)
    df = df.drop(columns=unwanted_cols)
    return df

def sanitize_column_names(df):
    corrected_columns = []
    for column in df.columns:
        corrected_columns.append(
            column
            .replace("(", "_")
            .replace(")", "_")
            .replace(" ", "_")
            .replace("/", "_")
            .replace(".", "_")
            .replace(":", "_")
            .lower())
    df.columns = corrected_columns
    return df

def get_first_row_as_header(df):
    df.columns = df.iloc[0]
    print("Columns are: ")
    print(df.columns)
    print("Head is: ")
    df = df.iloc[1:]
    print(df.head(1))
    return df

def remove_cols_for_join(df, col_name):
    unwanted_cols = []
    for _col in df.columns:
        if _col != 'period' and (not _col.startswith(col_name)) and _col != 'Markets':
            unwanted_cols.append(_col)
    print("Unwanted cols are: ")
    print(unwanted_cols)
    df = df.drop(columns=unwanted_cols)
    return df

def main(*args, **kwargs):
    """ Put your main logic here.
    Help:
        To get pandas dataframe of upstream nodes.
            data_frame = get_pandas_df("<upstream_node_name>")
            Example: data_frame = get_pandas_df("S3")
    Return:
        output_data_frame {pandas.DataFrame}
            This data frame will be transferred to downstream nodes.
    """
    # read dataframes
    df = get_pandas_df("CSV")
    df = sanitize_column_names(df)
    df_sales = df
    df_gr_vol = df
    df_gr_val = df

    print("remove unwanted cols for individual melts")
    df = remove_unwanted_cols(df, 'value_offtake_000_rs__')
    df_sales = remove_unwanted_cols(df_sales, 'sales_volume__volume_litres__')
    df_gr_vol = remove_unwanted_cols(df_gr_vol, 'gr_vol_ya')
    df_gr_val = remove_unwanted_cols(df_gr_val, 'gr_val_ya')

    df = get_first_row_as_header(df)
    df_sales = get_first_row_as_header(df_sales)
    df_gr_vol = get_first_row_as_header(df_gr_vol)
    df_gr_val = get_first_row_as_header(df_gr_val)

    print("melting dataframes")
    table_columns = df.columns
    df = pd.melt(
        df, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='value_offtake_000_rs__')
    df = df[(df["Markets"] != '')]

    table_columns = df_sales.columns
    df_sales = pd.melt(
        df_sales, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='sales_volume__volume_litres__')
    df_sales = df_sales[(df_sales["Markets"] != '')]
    df_sales = remove_cols_for_join(df_sales, 'sales_volume__volume_litres__')

    table_columns = df_gr_vol.columns
    df_gr_vol = pd.melt(
        df_gr_vol, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='gr_vol_ya')
    df_gr_vol = df_gr_vol[(df_gr_vol["Markets"] != '')]
    df_gr_vol = remove_cols_for_join(df_gr_vol, 'gr_vol_ya')

    table_columns = df_gr_val.columns
    df_gr_val = pd.melt(
        df_gr_val, id_vars=table_columns[:last_correct_constant+1],
        value_vars=table_columns[last_correct_constant+1:], var_name='period',
        value_name='gr_val_ya')
    df_gr_val = df_gr_val[(df_gr_val["Markets"] != '')]
    df_gr_val = remove_cols_for_join(df_gr_val, 'gr_val_ya')

    print("Before merge: ")
    for _col in df.columns:
        print(_col)
    print("==================")
    for _col in df_sales.columns:
        print(_col)

    df = pd.merge(df, df_sales, on=['Markets', 'period'])
    df = pd.merge(df, df_gr_val, on=['Markets', 'period'])
    df = pd.merge(df, df_gr_vol, on=['Markets', 'period'])
    df = sanitize_column_names(df)
    return df
I expect this code to run efficiently in terms of memory and CPU. My current limits are 32 GB of memory and 10 CPU cores.
The data contains 14 rows and 637 columns, which I transform in the way described above.
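A hedged sketch of one way to trim the footprint (the helper functions and column prefixes are the ones from the code above; the loop structure is illustrative and untested against the real file): melt one measure at a time, keep only the join keys plus the measure for the later merges, and release each intermediate as soon as it has been merged.

import gc
import pandas as pd

prefixes = [
    'value_offtake_000_rs__',
    'sales_volume__volume_litres__',
    'gr_vol_ya',
    'gr_val_ya',
]

def melt_measure(base_df, prefix):
    part = remove_unwanted_cols(base_df, prefix)   # keep only this measure's columns
    part = get_first_row_as_header(part)
    cols = part.columns
    part = pd.melt(
        part, id_vars=cols[:last_correct_constant + 1],
        value_vars=cols[last_correct_constant + 1:],
        var_name='period', value_name=prefix)
    return part[part['Markets'] != '']

def build_output(raw_df):
    merged = None
    for prefix in prefixes:
        part = melt_measure(raw_df, prefix)
        if merged is None:
            merged = part                            # first measure keeps all id columns
        else:
            merged = merged.merge(
                part[['Markets', 'period', prefix]],  # same effect as remove_cols_for_join
                on=['Markets', 'period'])
        del part
        gc.collect()                                 # release each intermediate promptly
    return sanitize_column_names(merged)

This keeps at most one melted frame plus the running merge alive at a time instead of four melted frames at once; given the tiny input (14 x 637), if the task still dies it is worth checking whether something else in the Airflow worker, rather than this transformation, is consuming the memory.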

Bigquery CSV file load fail

google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run a Python script that writes query results to a CSV file and then loads that file into BigQuery, but I am getting this error. Can anyone explain this error to me?
import csv
#Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded=0
#Instantiates a client
bigquery_client = bigquery.Client()
#Runs a query from BigQuery
def runBigQueryQuery(query, filename, idNeeded):
    if idNeeded == 1:
        i = 1
        query_job = bigquery_client.query(query)
        results = query_job.result()
        with open(filename, 'w', newline='') as f:  # Create CSV file
            write = csv.writer(f, dialect='excel', lineterminator='\n')
            try:
                for row in results:
                    print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(row.EventId,
                          row.ScheduleId,
                          row.Date,
                          row.TimeFrom,
                          row.Description,
                          row.TimeTo,
                          row.ResourceId,
                          row.EmployeeId,
                          row.MovementTypeId,
                          row.Capacity,
                          row.CanBook,
                          row.NonMemberFlag,
                          row.MemberAmount,
                          row.NonMemberAmount,
                          row.Attendance))
                    write.writerow([i, row.EventId,
                                    row.ScheduleId,
                                    row.Date,
                                    row.TimeFrom,
                                    row.Description,
                                    row.TimeTo,
                                    row.ResourceId,
                                    row.EmployeeId,
                                    row.MovementTypeId,
                                    row.Capacity,
                                    row.CanBook,
                                    row.NonMemberFlag,
                                    row.MemberAmount,
                                    row.NonMemberAmount,
                                    row.Attendance])  # write rows to CSV
                    i = i + 1
            except AttributeError as error:
                print('An error occured: {0}'.format(error))
    else:
        query_job = bigquery_client.query(query)
        results = query_job.result()
        with open(filename, 'w', newline='') as f:  # Create CSV file
            write = csv.writer(f, dialect='excel', lineterminator='\n')
            try:
                for row in results:
                    print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(row.EventId,
                          row.ScheduleId,
                          row.Date,
                          row.TimeFrom,
                          row.Description,
                          row.TimeTo,
                          row.ResourceId,
                          row.EmployeeId,
                          row.MovementTypeId,
                          row.Capacity,
                          row.CanBook,
                          row.NonMemberFlag,
                          row.MemberAmount,
                          row.NonMemberAmount,
                          row.Attendance))
                    write.writerow([row.EventId,
                                    row.ScheduleId,
                                    row.Date,
                                    row.TimeFrom,
                                    row.Description,
                                    row.TimeTo,
                                    row.ResourceId,
                                    row.EmployeeId,
                                    row.MovementTypeId,
                                    row.Capacity,
                                    row.CanBook,
                                    row.NonMemberFlag,
                                    row.MemberAmount,
                                    row.NonMemberAmount,
                                    row.Attendance])  # write rows to CSV
            except AttributeError as error:
                print('An error occured: {0}'.format(error))
    return

# Creates a dataset in BigQuery
def createDataset(datasetname):
    dataset_ref = bigquery_client.dataset(datasetname)
    dataset = Dataset(dataset_ref)
    dataset.location = 'US'
    dataset = bigquery_client.create_dataset(dataset)
    return

def getDataset(datasetname):
    dataset = bigquery_client.dataset(datasetname)
    return dataset

def createTable(tablename, global_dataset_ref):
    schema = [
        # Enter Schema here.
        # SchemaField('url', 'STRING', mode='required'),
        # SchemaField('views', 'INTEGER', mode='required')
    ]
    table_ref = global_dataset_ref.table(tablename)
    table = Table(table_ref, schema=schema)
    table = bigquery_client.create_table(table)
    assert table.table_id == tablename
    return

def getTable(tablename, global_dataset_ref):
    table_ref = global_dataset_ref.table(tablename)
    table = bigquery_client.get_table(table_ref)
    # print(table.table_id)
    print(table.schema)
    # print(table.description)
    # print(table.num_rows)
    return table

def getTableSchema(tablename, global_dataset_ref):
    table_ref = global_dataset_ref.table(tablename)
    table = bigquery_client.get_table(table_ref)
    schema = table.schema
    return schema

def loadDataFromCSV(tablename, global_dataset_ref, filename):
    schema = getTableSchema(tablename, global_dataset_ref)
    table_ref = global_dataset_ref.table(tablename)
    load_config = LoadJobConfig()
    load_config.source_format = bigquery.SourceFormat.CSV
    load_config.schema = schema
    load_config.autodetect = True
    load_config.allow_quoted_newlines = True
    with open(filename, 'rb') as readable:
        job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config)
    job.result()
    print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id))
    return
# Testing
if __name__ == '__main__':
    datasetname = 'Data_Layer'
    tablename = 'Events'
    sqlquery = '''SELECT
        null as EventId,
        sc.scheduleid AS ScheduleId,
        NULL AS Description,
        sc.scheduledatefrom AS Date,
        sc.timestart AS TimeFrom,
        sc.timeduration AS TimeTo,
        r.resourceid AS ResourceId,
        sp.employeeid AS EmployeeId,
        NULL AS MovementTypeId,
        r.configheight AS Capacity,
        CASE
            WHEN st.schedulestatus IN (1, 3) THEN '1'
            ELSE '0'
        END CanBook,
        CASE
            WHEN sv.nonmembermayenroll = TRUE THEN '1'
            ELSE '0'
        END NonMemberFlag,
        COALESCE(ProgramPrice.pricemember,
            ServicePrice.pricemember,
            0) AS MemberAmount,
        COALESCE(ProgramPrice.pricenonmember,
            ServicePrice.pricenonmember,
            0) AS NonMemberAmount,
        'N/A' AS Attendance
    FROM
        AloomaTest.SCSESSIONS s
    LEFT JOIN
        AloomaTest.SCSESSION_PROVIDERS sp
    ON
        sp.sessionid = s.sessionid
    LEFT JOIN
        AloomaTest.SCSESSION_RESOURCES sr
    ON
        sr.sessionid = s.sessionid
    LEFT JOIN
        AloomaTest.SCSCHEDULES sc
    ON
        sc.scheduleid = s.scheduleid
    LEFT JOIN
        AloomaTest._SCSCHEDULESTATUS ST
    ON
        ST.schedulestatus = sc.schedulestatus
    LEFT JOIN
        AloomaTest.SCRESOURCES r
    ON
        r.resourceid = sr.resourceid
    LEFT JOIN
        AloomaTest.SCSERVICES sv
    ON
        sv.serviceid = sc.serviceid
    LEFT JOIN
        AloomaTest.SCPROGREG_SEMCOURSES semc
    ON
        semc.serviceid = sc.serviceid
        AND semc.semesterid = sc.semesterid
    LEFT JOIN
        AloomaTest.SCPROGREG_PRICES ProgramPrice
    ON
        ProgramPrice.scheduleid = sc.scheduleid
    LEFT JOIN
        AloomaTest.SCPROGREG_PRICES ServicePrice
    ON
        ServicePrice.semcourseid = semc.semcourseid
    WHERE
        COALESCE(ProgramPrice.feetypeid,
            0) = 0
        AND COALESCE(ServicePrice.feetypeid,
            0) = 0
        and sc.scheduleid in (31207,
            25936,
            5761094,
            832794,
            9825,
            17912)
    '''
    #createDataset(datasetname)  # Successfully tested this code 2018-09-24
    global_dataset_ref = getDataset(datasetname)  # Successfully tested this code 2018-09-24
    #createTable(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24
    getTable(tablename, global_dataset_ref)  # Successfully tested this code 2018-09-24
    runBigQueryQuery(sqlquery, filename, idNeeded)  # Successfully tested this code 2018-09-24
    loadDataFromCSV(tablename, global_dataset_ref, filename)  # Successfully tested this code 2018-09-24
sample data
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A
The error message indicates that there is only 1 row in your CSV; you might be missing newlines while producing it.
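As a quick way to check that theory, a sketch (reusing bigquery_client, table_ref and load_config from the script above): count the physical lines in the file before loading, and print BigQuery's row-level errors instead of only the summary exception.

def debug_load(filename, table_ref, load_config):
    # How many physical lines did the export actually produce?
    with open(filename, newline='') as f:
        print(sum(1 for _ in f), 'lines in', filename)

    with open(filename, 'rb') as readable:
        job = bigquery_client.load_table_from_file(
            readable, table_ref, location='US', job_config=load_config)
    try:
        job.result()
    except Exception:
        for err in (job.errors or []):  # per-row details from the error stream
            print(err)
        raise

It may also be worth setting either load_config.schema or load_config.autodetect, not both, so a schema/row mismatch is easier to rule out.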

Why does my Spark streaming app not show any output?

This is a follow-up to an earlier Stack Overflow question of mine, for which I did not get a response.
I have tried writing the code below; it does not throw any error, but it also does not show any output.
My goal is to evaluate the DStream objects against a historical data RDD. I could not find any PySpark example like this (checking a streaming RDD against a static RDD created beforehand). I'd appreciate your response. Thanks.
"""
Created on Thu May 05 16:23:15 2016
#author: bghosh
"""
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext,functions as func,Row
sc = SparkContext("local[2]", "realtimeApp")
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc,10)
files = ssc.textFileStream("hdfs://RealTimeInputFolder/")
########Lets get the data from the db which is relavant for streaming ###
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
dataurl = "jdbc:sqlserver://devserver:1433"
db = "devDB"
table = "stream_helper"
credential = "dev_credential"
########basic data for evaluation purpose ########
#base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data.registerTempTable("base_data")
######
files_count = files.flatMap(lambda file: file.split( ))
#pattern = '(TranAmount=Decimal.{2})(.[0-9]*.[0-9]*)(\\S+ )(TranDescription=u.)([a-zA-z\\s]+)([\\S\\s]+ )(dSc=u.)([A-Z]{2}.[0-9]+)'
tranfiles = "wasb://vanspark01#vanspark01.blob.core.windows.net/RealTimeInputFolder01/"
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']

def preparse(logline):
    #match = re.search(pattern,logline)
    pre = logline.split(",")
    return (
        Row(
            Customer_id = pre[-1],
            trantype = pre[-4],
            amount = float(pre[-5]))
    )

def parse():
    parsed_tran = ssc.textFileStream(tranfiles).map(preparse)
    #success = parsed_tran.filter(lambda s: s[1] == 1).map(lambda x:x[0])
    #fail = parsed_tran.filter(lambda s:s[1] == 0).map(lambda x:x[0])
    """if fail.count() > 0:
        print "no of non parsed file : %d",fail.count()
    """
    return parsed_tran  #success

def check_historic(rdd):
    #checking with the historical table #
    try:
        streamSqlcontext = getSqlContextInstance(rdd)
        stream_df = streamSqlcontext.createDataFrame(rdd)
        stream_df.registerTempTable("stream_df")
        result_data_frame = streamSqlcontext.sql("select * from stream_df LEFT OUTER JOIN base_data on stream_df.Customer_id= base_data.Customer_id")
        result_data_frame.show()
    except:
        pass
    #return result_data_frame.rdd
success = parse()
success.foreachRDD(check_historic)
ssc.start()
ssc.awaitTermination()
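One hedged observation, sketched below (not a verified fix): check_historic() swallows every exception with a bare except: pass, and it also passes the RDD itself to getSqlContextInstance(), which expects a SparkContext. Any failure there (or on an empty micro-batch) disappears silently, so the job looks as if it produces no output. Skipping empty RDDs and logging the exception makes the real error visible:

import traceback

def check_historic(rdd):
    if rdd.isEmpty():                                   # skip empty micro-batches
        return
    try:
        sql_ctx = getSqlContextInstance(rdd.context)    # pass the SparkContext, not the RDD
        stream_df = sql_ctx.createDataFrame(rdd)
        stream_df.registerTempTable("stream_df")
        sql_ctx.sql("select * from stream_df LEFT OUTER JOIN base_data "
                    "on stream_df.Customer_id = base_data.Customer_id").show()
    except Exception:
        traceback.print_exc()                           # surface the error instead of hiding it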

Why is my Spark streaming app so slow?

I have a cluster with 4 nodes: 3 Spark nodes and 1 Solr node. My CPUs are 8-core, my memory is 32 GB, and the disks are SSDs. I use Cassandra as my database. My data amounts to 22 GB after 6 hours, and I now have around 3.4 million rows, which should be read in under 5 minutes.
But already it can't complete the task in this amount of time. My future plan is to read 100 million rows in under 5 minutes. I am not sure what I can increase or do better to achieve this result now, as well as my future goal. Is that even possible, or would it be better to use Spark for the real-time analysis and, for example, Hadoop for longer-tail data (older than 1 day or a couple of hours)?
Thanks a lot!
Here is my Spark app code:
import sys
import json
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime, timedelta
from dateutil.parser import parse
from cassandra.cluster import Cluster
import pytz
from dateutil.tz import tzutc
tz = pytz.timezone('')
appname = str(sys.argv[1])
source = str(sys.argv[2])
cluster = Cluster(['localhost']);
session_statis = cluster.connect('keyspace')
def read_json(x):
    try:
        y = json.loads(x)
    except:
        y = 0
    return y

def TransformInData(x):
    try:
        body = json.loads(x['body'])
        return (body['articles'])
    except:
        return 0

def axesTransformData(x):
    try:
        body = json.loads(x['body'])
        return (body)
    except:
        return 0

def storeDataToCassandra(rdd):
    rdd_cassandra = rdd.map(lambda x:(x[0],(x[0],x[1]['thumbnail'], x[1]['title'], x[1]['url'], datetime.strptime(parse(x[1]['created_at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S"),source, x[1]['category'] if x[1]['category'] else '', x[1]['channel'],x[1]['genre']))) \
                       .subtract(articles)
    rdd_article = rdd_cassandra.map(lambda x:Row(id=x[1][0],source=x[1][5],thumbnail=x[1][1],title=x[1][2],url=x[1][3],created_at=x[1][4],category=x[1][6],channel=x[1][7],genre=x[1][8]))
    rdd_schedule = rdd_cassandra.map(lambda x:Row(source=x[1][5],type='article',scheduled_for=x[1][4]+timedelta(minutes=5),id=x[1][0]))
    rdd_article_by_created_at = rdd_cassandra.map(lambda x:Row(source=x[1][5],created_at=x[1][4],article=x[1][0]))
    rdd_article_by_url = rdd_cassandra.map(lambda x:Row(url=x[1][3],article=x[1][0]))
    if rdd_article.count()>0:
        result_rdd_article = sqlContext.createDataFrame(rdd_article)
        result_rdd_article.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_schedule.count()>0:
        result_rdd_schedule = sqlContext.createDataFrame(rdd_schedule)
        result_rdd_schedule.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_article_by_created_at.count()>0:
        result_rdd_article_by_created_at = sqlContext.createDataFrame(rdd_article_by_created_at)
        result_rdd_article_by_created_at.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_article_by_url.count()>0:
        result_rdd_article_by_url = sqlContext.createDataFrame(rdd_article_by_url)
        result_rdd_article_by_url.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

def axesStoreToCassandra(rdd):
    axes_rdd = rdd.map(lambda x:Row(article=x[1]['id'],at=datetime.now(),comments=x[1]['comments'],likes=x[1]['attitudes'],reads=0,shares=x[1]['reposts']))
    if axes_rdd.count()>0:
        result_axes_rdd = sqlContext.createDataFrame(axes_rdd)
        result_axes_rdd.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

def joinstream(rdd):
    article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'genre':x[1][0]['genre'],'category':x[1][1]['category'],'author':x[1][1]['author']}))
    speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
        .reduceByKey(lambda x,y:x+y) \
        .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],sorted(x[1],key=lambda y:y[0],reverse = True)[1]) if len(x[1])>=2 else (x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],[sorted(x[1],key=lambda y:y[0],reverse = True)[0][0]-timedelta(seconds=300),0,0,0,0])) \
        .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
        .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':int(5*288*((x[1][4]-x[2][4])/((x[1][0]-x[2][0]).seconds/60.0)))})) \
        .filter(lambda x:x[1]['speed']>=0) \
        .filter(lambda x:x[1]['shares']>0)
    statistics = article_channels.join(speed_rdd) \
        .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'genre':x[1][0]['genre'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})
    timeone = datetime.now()-timedelta(hours=1)
    timethree = datetime.now()-timedelta(hours=3)
    timesix = datetime.now()-timedelta(hours=6)
    timetwelve = datetime.now()-timedelta(hours=12)
    timetwentyfour = datetime.now()-timedelta(hours=24)
    result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree and x['created_at']+timedelta(hours=8)<=timeone).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix and x['created_at']+timedelta(hours=8)<=timethree).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve and x['created_at']+timedelta(hours=8)<=timesix).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour and x['created_at']+timedelta(hours=8)<=timetwelve).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    if result1.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'1'))
        resultschema1 = sqlContext.createDataFrame(result1)
        resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result3.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'3'))
        resultschema3 = sqlContext.createDataFrame(result3)
        resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result6.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'6'))
        resultschema6 = sqlContext.createDataFrame(result6)
        resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result12.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'12'))
        resultschema12 = sqlContext.createDataFrame(result12)
        resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result24.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'24'))
        resultschema24 = sqlContext.createDataFrame(result24)
        resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc,30)
sqlContext = SQLContext(sc)
channels = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,{'author':x.name,'category':x.category}))
articles = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.id,(x.id,x.thumbnail,x.title,x.url,x.created_at+timedelta(hours=8),source,x.category,x.channel,x.genre)))
articlestat = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.channel,{'id':x.id,'thumbnail':x.thumbnail,'title':x.title,'url':x.url,'created_at':x.created_at,'source':x.source,'category':x.category,'channel':x.channel,'genre':x.genre}))
axes = sc.cassandraTable('keyspace','tablename')
topic = 'topic1'
kafkaParams = {"metadata.broker.list": "localhost:9092"}
article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
article_join_stream.transform(storeDataToCassandra).pprint()
axes_topic = 'topic2'
axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
axes_join_stream = axes_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(str(x['id']),x))
axes_join_stream.transform(axesStoreToCassandra).pprint()
statistics = article_join_stream.map(lambda x:(x[0])).window(15*60,15*60)
statistics.transform(joinstream).pprint()
ssc.start()
EDIT:
This is the stage that seems to consume most time. Any thoughts on that?
At first glance it seems that you just start your application with "spark-submit <your application>".
This means you are using the default allocation of memory and CPUs for your application (which is about 1 CPU and 512 MB of RAM in most default cases).
This assumes you are using YARN, since you don't provide info on this.
Start your application with the appropriate resources and you'll see improvements.
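A sketch of what that could look like (the numbers are illustrative, not tuned for this cluster, appname is the variable from the script above, and driver memory generally has to be set at submit time, before the driver JVM starts):

from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName(appname)
        .set("spark.executor.memory", "8g")   # heap per executor
        .set("spark.executor.cores", "4")     # cores per executor
        .set("spark.cores.max", "12"))        # standalone mode: total cores for the app
sc = SparkContext(conf=conf)

# or equivalently at submission time:
# spark-submit --executor-memory 8g --executor-cores 4 \
#              --total-executor-cores 12 --driver-memory 4g app.py <appname> <source>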
Edit:
I see you are using a lot of lambdas, and those need to be serialized.
Be aware that when a lambda references an object attribute, the whole object is shipped with every task, i.e. you are sending the full object for this.value rather than just value.
To fix this, copy it into a local variable (_value = this.value) and use that inside the lambda.
This might provide you with a speedup.
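A minimal sketch of that local-variable pattern (the class and method names here are made up for illustration):

class Job(object):
    def __init__(self, source):
        self.source = source

    def tag_records(self, rdd):
        source = self.source                   # local copy: only this value is serialized
        return rdd.map(lambda x: (source, x))  # instead of lambda x: (self.source, x)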
