Why is my Spark streaming app so slow? - apache-spark
I have a cluster with 4 nodes: 3 Spark nodes and 1 Solr node. Each node has an 8-core CPU, 32 GB of memory, and SSD storage. I use Cassandra as my database. After 6 hours my data amounts to 22 GB, around 3.4 million rows, which should be read in under 5 minutes.
Already the job cannot finish in that time, and my future plan is to read 100 million rows in under 5 minutes. I am not sure what I can increase or do better to reach the current target, let alone the future one. Is that even possible, or would it be better to use Spark for the real-time analysis and, for example, Hadoop for longer-tail data (older than 1 day or a couple of hours)?
Thanks a lot!
Here is my Spark app code:
import sys
import json
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime, timedelta
from dateutil.parser import parse
from cassandra.cluster import Cluster
import pytz
from dateutil.tz import tzutc
tz = pytz.timezone('')
appname = str(sys.argv[1])
source = str(sys.argv[2])
cluster = Cluster(['localhost'])
session_statis = cluster.connect('keyspace')
def read_json(x):
    try:
        y = json.loads(x)
    except:
        y = 0
    return y
def TransformInData(x):
    try:
        body = json.loads(x['body'])
        return (body['articles'])
    except:
        return 0
def axesTransformData(x):
    try:
        body = json.loads(x['body'])
        return (body)
    except:
        return 0
def storeDataToCassandra(rdd):
    rdd_cassandra = rdd.map(lambda x:(x[0],(x[0],x[1]['thumbnail'], x[1]['title'], x[1]['url'], datetime.strptime(parse(x[1]['created_at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S"),source, x[1]['category'] if x[1]['category'] else '', x[1]['channel'],x[1]['genre']))) \
        .subtract(articles)
    rdd_article = rdd_cassandra.map(lambda x:Row(id=x[1][0],source=x[1][5],thumbnail=x[1][1],title=x[1][2],url=x[1][3],created_at=x[1][4],category=x[1][6],channel=x[1][7],genre=x[1][8]))
    rdd_schedule = rdd_cassandra.map(lambda x:Row(source=x[1][5],type='article',scheduled_for=x[1][4]+timedelta(minutes=5),id=x[1][0]))
    rdd_article_by_created_at = rdd_cassandra.map(lambda x:Row(source=x[1][5],created_at=x[1][4],article=x[1][0]))
    rdd_article_by_url = rdd_cassandra.map(lambda x:Row(url=x[1][3],article=x[1][0]))
    if rdd_article.count()>0:
        result_rdd_article = sqlContext.createDataFrame(rdd_article)
        result_rdd_article.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if rdd_schedule.count()>0:
        result_rdd_schedule = sqlContext.createDataFrame(rdd_schedule)
        result_rdd_schedule.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if rdd_article_by_created_at.count()>0:
        result_rdd_article_by_created_at = sqlContext.createDataFrame(rdd_article_by_created_at)
        result_rdd_article_by_created_at.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if rdd_article_by_url.count()>0:
        result_rdd_article_by_url = sqlContext.createDataFrame(rdd_article_by_url)
        result_rdd_article_by_url.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
#
def axesStoreToCassandra(rdd):
    axes_rdd = rdd.map(lambda x:Row(article=x[1]['id'],at=datetime.now(),comments=x[1]['comments'],likes=x[1]['attitudes'],reads=0,shares=x[1]['reposts']))
    if axes_rdd.count()>0:
        result_axes_rdd = sqlContext.createDataFrame(axes_rdd)
        result_axes_rdd.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
def joinstream(rdd):
    article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'genre':x[1][0]['genre'],'category':x[1][1]['category'],'author':x[1][1]['author']}))
    speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
        .reduceByKey(lambda x,y:x+y) \
        .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[0],reverse=True)[0],sorted(x[1],key=lambda y:y[0],reverse=True)[1]) if len(x[1])>=2 else (x[0],sorted(x[1],key=lambda y:y[0],reverse=True)[0],[sorted(x[1],key=lambda y:y[0],reverse=True)[0][0]-timedelta(seconds=300),0,0,0,0])) \
        .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
        .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':int(5*288*((x[1][4]-x[2][4])/((x[1][0]-x[2][0]).seconds/60.0)))})) \
        .filter(lambda x:x[1]['speed']>=0) \
        .filter(lambda x:x[1]['shares']>0)
    statistics = article_channels.join(speed_rdd) \
        .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'genre':x[1][0]['genre'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})
    timeone = datetime.now()-timedelta(hours=1)
    timethree = datetime.now()-timedelta(hours=3)
    timesix = datetime.now()-timedelta(hours=6)
    timetwelve = datetime.now()-timedelta(hours=12)
    timetwentyfour = datetime.now()-timedelta(hours=24)
    result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree and x['created_at']+timedelta(hours=8)<=timeone).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix and x['created_at']+timedelta(hours=8)<=timethree).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve and x['created_at']+timedelta(hours=8)<=timesix).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour and x['created_at']+timedelta(hours=8)<=timetwelve).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre=x['genre'],reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    if result1.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'1'))
        resultschema1 = sqlContext.createDataFrame(result1)
        resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result3.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'3'))
        resultschema3 = sqlContext.createDataFrame(result3)
        resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result6.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'6'))
        resultschema6 = sqlContext.createDataFrame(result6)
        resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result12.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'12'))
        resultschema12 = sqlContext.createDataFrame(result12)
        resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
    if result24.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'24'))
        resultschema24 = sqlContext.createDataFrame(result24)
        resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace="keyspace").save(mode="append")
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc,30)
sqlContext = SQLContext(sc)
channels = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,{'author':x.name,'category':x.category}))
articles = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.id,(x.id,x.thumbnail,x.title,x.url,x.created_at+timedelta(hours=8),source,x.category,x.channel,x.genre)))
articlestat = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.channel,{'id':x.id,'thumbnail':x.thumbnail,'title':x.title,'url':x.url,'created_at':x.created_at,'source':x.source,'category':x.category,'channel':x.channel,'genre':x.genre}))
axes = sc.cassandraTable('keyspace','tablename')
topic = 'topic1'
kafkaParams = {"metadata.broker.list": "localhost:9092"}
article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
article_join_stream.transform(storeDataToCassandra).pprint()
axes_topic = 'topic2'
axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
axes_join_stream = axes_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(str(x['id']),x))
axes_join_stream.transform(axesStoreToCassandra).pprint()
statistics = article_join_stream.map(lambda x:(x[0])).window(15*60,15*60)
statistics.transform(joinstream).pprint()
ssc.start()
ssc.awaitTermination()
EDIT:
This is the stage that seems to consume the most time. Any thoughts on that?
At first glance it seems that you just start your application with "spark-submit <your application>".
This means you are using the default allocation of memory and CPUs for your application (which is about 1 CPU and 512 MB of RAM in most default cases).
This assumes you are using YARN, since you don't provide info on that.
Start your application with appropriate resources and you'll see improvements.
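As a minimal sketch of what "appropriate resources" could look like: the executor counts and sizes below are illustrative assumptions, not recommendations tuned to your cluster. The same settings are normally passed to spark-submit as --num-executors, --executor-cores and --executor-memory; here they are shown on the SparkConf so the snippet stays self-contained.

from pyspark import SparkConf, SparkContext

# Illustrative values only: leave headroom on each node for Cassandra and the OS.
conf = (SparkConf()
        .setAppName(appname)                     # appname as already defined in your script
        .set("spark.executor.instances", "6")    # assumption: roughly 2 executors per Spark node
        .set("spark.executor.cores", "3")        # cores per executor
        .set("spark.executor.memory", "8g"))     # heap per executor, well under 32 GB per node
sc = SparkContext(conf=conf)

Whether these exact numbers make sense depends on how much of each 8-core / 32 GB node you want to keep free for Cassandra; the point is only that the defaults are far too small for this workload.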
Edit:
I see you are using a lot of lambdas; those need to be serialized.
Be aware that when a lambda references an attribute of an object, the whole object is shipped with the closure every time.
I.e. you are effectively sending the full object behind this.value and not just value.
To fix this, you can copy it into a local variable, _value = this.value, and use that instead.
This might give you a speedup.
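A minimal sketch of that pattern in PySpark (the class and attribute names here are hypothetical, purely to illustrate the difference):

class Job(object):
    def __init__(self, threshold):
        self.threshold = threshold

    def slow_filter(self, rdd):
        # Referencing self inside the lambda forces the whole Job object to be
        # pickled into the closure for every task.
        return rdd.filter(lambda x: x > self.threshold)

    def fast_filter(self, rdd):
        # Copy the attribute into a local variable first; only this small value
        # is captured and serialized with the closure.
        threshold = self.threshold
        return rdd.filter(lambda x: x > threshold)

How much this helps here depends on what your lambdas actually capture; small module-level values such as source are cheap to ship, while larger objects are not.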