Why my spark streaming app does not show any out put - apache-spark

This is my follow up question from my earlier stack overflow question ,for which I did not get a response
I have tried writing this ,which does not throw up any error but it does not show any out put
My goal is to evaluate the Dstream objects with historical data RDD ,I could not
find any example for pyspark like this ( checking streaming RDD with static RDD
created before hand ) .Appreciate your response . Thanks
"""
Created on Thu May 05 16:23:15 2016
#author: bghosh
"""
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext,functions as func,Row
sc = SparkContext("local[2]", "realtimeApp")
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc,10)
files = ssc.textFileStream("hdfs://RealTimeInputFolder/")
########Lets get the data from the db which is relavant for streaming ###
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
dataurl = "jdbc:sqlserver://devserver:1433"
db = "devDB"
table = "stream_helper"
credential = "dev_credential"
########basic data for evaluation purpose ########
#base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data.registerTempTable("base_data")
######
files_count = files.flatMap(lambda file: file.split( ))
#pattern = '(TranAmount=Decimal.{2})(.[0-9]*.[0-9]*)(\\S+ )(TranDescription=u.)([a-zA-z\\s]+)([\\S\\s]+ )(dSc=u.)([A-Z]{2}.[0-9]+)'
tranfiles = "wasb://vanspark01#vanspark01.blob.core.windows.net/RealTimeInputFolder01/"
def getSqlContextInstance(sparkContext):
if ('sqlContextSingletonInstance' not in globals()):
globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
return globals()['sqlContextSingletonInstance']
def preparse(logline):
#match = re.search(pattern,logline)
pre = logline.split(",")
return(
Row(
Customer_id = pre[-1],
trantype = pre[-4],
amount = float(pre[-5]))
)
def parse():
parsed_tran = ssc.textFileStream(tranfiles).map(preparse)
#success = parsed_tran.filter(lambda s: s[1] == 1).map(lambda x:x[0])
#fail = parsed_tran.filter(lambda s:s[1] == 0).map(lambda x:x[0])
"""if fail.count() > 0:
print "no of non parsed file : %d",fail.count()
"""
return parsed_tran#success
def check_historic(rdd):
#checking with the historical table #
try:
streamSqlcontext = getSqlContextInstance(rdd)
stream_df = streamSqlcontext.createDataFrame(rdd)
stream_df.registerTempTable("stream_df")
result_data_frame = streamSqlcontext.sql("select * from stream_df LEFT OUTER JOIN base_data on stream_df.Customer_id= base_data.Customer_id" )
result_data_frame.show()
except:
pass
#return result_data_frame.rdd
success = parse()
success.foreachRDD(check_historic)
ssc.start()
ssc.awaitTermination()

Related

Getting "TypeError: cannot pickle '_thread.RLock' object" when using Broadcast Variable in Spark Cluster

The Spark Cluster has one Master and four Slaves. I have a small table in the database, so I read the table in Master node and set it as a Broadcast variable like this:
def checkTableExists(tablename, connection):
try:
query = 'SELECT * FROM {0}'.format(str(tablename))
print(query)
connection.cursor.execute(query)
return True
except odb.DatabaseError as e:
x = e.args[0]
if x.code == 942: ## Only catch ORA-00942: table or view does not exist error
return False
else:
raise e
def read_cheque_paper(spark, connection):
if create_tables.checkTableExists(small_table, connection) == True:
query_cheque = '''select * from {}'''.format(small_table)
df = connection.read_sql(query_cheque)
broadcastStates = spark.sparkContext.broadcast(df)
else:
df = spark.createDataFrame([], StructType([]))
broadcastStates = spark.sparkContext.broadcast(df)
return broadcastStates
def trx_features(dataframe, broadcast_df):
df_small = broadcast_df.value
if df_small.empty:
df = df.withColumn('field_check', lit(None).cast(StringType()))
else:
df = calculate_field_check(dataframe, df_small)
df.select('field_check').where('field_check==1').show()
return df
def distribute_data(spark):
connection = cx_Oracle.connect(
config.username,
config.password,
config.dsn,
encoding=config.encoding)
if gf.flag_trx:
source_table_name, table_name = gf.create_query('trx')
broadcast_df = read_cheque_paper(spark, connection)
predicates = list_predicates(connection, source_table_name, date_field)
dataframe = gf.read_spark(spark, table_name, predicates, connection)
df_trx = trx_features(dataframe, broadcast_df)
write_spark(dist_table_name_trx, df_trx)
return
if __name__ == '__main__':
master = "spark://IP master:7077"
conf = SparkConf() \
.setAppName('ETL') \
.setMaster(master) \
.set("spark.jars", "ojdbc8.jar")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession
spark.sparkContext.setLogLevel('ERROR')
start_time = datetime.datetime.now().timestamp()
distribute_data(spark)
end_time = datetime.datetime.now().timestamp()
duration = end_time - start_time
When this program run in local mode,I get the result without any errors. But, while I run that in the cluster,when it reach to the df_small = broadcast_df.value it was stopped with this error:
TypeError: cannot pickle '_thread.RLock' object
Would you please guide me what is wrong with defining the broadcast varible?
Any help is really appreciated .

pyspark modify class attributes using spark.sql.rdd.foreach()

The main task is to connect Hive and read data using spark rdd.
I have tried the code below. Connection and reading are both successful, but when I want to modify the value of self.jobUserProfile, I failed. Then I print this value in three positions(masking in #1,#2 and #3). In the first position, the value is valid, but in the second and third position, the dict is empty. It seems that the modification has not been assigned into the class attribute.
I have tried response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10').collect() and iterate this dataframe, but when the data volume is too large, the performance may decline.
When I change response.rdd.foreach(lambda x: self.readLoginFunction(x)) to response.rdd.map(lambda x: self.readLoginFunction(x)), the target value in three position are all empty.
I'm a newbie in spark. Any advice could be helpful. Thanks in advance.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def __init__(self):
self.jobUserProfile = collections.defaultdict(dict)
def readLoginFunction(self, e):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
self.jobUserProfile[e[0]] = dic
print(self.jobUserProfile) #1
def readLogin(self, spark):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
response.rdd.foreach(lambda x: self.readLoginFunction(x))
print(self.jobUserProfile) #2
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
operateHive = OperateHive()
operateHive.readLogin(spark)
print(operateHive.jobUserProfile) #3
Finally the code below works.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def readLoginFunction(self, e,jobUserProfile, devAppProfile):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
jobUserProfile[e[0]] = dic
devAppProfile[e[0]] = dic
print(jobUserProfile)
return jobUserProfile, devAppProfile
def readLogin(self, spark, jobUserProfile,devAppProfile):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
return rdd1.top(1)[0][0]
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
print(jobUserProfile)
But when I remove devAppProfile, the code show like below:
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def readLoginFunction(self, e,jobUserProfile, devAppProfile):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
jobUserProfile[e[0]] = dic
devAppProfile[e[0]] = dic
print(jobUserProfile)
return jobUserProfile
def readLogin(self, spark, jobUserProfile,devAppProfile):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
operateHive.readLogin(spark, jobUserProfile, devAppProfile)
The rdd.map() won't work as there is no print in print(jobUserProfile).
Then I change the code like below, which works again.
from analysis.common.db.hive.connectHive import *
import collections
class OperateHive():
def readLoginFunction(self, e,jobUserProfile, devAppProfile):
dic = collections.defaultdict()
dic['userid'] = e[0]
dic['logtime'] = e[1]
jobUserProfile[e[0]] = dic
devAppProfile[e[0]] = dic
print(jobUserProfile)
return jobUserProfile
def readLogin(self, spark, jobUserProfile,devAppProfile):
response = spark.sql('select userid, logtime from hive.dwd_log_login_i_d limit 10')
rdd1 = response.rdd.map(lambda x: self.readLoginFunction(x, jobUserProfile, devAppProfile))
return rdd1.collect()[-1]
if __name__ == '__main__':
spark = connectHive(['conf/hdfs-site.xml', 'conf/hive-site.xml'], 'utf-8')
jobUserProfile = collections.defaultdict(dict)
devAppProfile = collections.defaultdict(dict)
operateHive = OperateHive()
jobUserProfile = operateHive.readLogin(spark, jobUserProfile, devAppProfile)
print(jobUserProfile)
The problem on the post is about closure. But I don't work out why the three versions on the answer work differently.

Bigquery CSV file load fail

google.api_core.exceptions.BadRequest: 400 Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
I am trying to run Python script that loads the data into csv but getting this error. can anyone explain me this error
import csv
#Imports the Google Cloud BigQuery client library
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from google.cloud.bigquery import Table
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
filename = 'events.csv'
idNeeded=0
#Instantiates a client
bigquery_client = bigquery.Client()
#Runs a query from BigQuery
def runBigQueryQuery( query, filename, idNeeded ):
if idNeeded == 1:
i = 1
query_job = bigquery_client.query(query)
results = query_job.result()
with open (filename, 'w', newline='') as f: #Create CSV file
write = csv.writer(f,dialect='excel',lineterminator='\n')
try:
for row in results:
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format(row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance))
write.writerow([i,row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance]) #write Rows to CSV
i = i+1
except AttributeError as error:
print('An error occured: {0}'.format(error))
else:
query_job = bigquery_client.query(query)
results = query_job.result()
with open (filename, 'w', newline='') as f: #Create CSV file
write = csv.writer(f,dialect='excel',lineterminator='\n')
try:
for row in results:
print('{},{},{},{},{},{},{},{},{},{},{},{},{},{},{} '.format( row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance))
write.writerow([row.EventId,
row.ScheduleId,
row.Date,
row.TimeFrom,
row.Description,
row.TimeTo,
row.ResourceId,
row.EmployeeId,
row.MovementTypeId,
row.Capacity,
row.CanBook,
row.NonMemberFlag,
row.MemberAmount,
row.NonMemberAmount,
row.Attendance]) #write Rows to CSV
except AttributeError as error:
print('An error occured: {0}'.format(error))
return
#Creates a dataset in BigQuery
def createDataset(datasetname):
dataset_ref = bigquery_client.dataset(datasetname)
dataset = Dataset(dataset_ref)
dataset.location = 'US'
dataset = bigquery_client.create_dataset(dataset)
return
def getDataset(datasetname):
dataset = bigquery_client.dataset(datasetname)
return dataset
def createTable(tablename, global_dataset_ref):
schema = [
#Enter Schema here.
# SchemaField('url', 'STRING', mode='required'),
# SchemaField('views', 'INTEGER', mode='required')
]
table_ref = global_dataset_ref.table(tablename)
table = Table(table_ref, schema=schema)
table = bigquery_client.create_table(table)
assert table.table_id == tablename
return
def getTable(tablename, global_dataset_ref):
table_ref = global_dataset_ref.table(tablename)
table = bigquery_client.get_table(table_ref)
# print(table.table_id)
print(table.schema)
# print(table.description)
# print(table.num_rows)
return table
def getTableSchema(tablename, global_dataset_ref):
table_ref = global_dataset_ref.table(tablename)
table = bigquery_client.get_table(table_ref)
schema = table.schema
return schema
def loadDataFromCSV(tablename, global_dataset_ref, filename):
schema = getTableSchema(tablename, global_dataset_ref)
table_ref = global_dataset_ref.table(tablename)
load_config = LoadJobConfig()
load_config.source_format = bigquery.SourceFormat.CSV
load_config.schema = schema
load_config.autodetect = True
load_config.allow_quoted_newlines = True
with open (filename, 'rb') as readable:
job = bigquery_client.load_table_from_file(readable, table_ref, location='US', job_config=load_config)
job.result()
print('Loaded {} rows into {}:{}.'.format(job.output_rows, global_dataset_ref, table_ref.table_id))
return
# Testing
if __name__ == '__main__':
datasetname = 'Data_Layer'
tablename = 'Events'
sqlquery = '''SELECT
null as EventId,
sc.scheduleid AS ScheduleId,
NULL AS Description,
sc.scheduledatefrom AS Date,
sc.timestart AS TimeFrom,
sc.timeduration AS TimeTo,
r.resourceid AS ResourceId,
sp.employeeid AS EmployeeId,
NULL AS MovementTypeId,
r.configheight AS Capacity,
CASE
WHEN st.schedulestatus IN (1, 3) THEN '1'
ELSE '0'
END CanBook,
CASE
WHEN sv.nonmembermayenroll = TRUE THEN '1'
ELSE '0'
END NonMemberFlag,
COALESCE(ProgramPrice.pricemember,
ServicePrice.pricemember,
0) AS MemberAmount,
COALESCE(ProgramPrice.pricenonmember,
ServicePrice.pricenonmember,
0) AS NonMemberAmount,
'N/A' AS Attendance
FROM
AloomaTest.SCSESSIONS s
LEFT JOIN
AloomaTest.SCSESSION_PROVIDERS sp
ON
sp.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSESSION_RESOURCES sr
ON
sr.sessionid = s.sessionid
LEFT JOIN
AloomaTest.SCSCHEDULES sc
ON
sc.scheduleid = s.scheduleid
LEFT JOIN
AloomaTest._SCSCHEDULESTATUS ST
ON
ST.schedulestatus = sc.schedulestatus
LEFT JOIN
AloomaTest.SCRESOURCES r
ON
r.resourceid = sr.resourceid
LEFT JOIN
AloomaTest.SCSERVICES sv
ON
sv.serviceid = sc.serviceid
LEFT JOIN
AloomaTest.SCPROGREG_SEMCOURSES semc
ON
semc.serviceid = sc.serviceid
AND semc.semesterid = sc.semesterid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ProgramPrice
ON
ProgramPrice.scheduleid = sc.scheduleid
LEFT JOIN
AloomaTest.SCPROGREG_PRICES ServicePrice
ON
ServicePrice.semcourseid = semc.semcourseid
WHERE
COALESCE(ProgramPrice.feetypeid,
0) = 0
AND COALESCE(ServicePrice.feetypeid,
0)= 0
and sc.scheduleid in(31207,
25936,
5761094,
832794,
9825,
17912)
'''
#createDataset(datasetname) #Successfully tested this code 2018-09-24
global_dataset_ref = getDataset(datasetname) #Successfully tested this code 2018-09-24
#createTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
getTable(tablename, global_dataset_ref) #Successfully tested this code 2018-09-24
runBigQueryQuery(sqlquery,filename,idNeeded) #Successfully tested this code 2018-09-24
loadDataFromCSV(tablename, global_dataset_ref,filename) #Successfully tested this code 2018-09-24
sample data
,25936,2009-06-01 18:30:00,1110,M1PO - M1 PT Full,60,,254,,,1,0,0,0,N/A
,17912,2009-04-22 06:15:00,375,Pil Ptnr - Pilates Partner,60,47,398,,10,1,1,0,0,N/A
,31207,2009-06-22 19:00:00,1140,D390-2 - 1 1/2 Hour Massage,90,107,548,,20,0,0,0,0,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,583,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,591,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,585,2349,,20,0,1,20,50,N/A
,5761094,2018-10-05 00:00:00,1140,Fr 7:00-9:00p Adult Paddle Mixer,120,584,2349,,20,0,1,20,50,N/A
,832794,2012-02-21 14:30:00,870,Comp Member One/One,60,,2963,,,1,0,0,0,N/A
The error message indicates that there is only 1 row in your CSV, you might be missing new lines while making it.

value toDF is not a member of org.apache.spark.rdd.RDD[(Long, org.apache.spark.ml.linalg.Vector)]

Am getting a compilation error converting the pre-LDA transformation to a data frame using SCALA in SPARK 2.0. The specific code that is throwing an error is as per below:
val documents = PreLDAmodel.transform(mp_listing_lda_df)
.select("docId","features")
.rdd
.map{ case Row(row_num: Long, features: MLVector) => (row_num, features) }
.toDF()
The complete compilation error is:
Error:(132, 8) value toDF is not a member of org.apache.spark.rdd.RDD[(Long, org.apache.spark.ml.linalg.Vector)]
possible cause: maybe a semicolon is missing before `value toDF'?
.toDF()
Here is the complete code:
import java.io.FileInputStream
import java.sql.{DriverManager, ResultSet}
import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, RegexTokenizer, StopWordsRemover}
import org.apache.spark.ml.linalg.{Vector => MLVector}
import org.apache.spark.mllib.clustering.{LDA => oldLDA}
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
object MPClassificationLDA {
/*Start: Configuration variable initialization*/
val props = new Properties
val fileStream = new FileInputStream("U:\\JIRA\\MP_Classification\\target\\classes\\mpclassification.properties")
props.load(fileStream)
val mpExtract = props.getProperty("mpExtract").toString
val shard6_db_server_name = props.getProperty("shard6_db_server_name").toString
val shard6_db_user_id = props.getProperty("shard6_db_user_id").toString
val shard6_db_user_pwd = props.getProperty("shard6_db_user_pwd").toString
val mp_output_file = props.getProperty("mp_output_file").toString
val spark_warehouse_path = props.getProperty("spark_warehouse_path").toString
val rf_model_file_path = props.getProperty("rf_model_file_path").toString
val windows_hadoop_home = props.getProperty("windows_hadoop_home").toString
val lda_vocabulary_size = props.getProperty("lda_vocabulary_size").toInt
val pre_lda_model_file_path = props.getProperty("pre_lda_model_file_path").toString
val lda_model_file_path = props.getProperty("lda_model_file_path").toString
fileStream.close()
/*End: Configuration variable initialization*/
val conf = new SparkConf().set("spark.sql.warehouse.dir", spark_warehouse_path)
def main(arg: Array[String]): Unit = {
//SQL Query definition and parameter values as parameter upon executing the Object
val cont_id = "14211599"
val top = "100000"
val start_date = "2016-05-01"
val end_date = "2016-06-01"
val mp_spark = SparkSession
.builder()
.master("local[*]")
.appName("MPClassificationLoadLDA")
.config(conf)
.getOrCreate()
MPClassificationLDACalculation(mp_spark, cont_id, top, start_date, end_date)
mp_spark.stop()
}
private def MPClassificationLDACalculation
(mp_spark: SparkSession
,cont_id: String
,top: String
,start_date: String
,end_date: String
): Unit = {
//DB connection definition
def createConnection() = {
Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver").newInstance();
DriverManager.getConnection("jdbc:sqlserver://" + shard6_db_server_name + ";user=" + shard6_db_user_id + ";password=" + shard6_db_user_pwd);
}
//DB Field Names definition
def extractvalues(r: ResultSet) = {
Row(r.getString(1),r.getString(2))
}
//Prepare SQL Statement with parameter value replacement
val query = """SELECT docId = audt_id, text = auction_title FROM brands6.dbo.uf_ds_marketplace_classification_listing(#cont_id, #top, '#start_date', '#end_date') WHERE ? < ? OPTION(RECOMPILE);"""
.replaceAll("#cont_id", cont_id)
.replaceAll("#top", top)
.replaceAll("#start_date", start_date)
.replaceAll("#end_date", end_date)
.stripMargin
//Connect to Source DB and execute the Prepared SQL Steatement
val mpDataRDD = new JdbcRDD(mp_spark.sparkContext
,createConnection
,query
,lowerBound = 0
,upperBound = 10000000
,numPartitions = 1
,mapRow = extractvalues)
val schema_string = "docId,text"
val fields = StructType(schema_string.split(",")
.map(fieldname => StructField(fieldname, StringType, true)))
//Create Data Frame using format identified through schema_string
val mpDF = mp_spark.createDataFrame(mpDataRDD, fields)
mpDF.collect()
val mp_listing_tmp = mpDF.selectExpr("cast(docId as long) docId", "text")
mp_listing_tmp.printSchema()
println(mp_listing_tmp.first)
val mp_listing_lda_df = mp_listing_tmp.withColumn("docId", mp_listing_tmp("docId"))
mp_listing_lda_df.printSchema()
val tokenizer = new RegexTokenizer()
.setInputCol("text")
.setOutputCol("rawTokens")
.setMinTokenLength(2)
val stopWordsRemover = new StopWordsRemover()
.setInputCol("rawTokens")
.setOutputCol("tokens")
val vocabSize = 4000
val countVectorizer = new CountVectorizer()
.setVocabSize(vocabSize)
.setInputCol("tokens")
.setOutputCol("features")
val PreLDApipeline = new Pipeline()
.setStages(Array(tokenizer, stopWordsRemover, countVectorizer))
val PreLDAmodel = PreLDApipeline.fit(mp_listing_lda_df)
//comment out after saving it the first time
PreLDAmodel.write.overwrite().save(pre_lda_model_file_path)
val documents = PreLDAmodel.transform(mp_listing_lda_df)
.select("docId","features")
.rdd
.map{ case Row(row_num: Long, features: MLVector) => (row_num, features) }
.toDF()
//documents.printSchema()
val numTopics: Int = 20
val maxIterations: Int = 100
//note the FeaturesCol need to be set
val lda = new LDA()
.setOptimizer("em")
.setK(numTopics)
.setMaxIter(maxIterations)
.setFeaturesCol(("_2"))
val vocabArray = PreLDAmodel.stages(2).asInstanceOf[CountVectorizerModel].vocabulary
}
}
Am thinking that it is related to conflicts in the imports section of the code. Appreciate any help.
2 things needed to be done:
Import implicits: Note that this should be done only after an instance of org.apache.spark.sql.SQLContext is created. It should be written as:
val sqlContext= new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
Move case class outside of the method: case class, by use of which you define the schema of the DataFrame, should be defined outside of the method needing it. You can read more about it here: https://issues.scala-lang.org/browse/SI-6649

How to join a Stream RDD with a previous computed result in Spark Stream?

Now I am writing a Spark streaming program to detect the abnormal of network in a data center. I try to use regression algorithm. For example, I use the training data set to compute the model (i.e., the coef), and then how can I use this previous computed model in the data stream. I use the following join, but get the exception.
Traceback (most recent call last):
File "/home/xiuli/PycharmProjects/benchmark/parx.py", line 98, in <module>
joinedStream = testRDD.join(trainingRDD)
File "/opt/spark-1.4.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 362, in join
File "/opt/spark-1.4.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 313, in transformWith
AttributeError: 'PipelinedRDD' object has no attribute '_jdstream'
I could see Spark streaming guide give an example, but it lack of the details.
Stream-dataset joins
This has already been shown earlier while explain DStream.transform
operation. Here is yet another example of joining a windowed stream
with a dataset.
dataset = ... # some RDD
windowedStream = stream.window(20)
joinedStream = windowedStream.transform(lambda rdd: rdd.join(dataset))
Following is my code:
from __future__ import print_function
import sys,os,datetime
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.context import SQLContext
from pyspark.resultiterable import ResultIterable
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
import numpy as np
import statsmodels.api as sm
def splitLine(line, delimiter='|'):
values = line.split(delimiter)
st = datetime.datetime.strptime(values[1], '%Y-%m-%d %H:%M:%S')
return (values[0],st.hour), values[2:]
def reg_m(y, x):
ones = np.ones(len(x[0]))
X = sm.add_constant(np.column_stack((x[0], ones)))
for ele in x[1:]:
X = sm.add_constant(np.column_stack((ele, X)))
results = sm.OLS(y, X).fit()
return results
def train(line):
y,x = [],[]
y, x = [],[[],[],[],[],[],[]]
reading_tmp,temp_tmp = [],[]
i = 0
for reading, temperature in line[1]:
if i%4==0 and len(reading_tmp)==4:
y.append(reading_tmp.pop())
x[0].append(reading_tmp.pop())
x[1].append(reading_tmp.pop())
x[2].append(reading_tmp.pop())
temp = float(temp_tmp[0])
del temp_tmp[:]
x[3].append(temp-20.0 if temp>20.0 else 0.0)
x[4].append(16.0-temp if temp<16.0 else 0.0)
x[5].append(5.0-temp if temp<5.0 else 0.0)
reading_tmp.append(float(reading))
temp_tmp.append(float(temperature))
i = i + 1
return str(line[0]),reg_m(y, x).params.tolist()
def detect(line):
y,x = [],[]
y, x = [],[[],[],[],[],[],[]]
reading_tmp,temp_tmp = [],[]
i = 0
for reading, temperature in line[1]:
if i%4==0 and len(reading_tmp)==4:
y.append(reading_tmp.pop())
x[0].append(reading_tmp.pop())
x[1].append(reading_tmp.pop())
x[2].append(reading_tmp.pop())
temp = float(temp_tmp[0])
del temp_tmp[:]
x[3].append(temp-20.0 if temp>20.0 else 0.0)
x[4].append(16.0-temp if temp<16.0 else 0.0)
x[5].append(5.0-temp if temp<5.0 else 0.0)
reading_tmp.append(float(reading))
temp_tmp.append(float(temperature))
i = i + 1
return line[0],reg_m(y, x).params.tolist()
if __name__ == "__main__":
if len(sys.argv) != 4:
print("Usage: parx.py <checkpointDir> <trainingDataDir> <streamDataDir>", file=sys.stderr)
exit(-1)
checkpoint, trainingInput, streamInput = sys.argv[1:]
sc = SparkContext("local[2]", appName="BenchmarkSparkStreaming")
trainingLines = sc.textFile(trainingInput)
trainingRDD = trainingLines.map(lambda line: splitLine(line, "|"))\
.groupByKey()\
.map(lambda line: train(line)).cache()
ssc = StreamingContext(sc, 1)
ssc.checkpoint(checkpoint)
lines = ssc.textFileStream(streamInput).map(lambda line: splitLine(line, "|"))
testRDD = lines.groupByKeyAndWindow(1,1).map(lambda line:(str(line[0]), line[1]))
joinedStream = testRDD.join(trainingRDD)
joinedStream.pprint(20)
ssc.start()
ssc.awaitTermination()
According to the documentation that you referred to, try:
testRDD.transform(lambda rdd: rdd.join(trainingRDD))

Resources