How to join a stream RDD with a previously computed result in Spark Streaming?

I am writing a Spark Streaming program to detect network anomalies in a data center, and I am trying to use a regression algorithm. For example, I use the training data set to compute the model (i.e., the coefficients), and then I want to apply this previously computed model to the data stream. I use the following join, but I get the exception below.
Traceback (most recent call last):
File "/home/xiuli/PycharmProjects/benchmark/parx.py", line 98, in <module>
joinedStream = testRDD.join(trainingRDD)
File "/opt/spark-1.4.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 362, in join
File "/opt/spark-1.4.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 313, in transformWith
AttributeError: 'PipelinedRDD' object has no attribute '_jdstream'
I can see that the Spark Streaming guide gives an example, but it lacks details.
Stream-dataset joins
This has already been shown earlier while explaining the DStream.transform
operation. Here is yet another example of joining a windowed stream
with a dataset.
dataset = ... # some RDD
windowedStream = stream.window(20)
joinedStream = windowedStream.transform(lambda rdd: rdd.join(dataset))
Following is my code:
from __future__ import print_function
import sys, os, datetime
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.context import SQLContext
from pyspark.resultiterable import ResultIterable
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
import numpy as np
import statsmodels.api as sm


def splitLine(line, delimiter='|'):
    values = line.split(delimiter)
    st = datetime.datetime.strptime(values[1], '%Y-%m-%d %H:%M:%S')
    return (values[0], st.hour), values[2:]


def reg_m(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results


def train(line):
    y, x = [], [[], [], [], [], [], []]
    reading_tmp, temp_tmp = [], []
    i = 0
    for reading, temperature in line[1]:
        # every 4th sample: the newest buffered reading is the target,
        # the three before it become lag features
        if i % 4 == 0 and len(reading_tmp) == 4:
            y.append(reading_tmp.pop())
            x[0].append(reading_tmp.pop())
            x[1].append(reading_tmp.pop())
            x[2].append(reading_tmp.pop())
            temp = float(temp_tmp[0])
            del temp_tmp[:]
            # piecewise temperature features around the 20/16/5 degree thresholds
            x[3].append(temp - 20.0 if temp > 20.0 else 0.0)
            x[4].append(16.0 - temp if temp < 16.0 else 0.0)
            x[5].append(5.0 - temp if temp < 5.0 else 0.0)
        reading_tmp.append(float(reading))
        temp_tmp.append(float(temperature))
        i = i + 1
    return str(line[0]), reg_m(y, x).params.tolist()


def detect(line):
    # same feature extraction as train(), but the key is returned unchanged
    y, x = [], [[], [], [], [], [], []]
    reading_tmp, temp_tmp = [], []
    i = 0
    for reading, temperature in line[1]:
        if i % 4 == 0 and len(reading_tmp) == 4:
            y.append(reading_tmp.pop())
            x[0].append(reading_tmp.pop())
            x[1].append(reading_tmp.pop())
            x[2].append(reading_tmp.pop())
            temp = float(temp_tmp[0])
            del temp_tmp[:]
            x[3].append(temp - 20.0 if temp > 20.0 else 0.0)
            x[4].append(16.0 - temp if temp < 16.0 else 0.0)
            x[5].append(5.0 - temp if temp < 5.0 else 0.0)
        reading_tmp.append(float(reading))
        temp_tmp.append(float(temperature))
        i = i + 1
    return line[0], reg_m(y, x).params.tolist()


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: parx.py <checkpointDir> <trainingDataDir> <streamDataDir>", file=sys.stderr)
        exit(-1)
    checkpoint, trainingInput, streamInput = sys.argv[1:]
    sc = SparkContext("local[2]", appName="BenchmarkSparkStreaming")

    trainingLines = sc.textFile(trainingInput)
    trainingRDD = trainingLines.map(lambda line: splitLine(line, "|"))\
        .groupByKey()\
        .map(lambda line: train(line)).cache()

    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(checkpoint)
    lines = ssc.textFileStream(streamInput).map(lambda line: splitLine(line, "|"))
    testRDD = lines.groupByKeyAndWindow(1, 1).map(lambda line: (str(line[0]), line[1]))

    joinedStream = testRDD.join(trainingRDD)
    joinedStream.pprint(20)

    ssc.start()
    ssc.awaitTermination()

According to the documentation that you referred to, try:
testRDD.transform(lambda rdd: rdd.join(trainingRDD))
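DStream.join only accepts another DStream; the '_jdstream' error in the traceback comes from passing a plain RDD where a DStream is expected. transform exposes the RDD behind each micro-batch, so a regular RDD-to-RDD join with the static, cached trainingRDD works there. In the question's variable names, a minimal sketch looks like this:
# join each micro-batch of the stream with the static trainingRDD
joinedStream = testRDD.transform(lambda rdd: rdd.join(trainingRDD))
joinedStream.pprint(20)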

Related

Modify the PageRank algorithm and run it as a streaming application

Modify the application from the previous exercise so that it (1) reads all the files in the directory and (2) captures any new records/files as a streaming application.
Open a terminal and start the streaming application.
Open a new terminal and upload the files listed in this exercise using the 'copyFromLocal' command. Allow some interval (for example, 1-2 minutes) between executions of the copyFromLocal command while observing the output of the streaming application in the other terminal. Notice how adding more files changes the rankings of the accounts.
My code runs as a streaming application (on AWS EMR, using PuTTY), but it does not pick up the CSV files I add to the 'new' folder and does not produce a new ranking.
The PageRank algorithm:
from __future__ import print_function
import sys
from pyspark.sql import SparkSession

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: PageRank.py <input-file> <output-file> <iterations>", file=sys.stderr)
        sys.exit()

    spark = SparkSession.builder.getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    def computeContribs(neighbors, rank):
        for neighbor in neighbors:
            yield (neighbor, rank / len(neighbors))

    links = spark.sparkContext.textFile(sys.argv[1]).\
        map(lambda line: line.split(',')).\
        map(lambda pages: (pages[0], pages[1])).\
        distinct().\
        groupByKey().\
        map(lambda x: (x[0], list(x[1])))

    ranks = links.map(lambda element: (element[0], 1.0))

    iterations = int(sys.argv[3])
    for x in range(iterations):
        contribs = links.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
        print("\n")
        print("------- Iter: " + str(x) + " --------")
        ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).map(lambda nr: (nr[0], nr[1] * 0.85 + 0.15))
        for rank in ranks.collect():
            print(rank)

    print("\n")
    print("------- Final Results --------")
    for rank in ranks.collect():
        print(rank)
    ranks.saveAsTextFile(sys.argv[2])
    spark.stop()
My code:
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(appName="Rank")
    ssc = StreamingContext(sc, 10)
    ssc.sparkContext.setLogLevel("WARN")

    def computeContribs(neighbors, rank):
        for neighbor in neighbors:
            yield (neighbor, rank / len(neighbors))

    links = ssc.textFileStream('new/').\
        map(lambda line: line.split(',')).\
        map(lambda pages: (pages[0], pages[1])).\
        groupByKey().\
        map(lambda x: (x[0], list(x[1])))

    ranks = links.map(lambda element: (element[0], 1.0))
    contribs = links.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
    ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).map(lambda nr: (nr[0], nr[1] * 0.85 + 0.15))
    ranks.pprint()

    ssc.start()
    ssc.awaitTermination()

Creating a stream from a text file in PySpark

I'm getting the following error when I try to create a stream from a text file in PySpark:
TypeError: unbound method textFileStream() must be called with StreamingContext instance as first argument (got str instance instead)
I don't want to use SparkContext directly because I get another error, so to remove that error I have to use SparkSession.
My code:
import sys

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.mllib.stat import Statistics

if __name__ == "__main__":
    spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
    ssc = StreamingContext(spark.sparkContext, 5)

    input_path1 = sys.argv[1]
    input_path2 = sys.argv[2]

    ds1 = ssc.textFileStream(input_path1)
    lines1 = ds1.map(lambda x1: x1[1])
    windowedds1 = lines1.flatMap(lambda line1: line1.strip().split("\n")).map(lambda strelem1: float(strelem1)).window(5, 10)

    ds2 = ssc.textFileStream(input_path2)
    lines2 = ds2.map(lambda x2: x2[1])
    windowedds2 = lines2.flatMap(lambda line2: line2.strip().split("\n")).map(lambda strelem2: float(strelem2)).window(5, 10)

    result = Statistics.corr(windowedds1, windowedds2, method="pearson")
    if result > 0.7:
        print("ds1 and ds2 are correlated!!!")
    spark.stop()
Thank you!
You have to first create a StreamingContext object and then use it to call textFileStream.
spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 1)
ds = ssc.textFileStream(input_path)
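Applied to the code in the question, a minimal sketch of the corrected setup (reusing the question's variable names, with the paths still taken from sys.argv) might look like this:
import sys
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

spark = SparkSession.builder.appName("CrossCorrelation").getOrCreate()
ssc = StreamingContext(spark.sparkContext, 5)

input_path1 = sys.argv[1]
input_path2 = sys.argv[2]

# textFileStream is an instance method, so call it on ssc, not on the
# StreamingContext class itself (calling it on the class produces the
# "unbound method" TypeError shown above).
ds1 = ssc.textFileStream(input_path1)
ds2 = ssc.textFileStream(input_path2)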

Averaging data using Apache Spark Streaming

I'm using Python.
I'm receiving JSON dictionaries through Kafka into a Spark stream. The JSON looks like {"a":10}{"a":20} (one dict per Kafka message); the key is always "a", but how many dictionaries arrive is not known in advance.
Now I want the average of 10 and 20 in the case above.
As far as I know, something like averageByKey may be useful, but I don't know how to use it.
Any help would be great!
Thank you for reading.
Update
from __future__ import print_function
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json


def createContext():
    sc = SparkContext(appName="PythonSparkStreamingKafka_RM_02")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 60)

    kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming-consumer', {'': 1})
    raw = kafkaStream.map(lambda kafkaS: kafkaS[1])
    clean = raw.map(lambda v: json.loads(v))
    print(dir(clean))
    clean.pprint()

    add = clean.map(lambda xs: ('Total', xs['hello'])).reduceByKey(lambda a, b: a + b)
    add.pprint()

    count_var = clean.count()
    count_var.pprint()

    average = add.map(lambda tpl: tpl[1] / float(60))
    average.pprint()

    return ssc


if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate('/path/checkpoint_v' + sys.argv[1], lambda: createContext())
    ssc.start()
    ssc.awaitTermination()
Now, in the above program, I'm getting the add.pprint() output shown below. For example, if the stream is:
{u'hello': 26}
{u'hello': 28}
{u'hello': 31}
{u'hello': 35}
{u'hello': 40}
{u'hello': 46}
then add.pprint() outputs:
('Total', 206)
and count_var.pprint() outputs:
6
The question is: in the line below
average = add.map(lambda tpl: tpl[1]/float(60))
I want to use the value behind count_var (which is 6 here) instead of the static value 60.
So how can I use a stream object as an integer in the above operation?
First, you need to map your events to some processable type, for example a tuple. Then you can use the classic "map -> reduceByKey -> map" pattern to calculate the average, like this:
import json

ssc = StreamingContext(spark.sparkContext, 1)
dstream = KafkaUtils.createDirectStream(ssc, ['topic'], client_configuration,
                                        valueDecoder=lambda s: json.loads(s.decode('ascii')))


def map_event(raw):
    item = list(raw[1].items())[0]
    return (item[0], (1, item[1]))


dstream.map(map_event).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
    .map(lambda r: (r[0], float(r[1][1]) / r[1][0])) \
    .pprint()

ssc.start()
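To trace this on the sample messages from the question: a batch containing {"a":10} and {"a":20} is mapped to ('a', (1, 10)) and ('a', (1, 20)), reduced to ('a', (2, 30)), and finally printed as ('a', 15.0), i.e. a (key, average) pair per batch, with no hard-coded batch size like the 60 in the question.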

Spark - Sort DStream by Key and limit to 5 values

I've started to learn Spark and I wrote a PySpark streaming program to read stock data (symbol, volume) from port 3333.
Sample data streamed on port 3333:
"AAC",111113
"ABT",7451020
"ABBV",7325429
"ADPT",318617
"AET",1839122
"ALR",372777
"AGN",4170581
"ABC",3001798
"ANTM",1968246
I want to display the top 5 symbols based on volume, so I used a map to read each line, split it on the comma, and reverse the result.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 5)
lines = ssc.socketTextStream("localhost", 3333)
stocks = lines.map(lambda line: sorted(line.split(','), reverse=True))
stocks.pprint()
Following is the output of stocks.pprint()
[u'111113', u'"AAC"']
[u'7451020', u'"ABT"']
[u'7325429', u'"ABBV"']
[u'318617', u'"ADPT"']
[u'1839122', u'"AET"']
[u'372777', u'"ALR"']
[u'4170581', u'"AGN"']
[u'3001798', u'"ABC"']
[u'1968246', u'"ANTM"']
I've got the following function in mind to display the stock symbols, but I'm not sure how to sort the stocks by key (volume) and then limit the output to only the first 5 values.
def processStocks(stock):
    for st in stock.collect():
        print(st[1])

stocks.foreachRDD(processStocks)
Since a stream represents an infinite sequence, all you can do is sort each batch. First, you'll have to correctly parse the data:
lines = ssc.queueStream([sc.parallelize([
    "AAC,111113", "ABT,7451020", "ABBV,7325429", "ADPT,318617",
    "AET,1839122", "ALR,372777", "AGN,4170581", "ABC,3001798",
    "ANTM,1968246"
])])


def parse(line):
    try:
        k, v = line.split(",")
        yield (k, int(v))
    except ValueError:
        pass


parsed = lines.flatMap(parse)
Next, sort each batch:
sorted_ = parsed.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
Finally, you can pprint the top elements:
sorted_.pprint(5)
If all went well you should get output like below:
-------------------------------------------
Time: 2016-10-02 14:52:30
-------------------------------------------
('ABT', 7451020)
('ABBV', 7325429)
('AGN', 4170581)
('ABC', 3001798)
('ANTM', 1968246)
...
Depending on the size of a batch, a full sort can be prohibitively expensive. In that case you can take the top elements and parallelize:
sorted_ = parsed.transform(lambda rdd: rdd.ctx.parallelize(rdd.top(5)))
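This works because rdd.top(5) only keeps the 5 largest elements of each partition and merges those small candidate lists on the driver, so the batch is never fully sorted or shuffled; parallelizing the result wraps it back into an RDD so the rest of the DStream pipeline is unaffected.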
or even aggregate with combineByKey:
from operator import itemgetter
import heapq
key = itemgetter(1)
def create_combiner(key=lambda x: x):
    def _(x):
        return [(key(x), x)]
    return _


def merge_value(n=5, key=lambda x: x):
    def _(acc, x):
        heapq.heappush(acc, (key(x), x))
        return heapq.nlargest(n, acc) if len(acc) > n else acc
    return _


def merge_combiners(n=5):
    def _(acc1, acc2):
        merged = list(heapq.merge(acc1, acc2))
        return heapq.nlargest(n, merged) if len(merged) > n else merged
    return _


(parsed
    .map(lambda x: (None, x))
    .combineByKey(
        create_combiner(key=key), merge_value(key=key), merge_combiners())
    .flatMap(lambda x: x[1]))
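As a usage sketch (assuming the pipeline above is bound to a variable, say top5, which the original snippet does not do): with key = itemgetter(1), each element emitted by the final flatMap is a (volume, (symbol, volume)) pair, so a trailing map can restore plain (symbol, volume) tuples before printing.
top5 = (parsed
    .map(lambda x: (None, x))
    .combineByKey(
        create_combiner(key=key), merge_value(key=key), merge_combiners())
    .flatMap(lambda x: x[1]))

# drop the auxiliary sort key and print the per-batch top 5
top5.map(lambda kv: kv[1]).pprint(5)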

Why does my Spark Streaming app not show any output?

This is a follow-up to an earlier Stack Overflow question of mine, for which I did not get a response.
I have tried writing the code below, which does not throw any error, but it does not show any output either.
My goal is to evaluate the DStream objects against a historical data RDD; I could not find any PySpark example like this (checking a streaming RDD against a static RDD created beforehand). I'd appreciate your response. Thanks.
"""
Created on Thu May 05 16:23:15 2016
#author: bghosh
"""
import re
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext,functions as func,Row
sc = SparkContext("local[2]", "realtimeApp")
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc,10)
files = ssc.textFileStream("hdfs://RealTimeInputFolder/")
########Lets get the data from the db which is relavant for streaming ###
driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
dataurl = "jdbc:sqlserver://devserver:1433"
db = "devDB"
table = "stream_helper"
credential = "dev_credential"
########basic data for evaluation purpose ########
#base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data = sqlContext.read.format("jdbc").options(driver=driver,url=dataurl,database=db,user=credential,password=credential,dbtable=table).load()
base_data.registerTempTable("base_data")
######
files_count = files.flatMap(lambda file: file.split( ))
#pattern = '(TranAmount=Decimal.{2})(.[0-9]*.[0-9]*)(\\S+ )(TranDescription=u.)([a-zA-z\\s]+)([\\S\\s]+ )(dSc=u.)([A-Z]{2}.[0-9]+)'
tranfiles = "wasb://vanspark01#vanspark01.blob.core.windows.net/RealTimeInputFolder01/"
def getSqlContextInstance(sparkContext):
if ('sqlContextSingletonInstance' not in globals()):
globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
return globals()['sqlContextSingletonInstance']
def preparse(logline):
#match = re.search(pattern,logline)
pre = logline.split(",")
return(
Row(
Customer_id = pre[-1],
trantype = pre[-4],
amount = float(pre[-5]))
)
def parse():
parsed_tran = ssc.textFileStream(tranfiles).map(preparse)
#success = parsed_tran.filter(lambda s: s[1] == 1).map(lambda x:x[0])
#fail = parsed_tran.filter(lambda s:s[1] == 0).map(lambda x:x[0])
"""if fail.count() > 0:
print "no of non parsed file : %d",fail.count()
"""
return parsed_tran#success
def check_historic(rdd):
#checking with the historical table #
try:
streamSqlcontext = getSqlContextInstance(rdd)
stream_df = streamSqlcontext.createDataFrame(rdd)
stream_df.registerTempTable("stream_df")
result_data_frame = streamSqlcontext.sql("select * from stream_df LEFT OUTER JOIN base_data on stream_df.Customer_id= base_data.Customer_id" )
result_data_frame.show()
except:
pass
#return result_data_frame.rdd
success = parse()
success.foreachRDD(check_historic)
ssc.start()
ssc.awaitTermination()
