Modify the PageRank algorithm and run it as a streaming application - apache-spark

Modify the application from the previous exercise so it (1) reads all the files in the directory and (2) captures any new records/files as a streaming application.
Open a terminal and start the streaming application.
Open a new terminal and upload the files listed in this exercise using the 'copyFromLocal' command. Allow some interval (for example, 1-2 minutes) between executions of the copyFromLocal command while watching the output of the streaming application in the other terminal. Notice how adding more files changes the rankings of the accounts.
My code runs as a streaming application (on AWS EMR, using PuTTY), but it does not pick up the CSV files I add to the 'new' folder and does not produce new ranks.
PageRank algorithm
import sys
from pyspark.sql import SparkSession

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: PageRank.py <input-file> <output-file> <iterations>", file=sys.stderr)
        sys.exit()

    spark = SparkSession.builder.getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    def computeContribs(neighbors, rank):
        # Each page passes an equal share of its rank to every neighbor.
        for neighbor in neighbors:
            yield (neighbor, rank / len(neighbors))

    # Build the link graph as (page, [neighbors]).
    links = spark.sparkContext.textFile(sys.argv[1]).\
        map(lambda line: line.split(',')).\
        map(lambda pages: (pages[0], pages[1])).\
        distinct().\
        groupByKey().\
        map(lambda x: (x[0], list(x[1])))

    # Every page starts with a rank of 1.0.
    ranks = links.map(lambda element: (element[0], 1.0))

    iterations = int(sys.argv[3])
    for x in range(iterations):
        contribs = links.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
        print("\n")
        print("------- Iter: " + str(x) + " --------")
        # Damping: new rank = 0.15 + 0.85 * (sum of received contributions).
        ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).map(lambda nr: (nr[0], nr[1] * 0.85 + 0.15))
        for rank in ranks.collect():
            print(rank)

    print("\n")
    print("------- Final Results --------")
    for rank in ranks.collect():
        print(rank)

    ranks.saveAsTextFile(sys.argv[2])
    spark.stop()
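To see what computeContribs and the damping step are doing, here is a tiny worked example of a single iteration on a made-up three-node graph (plain Python, independent of Spark; the graph is an illustration, not part of the exercise data):

links = {"A": ["B", "C"], "B": ["C"], "C": ["A"]}   # hypothetical toy graph
ranks = {"A": 1.0, "B": 1.0, "C": 1.0}              # every page starts at 1.0

# Each page splits its rank evenly among its neighbors (computeContribs) ...
contribs = {}
for page, neighbors in links.items():
    for n in neighbors:
        contribs[n] = contribs.get(n, 0.0) + ranks[page] / len(neighbors)

# ... and the new rank is 0.15 + 0.85 * (sum of contributions received).
new_ranks = {page: 0.15 + 0.85 * c for page, c in contribs.items()}
print(new_ranks)   # {'B': 0.575, 'C': 1.425, 'A': 1.0}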
My code
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(appName="Rank")
    ssc = StreamingContext(sc, 10)
    ssc.sparkContext.setLogLevel("WARN")

    def computeContribs(neighbors, rank):
        for neighbor in neighbors:
            yield (neighbor, rank / len(neighbors))

    links = ssc.textFileStream('new/').\
        map(lambda line: line.split(',')).\
        map(lambda pages: (pages[0], pages[1])).\
        groupByKey().\
        map(lambda x: (x[0], list(x[1])))

    ranks = links.map(lambda element: (element[0], 1.0))

    contribs = links.join(ranks).flatMap(lambda row: computeContribs(row[1][0], row[1][1]))
    ranks = contribs.reduceByKey(lambda v1, v2: v1 + v2).map(lambda nr: (nr[0], nr[1] * 0.85 + 0.15))
    ranks.pprint()

    ssc.start()
    ssc.awaitTermination()
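One thing worth checking before debugging the PageRank logic itself: textFileStream only registers files that are created in (or moved into) the monitored directory after the stream has started, and a relative path such as 'new/' is resolved against the default filesystem, which on EMR is usually HDFS. A minimal sanity check, assuming a hypothetical HDFS path hdfs:///user/hadoop/new/, could look like this:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="TextFileStreamCheck")
ssc = StreamingContext(sc, 10)

# hdfs:///user/hadoop/new/ is an assumed path; point this at the directory
# your copyFromLocal command actually writes to.
lines = ssc.textFileStream("hdfs:///user/hadoop/new/")

# If new files are being detected, batches that receive data print a non-zero count.
lines.count().pprint()

ssc.start()
ssc.awaitTermination()

If the counts stay empty here as well, the problem is the path or the timing of the uploads rather than the ranking code.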

Related

How to direct stream (Kafka) a JSON file in Spark and convert it into an RDD?

I wrote code that direct-streams (Kafka) a word count when a file is given (to the producer).
Code:
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## Constants
APP_NAME = "PythonStreamingDirectKafkaWordCount"

## OTHER FUNCTIONS/CLASSES

def main():
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    main()
I need to convert the input JSON to a Spark DataFrame using a DStream.
This should work:
Once you have your variable kvs containing the TransformedDStream, you can just map the DStream and pass the data to a handler function like this:
data = kvs.map( lambda tuple: tuple[1] )
data.foreachRDD( lambda yourRdd: readMyRddsFromKafkaStream( yourRdd ) )
You should define a handler function that creates the DataFrame from your JSON data:
def readMyRddsFromKafkaStream( readRdd ):
    # Put the RDD of JSON strings into a DataFrame
    # (this assumes a SparkSession named `spark` already exists on the driver)
    df = spark.read.json( readRdd )
    df.registerTempTable( "temporary_table" )
    df = spark.sql( """
        SELECT
            *
        FROM
            temporary_table
        """ )
    df.show()
Hope it helps my friends :)

How to process multiple Spark SQL queries in parallel [duplicate]

I am trying to run 2 functions doing completely independent transformations on a single RDD in parallel using PySpark. What are some methods to do the same?
from multiprocessing import Process
from pyspark import SparkContext
from pyspark.sql import SQLContext, HiveContext

def doXTransforms(sampleRDD):
    ...  # (X transforms)

def doYTransforms(sampleRDD):
    ...  # (Y transforms)

if __name__ == "__main__":
    sc = SparkContext(appName="parallelTransforms")
    sqlContext = SQLContext(sc)
    hive_context = HiveContext(sc)
    rows_rdd = hive_context.sql("select * from tables.X_table")

    p1 = Process(target=doXTransforms, args=(rows_rdd,))
    p1.start()
    p2 = Process(target=doYTransforms, args=(rows_rdd,))
    p2.start()
    p1.join()
    p2.join()
    sc.stop()
This does not work, and I now understand why it will not work.
But is there any alternative way to achieve this? Specifically, are there any Python/Spark-specific solutions?
Just use threads and make sure that the cluster has enough resources to process both tasks at the same time.
from threading import Thread
import time

def process(rdd, f):
    def delay(x):
        time.sleep(1)
        return f(x)
    return rdd.map(delay).sum()

rdd = sc.parallelize(range(100), int(sc.defaultParallelism / 2))

t1 = Thread(target=process, args=(rdd, lambda x: x * 2))
t2 = Thread(target=process, args=(rdd, lambda x: x + 1))
t1.start(); t2.start()
Arguably this is not often useful in practice, but otherwise it should work just fine.
You can further use in-application scheduling with the FAIR scheduler and scheduler pools for better control over the execution strategy, as sketched below.
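A sketch of what that could look like with the question's doXTransforms / doYTransforms functions (the pool names and the configuration shown in the comments are illustrative assumptions, not part of the original answer):

from threading import Thread

# FAIR scheduling is enabled via configuration, e.g.
#   spark.scheduler.mode=FAIR
# optionally with spark.scheduler.allocation.file pointing at a pool definition file.

def run_in_pool(pool_name, job):
    # setLocalProperty is per-thread, so jobs submitted from this thread
    # go to the named scheduler pool.
    sc.setLocalProperty("spark.scheduler.pool", pool_name)
    job()

t1 = Thread(target=run_in_pool, args=("x_pool", lambda: doXTransforms(rows_rdd)))
t2 = Thread(target=run_in_pool, args=("y_pool", lambda: doYTransforms(rows_rdd)))
t1.start(); t2.start()
t1.join(); t2.join()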
You can also try pyspark-asyncactions (disclaimer: the author of this answer is also the author of the package), which provides a set of wrappers around the Spark API and concurrent.futures:
import asyncactions
import concurrent.futures
f1 = rdd.filter(lambda x: x % 3 == 0).countAsync()
f2 = rdd.filter(lambda x: x % 11 == 0).countAsync()
[x.result() for x in concurrent.futures.as_completed([f1, f2])]

averaging of data using apache spark streaming

I'm using Python.
I'm receiving JSON dictionaries through Kafka into a Spark stream.
The JSON looks like {"a":10}{"a":20} (one dict per Kafka message); the key will always be "a", but the number of dictionaries is not known in advance.
Now I want the average of 10 and 20 in the above case.
As far as I know, something like averageByKey may be useful, but I don't know how to use it.
Any help would be great!
Thank you for reading.
Update
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json

def createContext():
    sc = SparkContext(appName="PythonSparkStreamingKafka_RM_02")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 60)

    kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming-consumer', {'': 1})
    raw = kafkaStream.map(lambda kafkaS: kafkaS[1])
    clean = raw.map(lambda v: json.loads(v))
    print(dir(clean))
    clean.pprint()

    add = clean.map(lambda xs: ('Total', xs['hello'])).reduceByKey(lambda a, b: a + b)
    add.pprint()

    count_var = clean.count()
    count_var.pprint()

    average = add.map(lambda tpl: tpl[1] / float(60))
    average.pprint()

    return ssc

if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate('/path/checkpoint_v' + sys.argv[1], lambda: createContext())
    ssc.start()
    ssc.awaitTermination()
Now, in the above program, I'm getting add.pprint() output as below, for example:
The stream is like:
{u'hello': 26}
{u'hello': 28}
{u'hello': 31}
{u'hello': 35}
{u'hello': 40}
{u'hello': 46}
>('Total',206)
and the output of count_var.pprint() is, for example:
> 6
The question is: in the line below
> average = add.map(lambda tpl: tpl[1]/float(60))
I want to use the value produced by count_var (which is 6 here) instead of the static value 60.
So how can I use a stream object as an integer in the above operation?
First, you need to map your events to some processable type, for example a tuple. Then you can use the classic "map -> reduceByKey -> map" pattern to calculate the average, like this:
import json

ssc = StreamingContext(spark.sparkContext, 1)

dstream = KafkaUtils.createDirectStream(ssc, ['topic'], client_configuration,
                                        valueDecoder=lambda s: json.loads(s.decode('ascii')))

def map_event(raw):
    # raw[1] is the decoded JSON dict; emit (key, (count, value)) so counts and sums
    # can be reduced together.
    item = list(raw[1].items())[0]
    return (item[0], (1, item[1]))

dstream.map(map_event).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
    .map(lambda r: (r[0], float(r[1][1]) / r[1][0])) \
    .pprint()

ssc.start()
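For the sample messages from the question ({"a":10} and {"a":20} arriving in the same batch), the reduceByKey step yields ('a', (2, 30)) and the final map prints something like:
-------------------------------------------
Time: ...
-------------------------------------------
('a', 15.0)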

Spark - Sort DStream by Key and limit to 5 values

I've started to learn Spark and I wrote a PySpark streaming program to read stock data (symbol, volume) from port 3333.
Sample data streamed at 3333
"AAC",111113
"ABT",7451020
"ABBV",7325429
"ADPT",318617
"AET",1839122
"ALR",372777
"AGN",4170581
"ABC",3001798
"ANTM",1968246
I want to display the top 5 symbols based on volume, so I used a mapper to read each line, then split it by comma and reversed it.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 5)
lines = ssc.socketTextStream("localhost", 3333)
stocks = lines.map(lambda line: sorted(line.split(','), reverse=True))
stocks.pprint()
The following is the output of stocks.pprint():
[u'111113', u'"AAC"']
[u'7451020', u'"ABT"']
[u'7325429', u'"ABBV"']
[u'318617', u'"ADPT"']
[u'1839122', u'"AET"']
[u'372777', u'"ALR"']
[u'4170581', u'"AGN"']
[u'3001798', u'"ABC"']
[u'1968246', u'"ANTM"']
I've got the following function in mind to display the stock symbols, but I'm not sure how to sort the stocks by key (volume) and then limit the output to only the first 5 values.
def processStocks(stock):
    for st in stock.collect():
        print(st[1])

stocks.foreachRDD(processStocks)
Since a stream represents an infinite sequence, all you can do is sort each batch. First, you'll have to correctly parse the data:
lines = ssc.queueStream([sc.parallelize([
    "AAC,111113", "ABT,7451020", "ABBV,7325429", "ADPT,318617",
    "AET,1839122", "ALR,372777", "AGN,4170581", "ABC,3001798",
    "ANTM,1968246"
])])

def parse(line):
    try:
        k, v = line.split(",")
        yield (k, int(v))
    except ValueError:
        pass

parsed = lines.flatMap(parse)
Next, sort each batch:
sorted_ = parsed.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
Finally, you can pprint top elements:
sorted_.pprint(5)
If all went well you should get output like below:
-------------------------------------------
Time: 2016-10-02 14:52:30
-------------------------------------------
('ABT', 7451020)
('ABBV', 7325429)
('AGN', 4170581)
('ABC', 3001798)
('ANTM', 1968246)
...
Depending on the size of a batch, a full sort can be prohibitively expensive. In that case you can take the top elements and parallelize:
sorted_ = parsed.transform(lambda rdd: rdd.ctx.parallelize(rdd.top(5)))
or even use combineByKey to keep only a bounded top-N per key:
from operator import itemgetter
import heapq

key = itemgetter(1)

def create_combiner(key=lambda x: x):
    def _(x):
        return [(key(x), x)]
    return _

def merge_value(n=5, key=lambda x: x):
    def _(acc, x):
        heapq.heappush(acc, (key(x), x))
        return heapq.nlargest(n, acc) if len(acc) > n else acc
    return _

def merge_combiners(n=5):
    def _(acc1, acc2):
        merged = list(heapq.merge(acc1, acc2))
        return heapq.nlargest(n, merged) if len(merged) > n else merged
    return _

(parsed
    .map(lambda x: (None, x))
    .combineByKey(
        create_combiner(key=key), merge_value(key=key), merge_combiners())
    .flatMap(lambda x: x[1]))
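A simpler, if less incremental, sketch of the same bounded top-N idea using plain reduceByKey (keeping small lists as values; this is an added illustration, not part of the original answer):

from operator import itemgetter
import heapq

top5 = (parsed
    .map(lambda x: (None, [x]))
    .reduceByKey(lambda a, b: heapq.nlargest(5, a + b, key=itemgetter(1)))
    .flatMap(lambda x: x[1]))

top5.pprint()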

How to join a Stream RDD with a previous computed result in Spark Stream?

I am writing a Spark Streaming program to detect network anomalies in a data center. I am trying to use a regression algorithm: I use the training data set to compute the model (i.e., the coefficients), and then I want to use this previously computed model on the data stream. I use the following join, but get the exception below.
Traceback (most recent call last):
File "/home/xiuli/PycharmProjects/benchmark/parx.py", line 98, in <module>
joinedStream = testRDD.join(trainingRDD)
File "/opt/spark-1.4.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 362, in join
File "/opt/spark-1.4.0-bin-hadoop2.6/python/lib/pyspark.zip/pyspark/streaming/dstream.py", line 313, in transformWith
AttributeError: 'PipelinedRDD' object has no attribute '_jdstream'
I can see the Spark Streaming guide gives an example, but it lacks details:
Stream-dataset joins
This has already been shown earlier while explaining the DStream.transform operation. Here is yet another example of joining a windowed stream with a dataset.
dataset = ... # some RDD
windowedStream = stream.window(20)
joinedStream = windowedStream.transform(lambda rdd: rdd.join(dataset))
Following is my code:
from __future__ import print_function
import sys, os, datetime
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.context import SQLContext
from pyspark.resultiterable import ResultIterable
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
import numpy as np
import statsmodels.api as sm

def splitLine(line, delimiter='|'):
    values = line.split(delimiter)
    st = datetime.datetime.strptime(values[1], '%Y-%m-%d %H:%M:%S')
    return (values[0], st.hour), values[2:]

def reg_m(y, x):
    ones = np.ones(len(x[0]))
    X = sm.add_constant(np.column_stack((x[0], ones)))
    for ele in x[1:]:
        X = sm.add_constant(np.column_stack((ele, X)))
    results = sm.OLS(y, X).fit()
    return results

def train(line):
    y, x = [], []
    y, x = [], [[], [], [], [], [], []]
    reading_tmp, temp_tmp = [], []
    i = 0
    for reading, temperature in line[1]:
        if i % 4 == 0 and len(reading_tmp) == 4:
            y.append(reading_tmp.pop())
            x[0].append(reading_tmp.pop())
            x[1].append(reading_tmp.pop())
            x[2].append(reading_tmp.pop())
            temp = float(temp_tmp[0])
            del temp_tmp[:]
            x[3].append(temp - 20.0 if temp > 20.0 else 0.0)
            x[4].append(16.0 - temp if temp < 16.0 else 0.0)
            x[5].append(5.0 - temp if temp < 5.0 else 0.0)
        reading_tmp.append(float(reading))
        temp_tmp.append(float(temperature))
        i = i + 1
    return str(line[0]), reg_m(y, x).params.tolist()

def detect(line):
    y, x = [], []
    y, x = [], [[], [], [], [], [], []]
    reading_tmp, temp_tmp = [], []
    i = 0
    for reading, temperature in line[1]:
        if i % 4 == 0 and len(reading_tmp) == 4:
            y.append(reading_tmp.pop())
            x[0].append(reading_tmp.pop())
            x[1].append(reading_tmp.pop())
            x[2].append(reading_tmp.pop())
            temp = float(temp_tmp[0])
            del temp_tmp[:]
            x[3].append(temp - 20.0 if temp > 20.0 else 0.0)
            x[4].append(16.0 - temp if temp < 16.0 else 0.0)
            x[5].append(5.0 - temp if temp < 5.0 else 0.0)
        reading_tmp.append(float(reading))
        temp_tmp.append(float(temperature))
        i = i + 1
    return line[0], reg_m(y, x).params.tolist()

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: parx.py <checkpointDir> <trainingDataDir> <streamDataDir>", file=sys.stderr)
        exit(-1)

    checkpoint, trainingInput, streamInput = sys.argv[1:]
    sc = SparkContext("local[2]", appName="BenchmarkSparkStreaming")

    trainingLines = sc.textFile(trainingInput)
    trainingRDD = trainingLines.map(lambda line: splitLine(line, "|")) \
        .groupByKey() \
        .map(lambda line: train(line)).cache()

    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(checkpoint)
    lines = ssc.textFileStream(streamInput).map(lambda line: splitLine(line, "|"))
    testRDD = lines.groupByKeyAndWindow(1, 1).map(lambda line: (str(line[0]), line[1]))

    joinedStream = testRDD.join(trainingRDD)
    joinedStream.pprint(20)

    ssc.start()
    ssc.awaitTermination()
According to the documentation that you referred to, try:
testRDD.transform(lambda rdd: rdd.join(trainingRDD))
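The join in the traceback fails because testRDD is a DStream while trainingRDD is a plain (cached) RDD, and DStream.join expects another DStream. Wrapping the join in transform exposes each batch as an ordinary RDD, so in the code above the failing lines would become roughly (a sketch using the question's own variable names):

# Join each streaming batch against the precomputed, cached model RDD.
joinedStream = testRDD.transform(lambda rdd: rdd.join(trainingRDD))
joinedStream.pprint(20)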
