How to mix two RDDs with Python - apache-spark

I have the following two RDDs; the first one is:
training2 = training.map(lambda x:(x[0],(x[1],x[2])))
training2.collect()
#[(u'1', (u'4298118681424644510', u'7686695')),
# (u'1', (u'4860571499428580850', u'21560664')),
# (u'1', (u'9704320783495875564', u'21748480')),
# (u'1', (u'13677630321509009335', u'3517124')),
and the second one is:
user_id2 = user_id.map(lambda x:(x[0],(x[1],x[2])))
user_id2.collect()
#[(u'1', (u'1', u'5')),
# (u'2', (u'2', u'3')),
# (u'3', (u'1', u'5')),
# (u'4', (u'1', u'3')),
# (u'5', (u'2', u'1')),
In both RDDs the parameter u'1', u'2', ... indicates the user id, so I need to mix both RDDs by key; for every key, the output must combine the values into something like this:
u'1', (u'1', u'5', u'4298118681424644510', u'7686695')

How about unioning the two RDDs and then using aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None)?
You can also use reduceByKey or groupByKey.
For example:
zero_value = set()

def seq_op(x, y):
    x.add(y)
    return x

def comb_op(x, y):
    return x.union(y)

numbers = sc.parallelize([0, 0, 1, 2, 5, 4, 5, 5, 5]).map(lambda x: ["Even" if (x % 2 == 0) else "Odd", x])
numbers.collect()
numbers.aggregateByKey(zero_value, seq_op, comb_op).collect()
# result looks like [("Even", {0, 2, 4}), ...]
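Applied back to the question's data, a minimal sketch (assuming training2 and user_id2 are the pair RDDs shown above) could either union them and group the values per key, or simply join them by key to get the flat combination the question asks for:
combined = training2.union(user_id2).groupByKey().mapValues(list)
# e.g. (u'1', [(u'4298118681424644510', u'7686695'), ..., (u'1', u'5')])

# A plain join by key yields one record per matching pair instead:
joined = user_id2.join(training2).mapValues(lambda v: v[0] + v[1])
# e.g. (u'1', (u'1', u'5', u'4298118681424644510', u'7686695'))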

Related

How to get the corresponding element in a sub-list based on an input element from the same list (Python)

I have a list like this:
list=[[['name1','name2'],[n1,n2]], [['name3','name4'],[n3,n4]]]
I want to get n1 if the input is name1;
similarly, if the input is name3 then the output should be n3.
Note: name1 is of type str, n1 is of type int.
Is there any way to do this? Please suggest a solution or the steps I can follow to solve this issue.
I suggest building an intermediate lookup dict from my_list, then looking up as you like:
my_list = [
    [['name1', 'name2'], ['n1', 'n2']],
    [['name3', 'name4'], ['n3', 'n4']]
]
lookup = {}
for double_tuple in my_list:
    lhs, rhs = double_tuple
    zipped = zip(lhs, rhs)  # ['name1','name2'], ['n1','n2'] → [('name1', 'n1'), ('name2', 'n2')]
    lookup.update(dict(zipped))
print(lookup['name1'])  # → 'n1'
It can be easily solved with a list comprehension:
unpack the elements in the list,
filter for k1 == input,
get the first result, if it exists.
input_ = "name1"
n1, n2, n3, n4 = 1, 2, 3, 4  # example integer values, since the question says n1 is an int
list_ = [[['name1', 'name2'], [n1, n2]], [['name3', 'name4'], [n3, n4]]]
candidates = [v1
              for (k1, _), (v1, _) in list_
              if k1 == input_]
if len(candidates) == 0:
    print("No such key: " + input_)
else:
    print("Value is " + str(candidates[0]))
Note: I used trailing underscores in the names to avoid shadowing the built-ins list and input. Shadowing built-ins is bad practice.
You can use filter combined with next:
def get_item_from_key(input_list, key):
    """Return item corresponding to a specific key"""
    try:
        return next(filter(lambda x: x[0][0] == key, input_list))[1][0]
    except StopIteration:
        return None
So, if the input list is a = [[['name1', 'name2'], [0, 1]], [['name3', 'name4'], [2, 3]]], you can ask for any key you are interested in:
get_item_from_key(a, 'name1') # this will return 0
get_item_from_key(a, 'name3') # this will return 2
get_item_from_key(a, 'name2') # this will return None
get_item_from_key(a, 'name5') # this will return None

Python 3, remove duplicates of tuples found in other lists of lists

Hi, I'm looking for a way to remove duplicates of tuples from one list when compared to tuples in other lists and/or lists of lists of tuples.
Example :
possible_cell = [(7,7),(3,3),(4,4),(5,5)]
wrong_path = [(1,1),(1,2),[(3,3)]]
current_path = [(4,4),[(5,5)]]
this_path = [(6,6)]
wanted :
new_possible_cell = [2-tuple for 2-tuple in possible_cell if 2-tuple not in wrong_path and 2-tuple not in current_path etc....]
expected return :
new_possible_cell = [(7,7)]
You're close, just flatten the list first.
possible_cell = [(7,7),(3,3),(4,4),(5,5)]
wrong_path = [(1,1),(1,2),[(3,3)]]
current_path = [(4,4),[(5,5)]]
this_path = [(6,6)]
def flatten(L):
    for x in L:
        if type(x) == list:
            yield from flatten(x)
        else:
            yield x

new_possible_cell = [x for x in possible_cell if x not in flatten(wrong_path + current_path + this_path)]
print(new_possible_cell)
output:
[(7, 7)]
If your lists are large, use set(flatten(...)) for better speed.
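For instance, a minimal sketch of that variant (assuming the same lists and the flatten generator above) builds the exclusion set once instead of re-flattening the lists for every candidate:
excluded = set(flatten(wrong_path + current_path + this_path))
new_possible_cell = [x for x in possible_cell if x not in excluded]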

How to properly do a full outer join of two RDDs with PySpark?

I'm looking for a way to combine two RDDs by key.
Given:
x = sc.parallelize([('_guid_YWKnKkcrg_Ej0icb07bhd-mXPjw-FcPi764RRhVrOxE=', 'FR', '75001'),
                    ('_guid_XblBPCaB8qx9SK3D4HuAZwO-1cuBPc1GgfgNUC2PYm4=', 'TN', '8160'),
                    ])
y = sc.parallelize([('_guid_oX6Lu2xxHtA_T93sK6igyW5RaHH1tAsWcF0RpNx_kUQ=', 'JmJCFu3N'),
                    ('_guid_hG88Yt5EUsqT8a06Cy380ga3XHPwaFylNyuvvqDslCw=', 'KNPQLQth'),
                    ('_guid_YWKnKkcrg_Ej0icb07bhd-mXPjw-FcPi764RRhVrOxE=', 'KlGZj08d'),
                    ])
I found a solution! Nevertheless, this solution is not entirely satisfactory for what I want to do.
I created a function in order to specify my key, which will be applied to my RDD named "x":
def get_keys(rdd):
    new_x = rdd.map(lambda item: (item[0], (item[1], item[2])))
    return new_x

new_x = get_keys(x)
which gives :
[('_guid_YWKnKkcrg_Ej0icb07bhd-mXPjw-FcPi764RRhVrOxE=', ('FR', '75001')),
('_guid_XblBPCaB8qx9SK3D4HuAZwO-1cuBPc1GgfgNUC2PYm4=', ('TN', '8160'))]
Then :
new_x.union(y).map(lambda (x, y): (x, [y])).reduceByKey(lambda p, q : p + q).collect()
The result :
[('_guid_oX6Lu2xxHtA_T93sK6igyW5RaHH1tAsWcF0RpNx_kUQ=', ['JmJCFu3N']),
('_guid_YWKnKkcrg_Ej0icb07bhd-mXPjw-FcPi764RRhVrOxE=', [('FR', '75001'), 'KlGZj08d']),
('_guid_XblBPCaB8qx9SK3D4HuAZwO-1cuBPc1GgfgNUC2PYm4=', [('TN', '8160')]),
('_guid_hG88Yt5EUsqT8a06Cy380ga3XHPwaFylNyuvvqDslCw=', ['KNPQLQth'])]
What I want to have is :
[('_guid_oX6Lu2xxHtA_T93sK6igyW5RaHH1tAsWcF0RpNx_kUQ=', (None, None, 'JmJCFu3N')),
('_guid_YWKnKkcrg_Ej0icb07bhd-mXPjw-FcPi764RRhVrOxE=', ('FR', '75001', 'KlGZj08d')),
('_guid_XblBPCaB8qx9SK3D4HuAZwO-1cuBPc1GgfgNUC2PYm4=', ('TN', '8160', None)),
('_guid_hG88Yt5EUsqT8a06Cy380ga3XHPwaFylNyuvvqDslCw=', (None, None, 'KNPQLQth'))]
Help ?
Why not?
>>> new_x.fullOuterJoin(y)
or
>>> x.toDF().join(y.toDF(), ["_1"], "fullouter").rdd
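If you want the flat tuples shown in the expected output, a minimal sketch (assuming new_x and y as defined above) could post-process the fullOuterJoin result, padding missing sides with None:
def flatten_pair(pair):
    left, right = pair  # left is a ('FR', '75001')-style tuple or None, right is the code or None
    country, zipcode = left if left is not None else (None, None)
    return (country, zipcode, right)

result = new_x.fullOuterJoin(y).mapValues(flatten_pair)
# e.g. ('_guid_YWKnKkcrg_...', ('FR', '75001', 'KlGZj08d'))
#      ('_guid_oX6Lu2xxHtA_...', (None, None, 'JmJCFu3N'))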

Spark - Sort DStream by Key and limit to 5 values

I've started to learn Spark and I wrote a PySpark streaming program to read stock data (symbol, volume) from port 3333.
Sample data streamed at 3333
"AAC",111113
"ABT",7451020
"ABBV",7325429
"ADPT",318617
"AET",1839122
"ALR",372777
"AGN",4170581
"ABC",3001798
"ANTM",1968246
I want to display the top 5 symbols based on volume. So I used a mapper to read each line, split it by comma, and reverse it.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 5)
lines = ssc.socketTextStream("localhost", 3333)
stocks = lines.map(lambda line: sorted(line.split(','), reverse=True))
stocks.pprint()
Following is the output of stocks.pprint()
[u'111113', u'"AAC"']
[u'7451020', u'"ABT"']
[u'7325429', u'"ABBV"']
[u'318617', u'"ADPT"']
[u'1839122', u'"AET"']
[u'372777', u'"ALR"']
[u'4170581', u'"AGN"']
[u'3001798', u'"ABC"']
[u'1968246', u'"ANTM"']
I've got the following function in mind to display the stock symbols, but I'm not sure how to sort the stocks by key (volume) and then limit the function to display only the first 5 values.
stocks.foreachRDD(processStocks)
def processStocks(stock):
    for st in stock.collect():
        print st[1]
Since a stream represents an infinite sequence, all you can do is sort each batch. First, you'll have to correctly parse the data:
lines = ssc.queueStream([sc.parallelize([
    "AAC,111113", "ABT,7451020", "ABBV,7325429", "ADPT,318617",
    "AET,1839122", "ALR,372777", "AGN,4170581", "ABC,3001798",
    "ANTM,1968246"
])])

def parse(line):
    try:
        k, v = line.split(",")
        yield (k, int(v))
    except ValueError:
        pass

parsed = lines.flatMap(parse)
Next, sort each batch:
sorted_ = parsed.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))
Finally, you can pprint top elements:
sorted_.pprint(5)
If all went well you should get output like below:
-------------------------------------------
Time: 2016-10-02 14:52:30
-------------------------------------------
('ABT', 7451020)
('ABBV', 7325429)
('AGN', 4170581)
('ABC', 3001798)
('ANTM', 1968246)
...
Depending on the size of a batch, a full sort can be prohibitively expensive. In that case you can take the top elements and parallelize:
sorted_ = parsed.transform(lambda rdd: rdd.ctx.parallelize(rdd.top(5)))
or even reduceByKey:
from operator import itemgetter
import heapq

key = itemgetter(1)

def create_combiner(key=lambda x: x):
    def _(x):
        return [(key(x), x)]
    return _

def merge_value(n=5, key=lambda x: x):
    def _(acc, x):
        heapq.heappush(acc, (key(x), x))
        return heapq.nlargest(n, acc) if len(acc) > n else acc
    return _

def merge_combiners(n=5):
    def _(acc1, acc2):
        merged = list(heapq.merge(acc1, acc2))
        return heapq.nlargest(n, merged) if len(merged) > n else merged
    return _

(parsed
    .map(lambda x: (None, x))
    .combineByKey(
        create_combiner(key=key), merge_value(key=key), merge_combiners())
    .flatMap(lambda x: x[1]))
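As a small follow-up sketch, under the assumption that the pipeline above is kept as-is: the accumulator stores (volume, record) pairs, so one more map recovers the plain (symbol, volume) records before printing:
top5 = (parsed
    .map(lambda x: (None, x))
    .combineByKey(
        create_combiner(key=key), merge_value(key=key), merge_combiners())
    .flatMap(lambda x: x[1])
    .map(lambda kv: kv[1]))  # drop the heap key added by the combiner
top5.pprint()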

PySpark map not working

I am new to Apache Spark and have implemented a simple map function as follows:
from pyspark import SparkContext
sc = SparkContext( 'local', 'pyspark')
f = open("Tweets_tokenised.txt")
tokenised_tweets = f.readlines()
f = open("positive.txt")
pos_words=f.readlines()
f = open("negative.txt")
neg_words=f.readlines()
def sentiment(line):
    global pos_words
    global neg_words
    pos = 0
    neg = 0
    for word in line.split():
        if word in pos_words:
            pos = pos + 1
        if word in neg_words:
            neg = neg + 1
    if pos > neg:
        return 1
    else:
        return 0
dist_tweets=sc.textFile("Tweets_tokenised.txt").map(sentiment)
#(lambda line: sentiment(line))
dist_tweets.saveAsTextFile("RDD.txt")
Basically I am reading a file (containing tokenised and stemmed tweets) and then doing a simple positive-negative word count on it within the map function (3rd line from the end). But RDD.txt has nothing in it; the function sentiment is not being called at all.
Can someone point out the error?
You can't change the value of a global variable inside a map transformation in Apache Spark; to achieve this you would need an Accumulator. However, even using one, I think that is not the correct approach.
In your case, if your pos_words and neg_words are not too big, you could define them as broadcast lists and then count by sentiment.
Something like:
pos = sc.broadcast(["good", "gold", "silver"])
neg = sc.broadcast(["evil", "currency", "fiat"])

# I will suppose that every record is a different tweet and are stored in tuples.
tweets = sc.parallelize([("banking", "is", "evil"), ("gold", "is", "good")])

(tweets
    .flatMap(lambda x: x)
    .map(lambda x: (1 if x in pos.value else -1 if x in neg.value else 0, 1))
    .reduceByKey(lambda a, b: a + b).take(3))
# notice that I count neutral words.
# output -> [(0, 3), (1, 2), (-1, 1)]
PS: If your idea was to count the positive and negative words per message, the approach varies only slightly.
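For that per-message variant, a minimal sketch (assuming the same pos/neg broadcast lists and the tweets RDD above) scores each tweet individually instead of pooling the counts:
def score(word):
    # hypothetical helper: +1 for positive, -1 for negative, 0 for neutral words
    return 1 if word in pos.value else -1 if word in neg.value else 0

per_tweet = tweets.map(lambda tweet: (tweet, sum(score(w) for w in tweet)))
per_tweet.collect()
# e.g. [(('banking', 'is', 'evil'), -1), (('gold', 'is', 'good'), 2)]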
