Group and merge RDD pair keys and values - apache-spark

RDD_Input = [(('377', '80'), ('1', '4')), (('377', '510'), ('1', '5')), (('377', '79'), ('1', '4')), (('377', '791'), ('1', '1')), (('377', '511'), ('1', '4')), (('377', '433'), ('1', '3')), (('377', '687'), ('1', '1')), (('377', '456'), ('1', '1')), (('377', '399'), ('1', '4')), (('377', '96'), ('1', '5')), (('377', '780'), ('1', '1')), (('377', '683'), ('1', '1')), (('377', '403'), ('1', '5')), (('377', '999'), ('1', '4')), (('377', '502'), ('1', '4')), (('377', '435'), ('1', '5')), (('377', '550'), ('1', '5')), (('377', '948'), ('1', '1')), (('377', '393'), ('1', '4')), (('377', '648'), ('1', '4'))]
The input RDD is in key-value pairs ((movie1, movie2), (rating1, rating2)).
How do I transform the RDD into ((movie1, movie2), (rating1, rating2), (rating3, rating4), (rating5, rating6), ...)?
Expected result example: (('377', '399'), ('1', '4'), ('1', '4'))
('377', '399') is the key, and every value tuple that shares that key is appended after it.
The requirement is to use purely RDD API.

It seems you want to first groupByKey, collecting the values for each key into a list, and then map each (key, values) pair into a single flat tuple.
# Sample (key, value) pairs; the ('a', 'b') key occurs twice.
data = [
    (('a', 'b'), ('1', '4')),
    (('a', 'b'), ('3', '5')),
    (('c', 'd'), ('2', '2')),
]
# `sc` is the SparkContext, which this snippet expects to exist already.
grouped = sc.parallelize(data).groupByKey().mapValues(list)
# Flatten each (key, [value, ...]) pair into (key, value, ...).
rdd = grouped.map(lambda pair: (pair[0],) + tuple(pair[1]))
print(rdd.collect())
# [(('c', 'd'), ('2', '2')), (('a', 'b'), ('1', '4'), ('3', '5'))]

Related

GraphFrames: Merge edge nodes with similar column values

tl;dr: How do you simplify a graph, removing edge nodes with identical name values?
I have a graph defined as follows:
import graphframes
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
vertices = spark.createDataFrame([
('1', 'foo', '1'),
('2', 'bar', '2'),
('3', 'bar', '3'),
('4', 'bar', '5'),
('5', 'baz', '9'),
('6', 'blah', '1'),
('7', 'blah', '2'),
('8', 'blah', '3')
], ['id', 'name', 'value'])
edges = spark.createDataFrame([
('1', '2'),
('1', '3'),
('1', '4'),
('1', '5'),
('5', '6'),
('5', '7'),
('5', '8')
], ['src', 'dst'])
f = graphframes.GraphFrame(vertices, edges)
Which produces a graph that looks like this (where the numbers represent the vertex ID):
Starting from vertex ID equal to 1, I'd like to simplify the graph such that nodes with identical name values are coalesced into a single node. A resulting graph would look something
like this:
Notice how we only have one foo (ID 1), one bar (ID 2), one baz (ID 5) and one blah (ID 6). The value of the vertex is irrelevant, and just to show that each vertex is unique.
I attempted to implement a solution, however it is hacky, extremely inefficient and I'm certain there is a better way (I also don't think it works):
f = graphframes.GraphFrame(vertices, edges)
# Get the out degrees for our nodes. Nodes that do not appear in
# this dataframe have zero out degrees.
outs = f.outDegrees
# Merge this with our nodes.
vertices = f.vertices
vertices = f.vertices.join(outs, outs.id == vertices.id, 'left').select(vertices.id, 'name', 'value', 'outDegree')
vertices.show()
# Create a new graph with our out degree nodes.
f = graphframes.GraphFrame(vertices, edges)
# Find paths to all edge vertices from our vertex ID = 1
# Can we make this one operation instead of two??? What if we have more than two hops?
one_hop = f.find('(a)-[e]->(b)').filter('b.outDegree is null').filter('a.id == "1"')
one_hop.show()
two_hop = f.find('(a)-[e1]->(b); (b)-[e2]->(c)').filter('c.outDegree is null').filter('a.id == "1"')
two_hop.show()
# Super ugly, but union the vertices from the `one_hop` and `two_hop` above, and unique
# on the name.
vertices = one_hop.select('a.*').union(one_hop.select('b.*'))
vertices = vertices.union(two_hop.select('a.*').union(two_hop.select('b.*').union(two_hop.select('c.*'))))
vertices = vertices.dropDuplicates(['name'])
vertices.show()
# Do the same for the edges
edges = two_hop.select('e1.*').union(two_hop.select('e2.*')).union(one_hop.select('e.*')).distinct()
# We need to ensure that we have the respective nodes from our edges. We do this by
# Ensuring the referenced vertex ID is in our `vertices` in both the `src` and the `dst`
# columns - This does NOT seem to work as I'd expect!
edges = edges.join(vertices, vertices.id == edges.src, "left").select("src", "dst")
edges = edges.join(vertices, vertices.id == edges.dst, "left").select("src", "dst")
edges.show()
Is there an easier way to remove nodes (and their corresponding edges) so that edge nodes are uniqued on their name?
Why don't you simply treat the name column as new id?
import graphframes
# Vertex table with columns (id, name, value); several ids share the same name.
# NOTE(review): `spark` is not created in this snippet — presumably the
# SparkSession built in the question; confirm before running standalone.
vertices = spark.createDataFrame([
('1', 'foo', '1'),
('2', 'bar', '2'),
('3', 'bar', '3'),
('4', 'bar', '5'),
('5', 'baz', '9'),
('6', 'blah', '1'),
('7', 'blah', '2'),
('8', 'blah', '3')
], ['id', 'name', 'value'])
# Edge table with columns (src, dst), both holding vertex ids.
edges = spark.createDataFrame([
('1', '2'),
('1', '3'),
('1', '4'),
('1', '5'),
('5', '6'),
('5', '7'),
('5', '8')
], ['src', 'dst'])
# New vertex set: a single `id` column holding each distinct name once.
new_vertices = vertices.select(vertices.name.alias('id')).distinct()
# Replace each edge's src id with the name of the vertex it refers to.
# The left join keeps every edge even if its src has no matching vertex.
new_edges = edges.join(vertices, edges.src == vertices.id, 'left')
new_edges = new_edges.select(new_edges.dst, new_edges.name.alias('src'))
# Replace each edge's dst id with the corresponding vertex name in the same way.
new_edges = new_edges.join(vertices, new_edges.dst == vertices.id, 'left')
new_edges = new_edges.select(new_edges.src, new_edges.name.alias('dst'))
# Edges that now connect the same pair of names are duplicates; keep one of each.
new_edges = new_edges.dropDuplicates(['src', 'dst'])
new_edges.show()
new_vertices.show()
# Build the simplified graph, keyed by name instead of the original ids.
f = graphframes.GraphFrame(new_vertices, new_edges)
Output:
+---+----+
|src| dst|
+---+----+
|foo| baz|
|foo| bar|
|baz|blah|
+---+----+
+----+
| id|
+----+
|blah|
| bar|
| foo|
| baz|
+----+

Remove sequential duplicate word separated by delimiter

I am trying to remove sequential duplicates, separated by the delimiter '>', from the journey column and also aggregate the values in the uu and convs columns.
INPUT
a=[['journey', 'uu', 'convs'],
['Ct', '10', '2'],
['Ct>Ct', '100', '3'],
['Ct>Pt>Ct', '200', '10'],
['Ct>Pt>Ct>Ct', '40', '5'],
['Ct>Pt>Bu', '1000', '8']]
OUTPUT
a=[['journey', 'uu', 'convs'],
['Ct', '110', '5'],
['Ct>Pt>Ct', '240', '15'],
['Ct>Pt>Bu', '1000', '8']]
I tried below to split but it didn't work
a='>'.join(set(a.split()))
You need to split your string on > and then you can use itertools.groupby to eliminate consecutive duplicate items in your string. For example:
from itertools import groupby

x = ['Ct>Pt>Ct>Ct', '40', '5']
# groupby() without a key yields one group per run of equal adjacent items,
# so keeping only the group keys collapses consecutive repeats.
print(">".join([i for i, _ in groupby(x[0].split(">"))]))
# Ct>Pt>Ct
You could use this as a lambda function in another groupby to aggregate the lists. Then sum each element of the same index by using zip. Check it out:
from itertools import groupby

a = [['journey', 'uu', 'convs'],
     ['Ct', '10', '2'],
     ['Ct>Ct', '100', '3'],
     ['Ct>Pt>Ct', '200', '10'],
     ['Ct>Pt>Ct>Ct', '40', '5'],
     ['Ct>Pt>Bu', '1000', '8']]

result = [a[0]]  # Add header
# Group the data rows by their journey with consecutive repeats collapsed.
# groupby() only merges adjacent rows, so rows that collapse to the same
# journey must already sit next to each other (as they do in `a`).
groups = groupby(
    a[1:],
    key=lambda x: ">".join([i for i, _ in groupby(x[0].split(">"))])
)
# groups:
# ['Ct', [['Ct', '10', '2'], ['Ct>Ct', '100', '3']]]
# ['Ct>Pt>Ct', [['Ct>Pt>Ct', '200', '10'], ['Ct>Pt>Ct>Ct', '40', '5']]]
# ['Ct>Pt>Bu', [['Ct>Pt>Bu', '1000', '8']]]
for key, items in groups:
    row = [key]
    # zip(*items) transposes the group's rows into columns.
    for i in zip(*items):
        # Only numeric columns are summed; the journey column is skipped.
        if i[0].isdigit():
            row.append(str(sum(map(int, i))))
    result.append(row)
print(result)
Prints:
[['journey', 'uu', 'convs'],
['Ct', '110', '5'],
['Ct>Pt>Ct', '240', '15'],
['Ct>Pt>Bu', '1000', '8']]

How to normalize the distribution in the tuples?

I am trying to do some normalization in my code, and I have a list of inner lists:
# Each inner list holds (label, weight) tuples.
a = [[('1', 0.03),
      ('2', 0.03),
      ('3', 0.06)],
     [('4', 0.03),
      ('5', 0.06),
      ('6', 0.06)],
     [('7', 0.07),
      ('8', 0.14),
      ('9', 0.07)],
     ]
I want to normalize the weights in each inner list so that they sum to 1, giving list b:
# Expected result: within each inner list the weights sum to 1.
b = [[('1', 0.25),
      ('2', 0.25),
      ('3', 0.50)],
     [('4', 0.20),
      ('5', 0.40),
      ('6', 0.40)],
     [('7', 0.25),
      ('8', 0.50),
      ('9', 0.25)],
     ]
And I tried:
for i in a:
for n, (ee,ww) in enumerate(i):
i[n] = (ee,ww/sum(ww))
But it failed.
How to get b in python?
from pprint import pprint

# Each inner list holds (label, weight) tuples.
a = [[('1', 0.03),
      ('2', 0.03),
      ('3', 0.06)],
     [('4', 0.03),
      ('5', 0.06),
      ('6', 0.06)],
     [('7', 0.07),
      ('8', 0.14),
      ('9', 0.07)]
     ]

for i in a:
    # Total weight of this inner list; every weight is divided by it.
    s = sum(v[1] for v in i)
    # Slice assignment rewrites the inner list in place, so `a` itself is updated.
    i[:] = [(v[0], v[1] / s) for v in i]

pprint(a)
Prints:
[[('1', 0.25), ('2', 0.25), ('3', 0.5)],
[('4', 0.2), ('5', 0.4), ('6', 0.4)],
[('7', 0.25), ('8', 0.5), ('9', 0.25)]]
Note:
i[:] = [(v[0], v[1] / s) for v in i] replaces all values in list i with new values from the list comprehension.

How to fetch data from ImmutableMultiDict in python3

Hi, I am very new to Python programming. Currently I am developing a small application using Flask for users to submit data. The form is generated dynamically. I am not using WTForms.
Here is the HTML part which is generating the form
{% if message %}
{% for data in message %}
<tr>
<th><input type="text" name="matchnumber" value="{{data.get('matchno')}}" readonly></th>
<th>{{data.get('team1')}}</th>
<th>vs</th>
<th>{{data.get('team2')}}</th>
<th><select name="winner">
<option value="{{data.get('team1')}}">{{data.get('team1')}}</option>
<option value="{{data.get('team2')}}">{{data.get('team2')}}</option>
</select></th>
<th><input type="number" placeholder="Point" name="point" value="{{request.form.point }}" required min="500" max="1000"></th>
</tr>
{% endfor %}
{% endif %}
<table>
the form is getting generated as expected. It will have minimum of 8 columns, it can also be more based on the response it receives from the data base.
But the real problem is arising when I am trying to receive the form data.The data I receive from the form will again be saved in the database
This is the python part from where I am trying to handle the response
result=request.form
result2=result.to_dict(flat=False)
for key,value in result2.items():
print(key,value)
The problem is that the first line gives an ImmutableMultiDict, which I am not able to iterate over as I need, and even after converting it to a regular dictionary I am not able to extract each field properly.
the result variable is having the output as
ImmutableMultiDict([('matchnumber', '1'), ('matchnumber', '2'), ('matchnumber', '3'), ('matchnumber', '4'), ('matchnumber', '5'), ('matchnumber', '6'), ('matchnumber', '7'), ('matchnumber', '8'), ('winner', 'Russia'), ('winner', 'Egypt'), ('winner', 'Morocco'), ('winner', 'Portugal'), ('winner', 'France'), ('winner', 'Argentina'), ('winner', 'Peru'), ('winner', 'Croatia'), ('point', '800'), ('point', '800'), ('point', '800'), ('point', '800'), ('point', '800'), ('point', '800'), ('point', '800'), ('point', '800')])
after converting it into a dictionary the variable result2 has this as output
{'matchnumber': ['1', '2', '3', '4', '5', '6', '7', '8'], 'winner': ['Russia', 'Egypt', 'Morocco', 'Portugal', 'France', 'Argentina', 'Peru', 'Croatia'], 'point': ['800', '800', '800', '800', '800', '800', '800', '800']}
when i am iterating over the result2 items this is the output that is getting generated
matchnumber ['1', '2', '3', '4', '5', '6', '7', '8']
winner ['Russia', 'Egypt', 'Morocco', 'Portugal', 'France', 'Argentina', 'Peru', 'Croatia']
point ['800', '800', '800', '800', '800', '800', '800', '800']
The exact issue I am facing is that I am not able to iterate over the lists simultaneously. I tried googling for how to fetch the exact data from an ImmutableMultiDict, but I didn't find a good solution.
Can anyone please suggest a better way to get the data out of the form? The entire form is going to be dynamic: it will change based on the date, since the match list keeps updating. I am not using any ORM; I am using only raw SQL for processing the data.
Thanks in advance
Assuming you want the data in this format:
[{'matchnumber': '1', 'winner': 'Russia', 'point': '800'}, {'matchnumber': '2', 'winner': 'Egypt', 'point': '800'}, ...]
You can try
# `result2` maps each field name to the list of values submitted for it,
# e.g. {'matchnumber': ['1', '2', ...], 'winner': [...], 'point': [...]}.
output = []
# One form row per 'matchnumber' entry; row i takes the i-th value of every field.
for i in range(len(result2['matchnumber'])):
    di = {}
    for key in result2:
        di[key] = result2[key][i]
    output.append(di)
print(output)
[{'matchnumber': '1', 'winner': 'Russia', 'point': '800'}, {'matchnumber': '2', 'winner': 'Egypt', 'point': '800'}, {'matchnumber': '3', 'winner': 'Morocco', 'point': '800'}, {'matchnumber': '4', 'winner': 'Portugal', 'point': '800'}, {'matchnumber': '5', 'winner': 'France', 'point': '800'}, {'matchnumber': '6', 'winner': 'Argentina', 'point': '800'}, {'matchnumber': '7', 'winner': 'Peru', 'point': '800'}, {'matchnumber': '8', 'winner': 'Croatia', 'point': '800'}]
Then you can iterate over output and process each dictionary
Unfortunately I can't comment yet, but I wrote the solution by Shivam Singh a bit more compact:
# flat=False keeps every submitted value per field: {'matchnumber': ['1', '2', ...], ...}
result2 = request.form.to_dict(flat=False)
# Build one dict per form row by taking the i-th value of every field.
# The comprehension must read `result2` (the dict of lists), not the raw form.
datamap = [{key: value[i] for key, value in result2.items()} for i in range(len(result2['matchnumber']))]

How to make list of tuple from list of list?

How do I convert this list of lists:
[['0', '1'], ['0', '2'], ['0', '3'], ['1', '4'], ['1', '6'], ['1', '7'], ['1', '9'], ['2', '3'], ['2', '6'], ['2', '8'], ['2', '9']]
To this list of tuples:
[(0, [1, 2, 3]), (1, [0, 4, 6, 7, 9]), (2, [0, 3, 6, 8, 9])]
I am unsure how to implement this next step? (I can't use dictionaries,
sets, deque, bisect module. You can though, and in fact should, use .sort or sorted functions.)
Here is my attempt:
network= [['10'], ['0 1'], ['0 2'], ['0 3'], ['1 4'], ['1 6'], ['1 7'], ['1 9'], ['2 3'], ['2 6'], ['2 8'], ['2 9']]
network.remove(network[0])
friends=[]
for i in range(len(network)):
element= (network[i][0]).split(' ')
friends.append(element)
t=len(friends)
s= len(friends[0])
lst=[]
for i in range(t):
a= (friends[i][0])
if a not in lst:
lst.append(int(a))
for i in range(t):
if a == friends[i][0]:
b=(friends[i][1])
lst.append([b])
print(tuple(lst))
It outputs:
(0, ['1'], ['2'], ['3'], 0, ['1'], ['2'], ['3'], 0, ['1'], ['2'], ['3'], 1, ['4'], ['6'], ['7'], ['9'], 1, ['4'], ['6'], ['7'], ['9'], 1, ['4'], ['6'], ['7'], ['9'], 1, ['4'], ['6'], ['7'], ['9'], 2, ['3'], ['6'], ['8'], ['9'], 2, ['3'], ['6'], ['8'], ['9'], 2, ['3'], ['6'], ['8'], ['9'], 2, ['3'], ['6'], ['8'], ['9'])
I am very close it seems, not sure what to do??
A simpler method:
l = [['0', '1'], ['0', '2'], ['0', '3'], ['1', '4'], ['1', '6'], ['1', '7'], ['1', '9'], ['2', '3'], ['2', '6'], ['2', '8'], ['2', '9']]
# Sort the distinct keys numerically: a set has no guaranteed iteration order,
# so building `b` straight from it could put the groups in the wrong positions.
a = sorted({i[0] for i in l}, key=int)
b = [(i, []) for i in a]
# Find each pair's group by its key rather than assuming key N sits at index N.
for i in l:
    b[a.index(i[0])][1].append(i[1])
print(b)
Output:
[('0', ['1', '2', '3']), ('1', ['4', '6', '7', '9']), ('2', ['3', '6', '8', '9'])]
Alternate Answer (without using set)
l = [['0', '1'], ['0', '2'], ['0', '3'], ['1', '4'], ['1', '6'], ['1', '7'], ['1', '9'], ['2', '3'], ['2', '6'], ['2', '8'], ['2', '9']]
# Collect the distinct keys in first-seen order without using a set.
a = []
for i in l:
    if i[0] not in a:
        a.append(i[0])
b = [(i, []) for i in a]
# Find each pair's group by its key's position in `a`, so the keys need not
# be the consecutive integers 0..n-1.
for i in l:
    b[a.index(i[0])][1].append(i[1])
print(b)
also outputs
[('0', ['1', '2', '3']), ('1', ['4', '6', '7', '9']), ('2', ['3', '6', '8', '9'])]
You can use Pandas:
import pandas as pd

l = [['0', '1'], ['0', '2'], ['0', '3'], ['1', '4'], ['1', '6'], ['1', '7'], ['1', '9'], ['2', '3'], ['2', '6'], ['2', '8'], ['2', '9']]
# The np.int alias was removed from NumPy, so convert the string cells to
# integers with astype(int) instead of passing dtype=np.int.
df = pd.DataFrame(l).astype(int)
# Group on the first column and collect the second column's values per key.
s = df.groupby(0)[1].apply(list)
result = list(zip(s.index, s))
print(result)
Output:
[(0, [1, 2, 3]), (1, [4, 6, 7, 9]), (2, [3, 6, 8, 9])]

Resources