Break one DF row into multiple rows in another DF - apache-spark

I am looking to convert one DataFrame into another; the difference is that 1 row in DF1 may become 3 rows in DF2.
Example DF1:
cust_id | email_id_1  | email_id_2  | email_id_3  |
1       | one_1#m.com | one_2#m.com | one_3#m.com |
Then DF2 will be like:
cust_id | email_id    |
1       | one_1#m.com |
1       | one_2#m.com |
1       | one_3#m.com |
I have written the code below, which gives me the error AttributeError: 'str' object has no attribute 'cast':
# Create a schema for the dataframe
from pyspark.sql.types import StructType, StructField, LongType, StringType

dfSchema = StructType([
    StructField('CUST_ID', LongType()),
    StructField('EMAIL_ADDRESS', StringType())
])

dfData = []
for row in initialCustEmailDetailsDF.rdd.collect():
    if row["email_address_1"] != "":
        temp1 = [row["cust_id"].cast(LongType()), row["email_address_1"]]
        # error: AttributeError: 'str' object has no attribute 'cast'
        dfData.append(temp1)
    if row["email_address_2"] != "":
        temp2 = [row["cust_id"].cast(LongType()), row["email_address_2"]]
        dfData.append(temp2)
    if row["email_address_3"] != "":
        temp3 = [row["cust_id"].cast(LongType()), row["email_address_3"]]
        dfData.append(temp3)

# Convert list to RDD
rdd = spark.sparkContext.parallelize(dfData)
# Create data frame
df = spark.createDataFrame(rdd, dfSchema)
df.show()
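For context: cast is a method on a pyspark.sql Column, not on the plain Python values that rdd.collect() returns, which is why the string raises that AttributeError. A minimal fix of the collect-based approach (a sketch, keeping the original names) is to convert with plain Python instead:

dfData = []
for row in initialCustEmailDetailsDF.rdd.collect():
    for col_name in ["email_address_1", "email_address_2", "email_address_3"]:
        if row[col_name] != "":
            # int() instead of .cast(); the schema's LongType already sets the column type
            dfData.append([int(row["cust_id"]), row[col_name]])

df = spark.createDataFrame(dfData, dfSchema)
df.show()

That said, the explode_outer answer below avoids collecting to the driver altogether.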

You may be looking for explode_outer:
df.show()
+-------+-----------+-----------+-----------+
|cust_id| email_id_1| email_id_2| email_id_3|
+-------+-----------+-----------+-----------+
| 1|one_1#m.com|one_2#m.com| null|
| 2|one_1#m.com| null|one_3#m.com|
| 3|one_1#m.com|one_2#m.com|one_3#m.com|
+-------+-----------+-----------+-----------+
import pyspark.sql.functions as F

df2 = df.select(
    'cust_id',
    F.explode_outer(
        F.array('email_id_1', 'email_id_2', 'email_id_3')
    ).alias('email_id')
)
df2.show()
+-------+-----------+
|cust_id| email_id|
+-------+-----------+
| 1|one_1#m.com|
| 1|one_2#m.com|
| 1| null|
| 2|one_1#m.com|
| 2| null|
| 2|one_3#m.com|
| 3|one_1#m.com|
| 3|one_2#m.com|
| 3|one_3#m.com|
+-------+-----------+
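If you also want to drop the missing addresses (the original loop skipped empty ones), a follow-up filter does it (a minimal sketch; adjust the empty-string check to however blanks are stored in your data):

df2_nonnull = df2.where(F.col('email_id').isNotNull() & (F.col('email_id') != ''))
df2_nonnull.show()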

Related

Merge two columns in a single DataFrame and count the occurrences using PySpark

I have two columns in my DataFrame, name1 and name2.
I want to combine them and count the occurrences (without null values!).
df = spark.createDataFrame([
    ["Luc Krier","Jeanny Thorn"],
    ["Jeanny Thorn","Ben Weller"],
    ["Teddy E Beecher","Luc Krier"],
    ["Philippe Schauss","Jeanny Thorn"],
    ["Meindert I Tholen","Liam Muller"],
    ["Meindert I Tholen",""]
]).toDF("name1", "name2")
Desired result:
+-----------------+----------+
|name             |Occurrence|
+-----------------+----------+
|Luc Krier        |2         |
|Jeanny Thorn     |3         |
|Teddy E Beecher  |1         |
|Philippe Schauss |1         |
|Meindert I Tholen|2         |
|Liam Muller      |1         |
|Ben Weller       |1         |
+-----------------+----------+
How can I achieve this?
You can use explode with the array function to merge the columns into one, then simply group by and count, like this:
from pyspark.sql.functions import col, array, explode, count

df.select(explode(array("name1", "name2")).alias("name")) \
  .filter("nullif(name, '') is not null") \
  .groupBy("name") \
  .agg(count("*").alias("Occurrence")) \
  .show()
#+-----------------+----------+
#| name|Occurrence|
#+-----------------+----------+
#|Meindert I Tholen| 2|
#| Jeanny Thorn| 3|
#| Luc Krier| 2|
#| Teddy E Beecher| 1|
#|Philippe Schauss| 1|
#| Ben Weller| 1|
#| Liam Muller| 1|
#+-----------------+----------+
Another way is to select each column, union them, then group by and count:
df.select(col("name1").alias("name")).union(df.select(col("name2").alias("name"))) \
  .filter("nullif(name, '') is not null") \
  .groupBy("name") \
  .agg(count("name").alias("Occurrence")) \
  .show()
Many fancy answers out there, but the easiest solution should be to do a union and then aggregate the count:
df2 = (df.select('name1')
       .union(df.select('name2'))
       .filter("name1 != ''")
       .groupBy('name1')
       .count()
       .toDF('name', 'Occurrence'))
df2.show()
+-----------------+----------+
| name|Occurrence|
+-----------------+----------+
|Meindert I Tholen| 2|
| Jeanny Thorn| 3|
| Luc Krier| 2|
| Teddy E Beecher| 1|
|Philippe Schauss| 1|
| Ben Weller| 1|
| Liam Muller| 1|
+-----------------+----------+
There are better ways to do it, but one naive way of doing it is as follows:
from collections import Counter
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("OccurenceCount").getOrCreate()
df = spark.createDataFrame([
    ["Luc Krier","Jeanny Thorn"],
    ["Jeanny Thorn","Ben Weller"],
    ["Teddy E Beecher","Luc Krier"],
    ["Philippe Schauss","Jeanny Thorn"],
    ["Meindert I Tholen","Liam Muller"],
    ["Meindert I Tholen",""]
]).toDF("name1", "name2")

counter_dict = dict(Counter(df.select("name1", "name2").rdd.flatMap(lambda x: x).collect()))
counter_list = list(map(list, counter_dict.items()))
frequency_df = spark.createDataFrame(counter_list, ["name", "Occurrence"])
frequency_df.show()
Output:
+-----------------+----------+
| name|Occurrence|
+-----------------+----------+
| | 1|
| Liam Muller| 1|
| Teddy E Beecher| 1|
| Ben Weller| 1|
| Jeanny Thorn| 3|
| Luc Krier| 2|
|Philippe Schauss| 1|
|Meindert I Tholen| 2|
+-----------------+----------+
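Note the blank name counted in that output; since the question excludes empty values, the collected names could be filtered before counting (a small tweak to the sketch above):

non_empty = (df.select("name1", "name2").rdd
               .flatMap(lambda x: x)
               .filter(lambda name: name is not None and name != ""))
counter_dict = dict(Counter(non_empty.collect()))
frequency_df = spark.createDataFrame(list(map(list, counter_dict.items())), ["name", "Occurrence"])
frequency_df.show()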
Does this work?
# Groupby & count both dataframes individually to reduce size.
df_name1 = (df.groupby(['name1']).count()
            .withColumnRenamed('name1', 'name')
            .withColumnRenamed('count', 'count1'))
df_name2 = (df.groupby(['name2']).count()
            .withColumnRenamed('name2', 'name')
            .withColumnRenamed('count', 'count2'))

# Join the two dataframes containing frequency counts
# Any null value in the 'count' column can be correctly interpreted as zero.
df_count = (df_name1.join(df_name2, on=['name'], how='outer')
            .fillna(0, subset=['count1', 'count2']))

# Sum the two counts and drop the useless columns
df_count = (df_count.withColumn('count', df_count['count1'] + df_count['count2'])
            .drop('count1').drop('count2').dropna(subset=['name']))
# (Optional) While any rows with a null name have been removed, rows with an
# empty string ("") for a name are still there. We can drop the empty name
# rows like this.
df_count = df_count[df_count['name'] != '']
df_count.show()
# +-----------------+-----+
# | name|count|
# +-----------------+-----+
# |Meindert I Tholen| 2|
# | Jeanny Thorn| 3|
# | Luc Krier| 2|
# | Teddy E Beecher| 1|
# |Philippe Schauss| 1|
# | Ben Weller| 1|
# | Liam Muller| 1|
# +-----------------+-----+
You can get the required output as follows in Scala:
import org.apache.spark.sql.functions._

val df = Seq(
  ("Luc Krier","Jeanny Thorn"),
  ("Jeanny Thorn","Ben Weller"),
  ("Teddy E Beecher","Luc Krier"),
  ("Philippe Schauss","Jeanny Thorn"),
  ("Meindert I Tholen","Liam Muller"),
  ("Meindert I Tholen","")
).toDF("name1", "name2")

val df1 = df.filter($"name1".isNotNull).filter($"name1" !== "")
  .groupBy("name1").agg(count("name1").as("count1"))
val df2 = df.filter($"name2".isNotNull).filter($"name2" !== "")
  .groupBy("name2").agg(count("name2").as("count2"))

val newdf = df1.join(df2, $"name1" === $"name2", "outer")
  .withColumn("count1", when($"count1".isNull, 0).otherwise($"count1"))
  .withColumn("count2", when($"count2".isNull, 0).otherwise($"count2"))
  .withColumn("Count", $"count1" + $"count2")

val finalDF = newdf
  .withColumn("name", when($"name1".isNull, $"name2").when($"name2".isNull, $"name1").otherwise($"name1"))
  .select("name", "Count")

display(finalDF)

Parsing through rows and isolating student records from Spark Dataframe

My student database has multiple records for each student in the table Student.
I am reading the data into a Spark DataFrame and then want to iterate through it, isolate the records for each student, and do some processing on each student's records.
My code so far:
from pyspark.sql import SparkSession

spark_session = SparkSession \
    .builder \
    .appName("app") \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.2") \
    .getOrCreate()

class_3A = spark_session.sql("SQL")

for row in class_3A:
    # for each student
    # Print Name, Age and Subject Marks
    pass
How do I do this?
Another approach would be to use SparkSQL
>>> df = spark.createDataFrame([('Ankit',25),('Jalfaizy',22),('Suresh',20),('Bala',26)],['name','age'])
>>> df.show()
+--------+---+
| name|age|
+--------+---+
| Ankit| 25|
|Jalfaizy| 22|
| Suresh| 20|
| Bala| 26|
+--------+---+
>>> df.where('age > 20').show()
+--------+---+
| name|age|
+--------+---+
| Ankit| 25|
|Jalfaizy| 22|
| Bala| 26|
+--------+---+
>>> from pyspark.sql.functions import *
>>> df.select('name', col('age') + 100).show()
+--------+-----------+
| name|(age + 100)|
+--------+-----------+
| Ankit| 125|
|Jalfaizy| 122|
| Suresh| 120|
| Bala| 126|
+--------+-----------+
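The same filtering can also be written as actual SQL by registering a temporary view first (a minimal sketch, assuming Spark 2.x):

df.createOrReplaceTempView("students")
spark.sql("SELECT name, age FROM students WHERE age > 20").show()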
Imperative approach (in addition to Bala's SQL approach):
class_3A = spark_session.sql("SQL")

def process_student(student_row):
    # Do something with student_row
    return processed_student_row

# "isolate records for each student"
# Each student record will be passed to the process_student function for processing.
# Results are accumulated into a new RDD (recent PySpark DataFrames have no .map,
# so go through .rdd; convert back with .toDF() if needed).
result_rdd = class_3A.rdd.map(process_student)

# If you don't care about results and just want to do some processing:
class_3A.foreach(process_student)
You can loop through each record in a DataFrame and access it by column name:
from pyspark.sql import Row
from pyspark.sql.functions import *

l = [('Ankit',25),('Jalfaizy',22),('Suresh',20),('Bala',26)]
rdd = sc.parallelize(l)
people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
schemaPeople = spark.createDataFrame(people)
schemaPeople.show(10, False)

for row in schemaPeople.rdd.collect():
    print("Hi " + str(row.name) + " your age is : " + str(row.age))
This will produce the output below:
+---+--------+
|age|name |
+---+--------+
|25 |Ankit |
|22 |Jalfaizy|
|20 |Suresh |
|26 |Bala |
+---+--------+
Hi Ankit your age is : 25
Hi Jalfaizy your age is : 22
Hi Suresh your age is : 20
Hi Bala your age is : 26
In this way you can apply whatever processing or logic you need to each record of your DataFrame.
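If the table is too large to collect() onto the driver, the same loop can stream rows instead with toLocalIterator() (a minimal sketch, assuming Spark 2.0+):

for row in schemaPeople.toLocalIterator():
    # rows arrive one partition at a time instead of all at once
    print("Hi " + str(row.name) + " your age is : " + str(row.age))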
Not sure if I understand the question right, but if you want to operate on rows based on any column, you can do that using DataFrame functions. Example:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql import Window

sc = SparkSession.builder.appName("example"). \
    config("spark.driver.memory", "1g"). \
    config("spark.executor.cores", 2). \
    config("spark.max.cores", 4).getOrCreate()

df1 = sc.read.format("csv").option("header", "true").load("test.csv")

w = Window.partitionBy("student_id")
df2 = df1.groupBy("student_id").agg(f.sum(df1["marks"]).alias("total"))
df3 = df1.withColumn("max_marks_inanysub", f.max(df1["marks"]).over(w))
df3 = df3.filter(df3["marks"] == df3["max_marks_inanysub"])
df1.show()
df3.show()
sample data
student_id,subject,marks
1,maths,3
1,science,6
2,maths,4
2,science,7
output
+----------+-------+-----+
|student_id|subject|marks|
+----------+-------+-----+
| 1| maths| 3|
| 1|science| 6|
| 2| maths| 4|
| 2|science| 7|
+----------+-------+-----+
+----------+-------+-----+------------------+
|student_id|subject|marks|max_marks_inanysub|
+----------+-------+-----+------------------+
| 1|science| 6| 6|
| 2|science| 7| 7|
+----------+-------+-----+------------------+

Spark filter multiple group of rows to a single row

I am trying to achieve the following.
Let's say I have a dataframe with the following columns:
id | name | alias
-------------------
1 | abc | short
1 | abc | ailas-long-1
1 | abc | another-long-alias
2 | xyz | short_alias
2 | xyz | same_length
3 | def | alias_1
I want to group by id and name and select the shortest alias.
The output I am expecting is
id | name | alias
-------------------
1 | abc | short
2 | xyz | short_alias
3 | def | alias_1
I can achieve this using window and row_number; is there any other efficient method to get the same result? In general, the third-column filter condition can be anything; in this case it's the length of the field.
Any help would be much appreciated.
Thank you.
All you need to do is use the built-in length function and use it in a window function, as follows:
from pyspark.sql import functions as f
from pyspark.sql import Window

windowSpec = Window.partitionBy('id', 'name').orderBy('length')

df.withColumn('length', f.length('alias')) \
  .withColumn('length', f.row_number().over(windowSpec)) \
  .filter(f.col('length') == 1) \
  .drop('length') \
  .show(truncate=False)
which should give you
+---+----+-----------+
|id |name|alias |
+---+----+-----------+
|3 |def |alias_1 |
|1 |abc |short |
|2 |xyz |short_alias|
+---+----+-----------+
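If you'd rather avoid a window entirely, one alternative (a sketch; it assumes ties on length may be broken alphabetically) is to aggregate the minimum of a struct ordered by length, which Spark compares field by field:

from pyspark.sql import functions as F

shortest = (df.groupBy('id', 'name')
              .agg(F.min(F.struct(F.length('alias').alias('len'), F.col('alias'))).alias('m'))
              .select('id', 'name', F.col('m.alias').alias('alias')))
shortest.show(truncate=False)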
A solution without a window (not very pretty...) and, in my opinion, the easiest RDD solution:
from pyspark.sql import functions as F
from pyspark.sql import HiveContext

hiveCtx = HiveContext(sc)
rdd = sc.parallelize([(1, "abc", "short-alias"),
                      (1, "abc", "short"),
                      (1, "abc", "ailas-long-1"),
                      (1, "abc", "another-long-alias"),
                      (2, "xyz", "same_length"),
                      (2, "xyz", "same_length1"),
                      (3, "def", "short_alias")])
df = hiveCtx.createDataFrame(rdd, ["id", "name", "alias"])

# DataFrame solution: join each row back to its group's minimum alias length
len_df = df.groupBy(["id", "name"]).agg(F.min(F.length("alias")).alias("alias_len"))
df = df.withColumn("alias_len", F.length("alias"))
cond = ["alias_len", "id", "name"]
df.join(len_df, cond).show()

# RDD solution: keep the shortest alias per (id, name) key
print(rdd.map(lambda x: ((x[0], x[1]), x[2]))
         .reduceByKey(lambda x, y: x if len(x) < len(y) else y).collect())
Output:
+---------+---+----+-----------+
|alias_len| id|name| alias|
+---------+---+----+-----------+
| 11| 3| def|short_alias|
| 11| 2| xyz|same_length|
| 5| 1| abc| short|
+---------+---+----+-----------+
[((2, 'xyz'), 'same_length'), ((3, 'def'), 'short_alias'), ((1, 'abc'), 'short')]

PySpark: Compare array values in one dataFrame with array values in another dataFrame to get the intersection

I have the following two DataFrames:
l1 = [(['hello','world'],), (['stack','overflow'],), (['hello', 'alice'],), (['sample', 'text'],)]
df1 = spark.createDataFrame(l1)
l2 = [(['big','world'],), (['sample','overflow', 'alice', 'text', 'bob'],), (['hello', 'sample'],)]
df2 = spark.createDataFrame(l2)
df1:
["hello","world"]
["stack","overflow"]
["hello","alice"]
["sample","text"]
df2:
["big","world"]
["sample","overflow","alice","text","bob"]
["hello", "sample"]
For every row in df1, I want to calculate the number of times all the words in the array occur in df2.
For example, the first row in df1 is ["hello","world"]. Now, I want to check df2 for the intersection of ["hello","world"] with every row in df2.
| ARRAY | INTERSECTION | LEN(INTERSECTION)|
|["big","world"] |["world"] | 1 |
|["sample","overflow","alice","text","bob"] |[] | 0 |
|["hello","sample"] |["hello"] | 1 |
Now, I want to return the sum(len(intersection)). Ultimately I want the resulting df1 to look like this:
df1 result:
ARRAY INTERSECTION_TOTAL
| ["hello","world"] | 2 |
| ["stack","overflow"] | 1 |
| ["hello","alice"] | 2 |
| ["sample","text"] | 3 |
How do I solve this?
I'd focus on avoiding a Cartesian product first. I'd try to explode and join:
from pyspark.sql.functions import explode, monotonically_increasing_id

df1_ = (df1.toDF("words")
        .withColumn("id_1", monotonically_increasing_id())
        .select("*", explode("words").alias("word")))

df2_ = (df2.toDF("words")
        .withColumn("id_2", monotonically_increasing_id())
        .select("id_2", explode("words").alias("word")))

(df1_.join(df2_, "word").groupBy("id_1", "id_2", "words").count()
 .groupBy("id_1", "words").sum("count").drop("id_1").show())
+-----------------+----------+
| words|sum(count)|
+-----------------+----------+
| [hello, alice]| 2|
| [sample, text]| 3|
|[stack, overflow]| 1|
| [hello, world]| 2|
+-----------------+----------+
If intermediate values are not needed it could be simplified to:
df1_.join(df2_, "word").groupBy("words").count().show()
+-----------------+-----+
| words|count|
+-----------------+-----+
| [hello, alice]| 2|
| [sample, text]| 3|
|[stack, overflow]| 1|
| [hello, world]| 2|
+-----------------+-----+
and you could omit adding ids.
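On Spark 2.4+, and when df2 is small enough that a cross join is acceptable, the same totals can be expressed directly with the built-in array functions (a sketch, reusing the column names introduced above):

from pyspark.sql.functions import array_intersect, size, sum as sum_

result = (df1.toDF("words")
             .crossJoin(df2.toDF("other_words"))
             .withColumn("overlap", size(array_intersect("words", "other_words")))
             .groupBy("words")
             .agg(sum_("overlap").alias("INTERSECTION_TOTAL")))
result.show(truncate=False)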

combining multiple rows in Spark dataframe column based on condition

I am trying to combine multiple rows in a spark dataframe based on a condition:
This is the dataframe I have(df):
|username | qid | row_no | text |
---------------------------------
| a | 1 | 1 | this |
| a | 1 | 2 | is |
| d | 2 | 1 | the |
| a | 1 | 3 | text |
| d | 2 | 2 | ball |
I want it to look like this:
|username | qid | row_no | text         |
-----------------------------------------
| a       | 1   | 1,2,3  | this is text |
| d       | 2   | 1,2    | the ball     |
I am using Spark 1.5.2; it does not have the collect_list function.
collect_list showed up only in 1.6.
I'd go through the underlying RDD. Here's how:
data_df.show()
+--------+---+------+----+
|username|qid|row_no|text|
+--------+---+------+----+
| d| 2| 2|ball|
| a| 1| 1|this|
| a| 1| 3|text|
| a| 1| 2| is|
| d| 2| 1| the|
+--------+---+------+----+
Then this:
reduced = data_df \
    .rdd \
    .map(lambda row: ((row[0], row[1]), [(row[2], row[3])])) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda row: (row[0], sorted(row[1], key=lambda text: text[0]))) \
    .map(lambda row: (
        row[0][0],
        row[0][1],
        ','.join([str(e[0]) for e in row[1]]),
        ' '.join([str(e[1]) for e in row[1]])
    ))

import pyspark.sql.types as typ

schema_red = typ.StructType([
    typ.StructField('username', typ.StringType(), False),
    typ.StructField('qid', typ.IntegerType(), False),
    typ.StructField('row_no', typ.StringType(), False),
    typ.StructField('text', typ.StringType(), False)
])

df_red = sqlContext.createDataFrame(reduced, schema_red)
df_red.show()
The above produced the following:
+--------+---+------+------------+
|username|qid|row_no| text|
+--------+---+------+------------+
| d| 2| 1,2| the ball|
| a| 1| 1,2,3|this is text|
+--------+---+------+------------+
In pandas:
import pandas as pd

df4 = pd.DataFrame([
    ['a', 1, 1, 'this'],
    ['a', 1, 2, 'is'],
    ['d', 2, 1, 'the'],
    ['a', 1, 3, 'text'],
    ['d', 2, 2, 'ball']
], columns=['username', 'qid', 'row_no', 'text'])

df_groupped = df4.sort_values(by=['qid', 'row_no']).groupby(['username', 'qid'])
df3 = pd.DataFrame()
df3['row_no'] = df_groupped.apply(lambda row: ','.join([str(e) for e in row['row_no']]))
df3['text'] = df_groupped.apply(lambda row: ' '.join(row['text']))
df3 = df3.reset_index()
You can apply groupBy on the username and qid columns and then, in the agg() method, use collect_list(), starting with
import pyspark.sql.functions as func
Then you will have collect_list() and the other important functions available.
For details about groupBy and agg you can follow this URL.
Hope this solves your problem.
Thanks
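A minimal sketch of that approach (it assumes a newer Spark than the 1.5.2 mentioned above: collect_list needs 1.6+, and this was written with 2.x in mind; sort_array on a struct keeps the rows ordered by row_no before the text is joined):

import pyspark.sql.functions as func

grouped = (df.groupBy('username', 'qid')
             .agg(func.sort_array(func.collect_list(func.struct('row_no', 'text'))).alias('rows')))

result = grouped.select(
    'username', 'qid',
    func.concat_ws(',', func.col('rows.row_no').cast('array<string>')).alias('row_no'),
    func.concat_ws(' ', func.col('rows.text')).alias('text'))
result.show()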
