How to add a column with alternating values in a PySpark dataframe?

I have the following sample dataframe
df = spark.createDataFrame([('start','end'), ('start1','end1')] ,["start", "end"])
and I want to explode the values in each row and associate alternating 1-0 values in the generated rows. This way I can identify the start/end entries in each row.
I am able to achieve the desired result this way:
import pyspark.sql.functions as fn
from pyspark.sql.functions import lit
from pyspark.sql.window import Window

w = Window().orderBy(lit('A'))
df = (df.withColumn('start_end', fn.array('start', 'end'))
        .withColumn('date', fn.explode('start_end'))
        .withColumn('row_num', fn.row_number().over(w)))
df = (df.withColumn('is_start', fn.when(fn.col('row_num') % 2 == 0, 0).otherwise(1))
        .select('date', 'is_start'))
which gives
| date | is_start |
|--------|----------|
| start | 1 |
| end | 0 |
| start1 | 1 |
| end1 | 0 |
but it seems overly complicated for such a simple task.
Is there any better/cleaner way without using UDFs?

You can use pyspark.sql.functions.posexplode along with pyspark.sql.functions.array.
First create an array out of your end and start columns (in that order, so the exploded position doubles as the is_start flag), then explode it together with the position:
from pyspark.sql.functions import array, posexplode
df.select(posexplode(array("end", "start")).alias("is_start", "date")).show()
#+--------+------+
#|is_start| date|
#+--------+------+
#| 0| end|
#| 1| start|
#| 0| end1|
#| 1|start1|
#+--------+------+
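If you prefer to keep the natural (start, end) order inside the array, a small variation of the same idea (just a sketch) derives the flag from the position instead:
from pyspark.sql.functions import array, col, posexplode

(df.select(posexplode(array("start", "end")).alias("pos", "date"))
   .withColumn("is_start", 1 - col("pos"))
   .drop("pos")
   .show())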

You can try union:
import pyspark.sql.functions as F

df = spark.createDataFrame([('start', 'end'), ('start1', 'end1')], ["start", "end"])
df = df.withColumn('startv', F.lit(1))
df = df.withColumn('endv', F.lit(0))
df = df.select(['start', 'startv']).union(df.select(['end', 'endv']))
df.show()
+------+------+
| start|startv|
+------+------+
| start| 1|
|start1| 1|
| end| 0|
| end1| 0|
+------+------+
You can rename the columns and re-order the rows starting here.
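For example (a minimal sketch; the target names 'date' and 'is_start' are taken from the question, and re-ordering would need an explicit sort key that the sample data does not provide):
df = df.toDF('date', 'is_start')
df.show()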

I had a similar situation in my use case. I had a huge dataset (~50 GB), and any self join or heavy transformation resulted in high memory pressure and unstable execution.
I went one level lower, to the RDD, and used flatMap. This is a map-side transformation, so it is cost effective in terms of shuffle, CPU and memory.
df = spark.createDataFrame([('start','end'), ('start1','end1')] ,["start", "end"])
df.show()
+------+----+
| start| end|
+------+----+
| start| end|
|start1|end1|
+------+----+
final_df = df.rdd.flatMap(lambda row: [(row.start, 1), (row.end, 0)]).toDF(['date', 'is_start'])
final_df.show()
+------+--------+
| date|is_start|
+------+--------+
| start| 1|
| end| 0|
|start1| 1|
| end1| 0|
+------+--------+

Related

Merge two columns in a single DataFrame and count the occurrences using PySpark

I have two columns in my DataFrame, name1 and name2.
I want to merge them and count the occurrences of each name (ignoring null and empty values).
df = spark.createDataFrame([
["Luc Krier","Jeanny Thorn"],
["Jeanny Thorn","Ben Weller"],
[ "Teddy E Beecher","Luc Krier"],
["Philippe Schauss","Jeanny Thorn"],
["Meindert I Tholen","Liam Muller"],
["Meindert I Tholen",""]
]).toDF("name1", "name2")
Desired result:
+-----------------+----------+
|name             |Occurrence|
+-----------------+----------+
|Luc Krier        |2         |
|Jeanny Thorn     |3         |
|Teddy E Beecher  |1         |
|Philippe Schauss |1         |
|Meindert I Tholen|2         |
|Liam Muller      |1         |
|Ben Weller       |1         |
+-----------------+----------+
How can I achieve this?
You can use explode with the array function to merge the columns into one, then simply group by and count, like this:
from pyspark.sql.functions import col, array, explode, count

df.select(explode(array("name1", "name2")).alias("name")) \
  .filter("nullif(name, '') is not null") \
  .groupBy("name") \
  .agg(count("*").alias("Occurrence")) \
  .show()
#+-----------------+----------+
#| name|Occurrence|
#+-----------------+----------+
#|Meindert I Tholen| 2|
#| Jeanny Thorn| 3|
#| Luc Krier| 2|
#| Teddy E Beecher| 1|
#|Philippe Schauss| 1|
#| Ben Weller| 1|
#| Liam Muller| 1|
#+-----------------+----------+
Another way is to select each column, union them, then group by and count:
df.select(col("name1").alias("name")) \
  .union(df.select(col("name2").alias("name"))) \
  .filter("nullif(name, '') is not null") \
  .groupBy("name") \
  .agg(count("name").alias("Occurrence")) \
  .show()
Many fancy answers out there, but the easiest solution should be to do a union and then aggregate the count:
df2 = (df.select('name1')
         .union(df.select('name2'))
         .filter("name1 != ''")
         .groupBy('name1')
         .count()
         .toDF('name', 'Occurrence'))
df2.show()
+-----------------+----------+
| name|Occurrence|
+-----------------+----------+
|Meindert I Tholen| 2|
| Jeanny Thorn| 3|
| Luc Krier| 2|
| Teddy E Beecher| 1|
|Philippe Schauss| 1|
| Ben Weller| 1|
| Liam Muller| 1|
+-----------------+----------+
There are better ways to do it, but one naive way is as follows:
from collections import Counter
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("OccurenceCount").getOrCreate()
df = spark.createDataFrame([
["Luc Krier","Jeanny Thorn"],
["Jeanny Thorn","Ben Weller"],
[ "Teddy E Beecher","Luc Krier"],
["Philippe Schauss","Jeanny Thorn"],
["Meindert I Tholen","Liam Muller"],
["Meindert I Tholen",""]
]).toDF("name1", "name2")
# Note: collect() pulls every name pair back to the driver, so this only suits data that fits in driver memory.
counter_dict = dict(Counter(df.select("name1", "name2").rdd.flatMap(lambda x: x).collect()))
counter_list = list(map(list, counter_dict.items()))
frequency_df = spark.createDataFrame(counter_list, ["name", "Occurrence"])
frequency_df.show()
Output:
+-----------------+----------+
| name|Occurrence|
+-----------------+----------+
| | 1|
| Liam Muller| 1|
| Teddy E Beecher| 1|
| Ben Weller| 1|
| Jeanny Thorn| 3|
| Luc Krier| 2|
|Philippe Schauss| 1|
|Meindert I Tholen| 2|
+-----------------+----------+
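Note that, unlike the desired output, this keeps the empty string as a name with a count of 1. A hedged tweak is to drop empty values on the RDD before counting:
# Filter out empty strings (and None) before building the Counter.
names_rdd = df.select("name1", "name2").rdd.flatMap(lambda x: x).filter(lambda name: name)
counter_dict = dict(Counter(names_rdd.collect()))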
Does this work?
# Groupby & count both dataframes individually to reduce size.
df_name1 = (df.groupby(['name1']).count()
              .withColumnRenamed('name1', 'name')
              .withColumnRenamed('count', 'count1'))
df_name2 = (df.groupby(['name2']).count()
              .withColumnRenamed('name2', 'name')
              .withColumnRenamed('count', 'count2'))

# Join the two dataframes containing frequency counts.
# Any null value in the 'count' columns can be correctly interpreted as zero.
df_count = (df_name1.join(df_name2, on=['name'], how='outer')
                    .fillna(0, subset=['count1', 'count2']))

# Sum the two counts and drop the useless columns.
df_count = (df_count.withColumn('count', df_count['count1'] + df_count['count2'])
                    .drop('count1').drop('count2').dropna(subset=['name']))
# (Optional) While any rows with a null name have been removed, rows with an
# empty string ("") for a name are still there. We can drop the empty name
# rows like this.
df_count = df_count[df_count['name'] != '']
df_count.show()
# +-----------------+-----+
# | name|count|
# +-----------------+-----+
# |Meindert I Tholen| 2|
# | Jeanny Thorn| 3|
# | Luc Krier| 2|
# | Teddy E Beecher| 1|
# |Philippe Schauss| 1|
# | Ben Weller| 1|
# | Liam Muller| 1|
# +-----------------+-----+
You can get the required output as follows in Scala:
import org.apache.spark.sql.functions._
import spark.implicits._  // for $ and toDF (already in scope in spark-shell / notebooks)

val df = Seq(
  ("Luc Krier", "Jeanny Thorn"),
  ("Jeanny Thorn", "Ben Weller"),
  ("Teddy E Beecher", "Luc Krier"),
  ("Philippe Schauss", "Jeanny Thorn"),
  ("Meindert I Tholen", "Liam Muller"),
  ("Meindert I Tholen", "")
).toDF("name1", "name2")

val df1 = df.filter($"name1".isNotNull).filter($"name1" =!= "")
  .groupBy("name1").agg(count("name1").as("count1"))
val df2 = df.filter($"name2".isNotNull).filter($"name2" =!= "")
  .groupBy("name2").agg(count("name2").as("count2"))

val newdf = df1.join(df2, $"name1" === $"name2", "outer")
  .withColumn("count1", when($"count1".isNull, 0).otherwise($"count1"))
  .withColumn("count2", when($"count2".isNull, 0).otherwise($"count2"))
  .withColumn("Count", $"count1" + $"count2")

val finalDF = newdf
  .withColumn("name", when($"name1".isNull, $"name2").when($"name2".isNull, $"name1").otherwise($"name1"))
  .select("name", "Count")

display(finalDF)

Python Spark join two dataframes and fill column

I have two dataframes that need to be joined in a particular way that I am struggling with.
dataframe 1:
+--------------------+---------+----------------+
| asset_domain| eid| oid|
+--------------------+---------+----------------+
| test-domain...| 126656| 126656|
| nebraska.aaa.com| 335660| 335660|
| netflix.com| 460| 460|
+--------------------+---------+----------------+
dataframe 2:
+--------------------+--------------------+---------+--------------+----+----+------------+
| asset| asset_domain|dns_count| ip| ev|post|form_present|
+--------------------+--------------------+---------+--------------+----+----+------------+
| sub1.test-domain...| test-domain...| 6354| 11.11.111.111| 1| 1| null|
| netflix.com| netflix.com| 3836| 22.22.222.222|null|null| null|
+--------------------+--------------------+---------+--------------+----+----+------------+
desired result:
+--------------------+---------+-------------+----+----+------------+---------+----------------+
| asset|dns_count| ip| ev|post|form_present| eid| oid|
+--------------------+---------+-------------+----+----+------------+---------+----------------+
| netflix.com| 3836|22.22.222.222|null|null| null| 460| 460|
| sub1.test-domain...| 5924|111.11.111.11| 1| 1| null| 126656| 126656|
| nebraska.aaa.com| null| null|null|null| null| 335660| 335660|
+--------------------+---------+-------------+----+----+------------+---------+----------------+
Basically, it should join df1 and df2 on asset_domain, but if a domain doesn't exist in df2, then the resulting asset should be the asset_domain from df1.
I tried df = df2.join(df1, ["asset_domain"], "right").drop("asset_domain") but that obviously leaves null in the asset column for nebraska.aaa.com since it does not have a matching domain in df2. How do I go about adding those to the asset column for this particular case?
You can use the coalesce function after the join to create the asset column:
from pyspark.sql.functions import coalesce

df2.join(df1, ["asset_domain"], "right") \
   .select(coalesce("asset", "asset_domain").alias("asset"),
           "dns_count", "ip", "ev", "post", "form_present", "eid", "oid") \
   .orderBy("asset").show()
#+----------------+---------+-------------+----+----+------------+------+------+
#|           asset|dns_count|           ip|  ev|post|form_present|   eid|   oid|
#+----------------+---------+-------------+----+----+------------+------+------+
#|nebraska.aaa.com|     null|         null|null|null|        null|335660|335660|
#|     netflix.com|     3836|22.22.222.222|null|null|        null|   460|   460|
#|sub1.test-domain|     6354|11.11.111.111|   1|   1|        null|126656|126656|
#+----------------+---------+-------------+----+----+------------+------+------+
After the join you can use the isNull() function
import pyspark.sql.functions as F
tst1 = sqlContext.createDataFrame([('netflix', 1), ('amazon', 2)], schema=("asset_domain", 'xtra1'))
tst2 = sqlContext.createDataFrame([('netflix', 'yahoo', 1), ('amazon', 'yahoo', 2), ('flipkart', None, 2)],
                                  schema=("asset_domain", "asset", 'xtra'))
tst_j = tst1.join(tst2, on='asset_domain', how='right')
tst_res = tst_j.withColumn("asset", F.when(F.col('asset').isNull(), F.col('asset_domain'))
                                     .otherwise(F.col('asset')))

Apache Spark: Get the first and last row of each partition

I would like to get the first and last row of each partition in spark (I'm using pyspark). How do I go about this?
In my code I repartition my dataset based on a key column using:
mydf.repartition(keyColumn).sortWithinPartitions(sortKey)
Is there a way to get the first row and last row for each partition?
Thanks
I would highly advise against working with partitions directly. Spark does a lot of DAG optimisation, so when you try executing specific functionality on each partition, all your assumptions about the partitions and their distribution might be completely false.
However, you seem to have a keyColumn and a sortKey, so I'd suggest doing the following:
import pyspark
import pyspark.sql.functions as f

w_asc = pyspark.sql.Window.partitionBy(keyColumn).orderBy(f.asc(sortKey))
w_desc = pyspark.sql.Window.partitionBy(keyColumn).orderBy(f.desc(sortKey))

res_df = (mydf
          .withColumn("rn_asc", f.row_number().over(w_asc))
          .withColumn("rn_desc", f.row_number().over(w_desc))
          .where("rn_asc = 1 or rn_desc = 1"))
The resulting dataframe will have 2 additional columns, where rn_asc=1 indicates the first row and rn_desc=1 indicates the last row.
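If you then want a single marker column instead of the two helpers, a small follow-up along these lines should work (column names taken from the snippet above; a row that is both first and last in its group gets tagged 'first'):
labelled = (res_df
            .withColumn("position", f.when(f.col("rn_asc") == 1, "first").otherwise("last"))
            .drop("rn_asc", "rn_desc"))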
Scala: I think repartition is not done by some key column; rather it takes an integer for how many partitions you want to set. So I made a way to select the first and last row by using the Window functions of Spark.
First, this is my test data.
+---+-----+
| id|value|
+---+-----+
| 1| 1|
| 1| 2|
| 1| 3|
| 1| 4|
| 2| 1|
| 2| 2|
| 2| 3|
| 3| 1|
| 3| 3|
| 3| 5|
+---+-----+
Then I use the Window function twice, because I cannot easily know the last row, but finding it in reverse order is quite easy.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val a = Window.partitionBy("id").orderBy("value")
val d = Window.partitionBy("id").orderBy(col("value").desc)

val df = spark.read.option("header", "true").csv("test.csv")
df.withColumn("marker", when(rank().over(a) === 1, "Y").otherwise("N"))
  .withColumn("marker", when(rank().over(d) === 1, "Y").otherwise(col("marker")))
  .filter(col("marker") === "Y")
  .drop("marker")
  .show
The final result is then,
+---+-----+
| id|value|
+---+-----+
| 3| 5|
| 3| 1|
| 1| 4|
| 1| 1|
| 2| 3|
| 2| 1|
+---+-----+
Here is another approach using mapPartitions from RDD API. We iterate over the elements of each partition until we reach the end. I would expect this iteration to be very fast since we skip all the elements of the partition except the two edges. Here is the code:
df = spark.createDataFrame([
["Tom", "a"],
["Dick", "b"],
["Harry", "c"],
["Elvis", "d"],
["Elton", "e"],
["Sandra", "f"]
], ["name", "toy"])
def get_first_last(it):
    try:
        first = last = next(it)
    except StopIteration:
        # Empty partition: nothing to emit.
        return []
    for last in it:
        pass
    # Attention: if first and last are the same object, return only one!
    if first is last:
        return [first]
    return [first, last]
# coalesce here is just for demonstration
first_last_rdd = df.coalesce(2).rdd.mapPartitions(get_first_last)
spark.createDataFrame(first_last_rdd, ["name", "toy"]).show()
# +------+---+
# | name|toy|
# +------+---+
# | Tom| a|
# | Harry| c|
# | Elvis| d|
# |Sandra| f|
# +------+---+
PS: Odd positions will contain the first element of a partition and the even ones the last element. Also note that the number of results will be (numPartitions * 2) - numPartitionsWithOneItem, which I expect to be relatively small, so you shouldn't worry about the cost of the new createDataFrame statement.
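If you also want to know which partition each edge row came from, a hedged variant is mapPartitionsWithIndex, reusing the same helper (the tag_first_last name is just for illustration):
# Prepend the partition id to each edge row returned by get_first_last.
def tag_first_last(pid, it):
    return [(pid, row.name, row.toy) for row in get_first_last(it)]

tagged = df.coalesce(2).rdd.mapPartitionsWithIndex(tag_first_last)
spark.createDataFrame(tagged, ["partition", "name", "toy"]).show()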

How to do a conditional aggregation after a groupby in pyspark dataframe?

I'm trying to group by an ID column in a pyspark dataframe and sum a column depending on the value of another column.
To illustrate, consider the following dummy dataframe:
+-----+-------+---------+
| ID| type| amount|
+-----+-------+---------+
| 1| a| 55|
| 2| b| 1455|
| 2| a| 20|
| 2| b| 100|
| 3| null| 230|
+-----+-------+---------+
My desired output is:
+-----+--------+----------+----------+
| ID| sales| sales_a| sales_b|
+-----+--------+----------+----------+
| 1| 55| 55| 0|
| 2| 1575| 20| 1555|
| 3| 230| 0| 0|
+-----+--------+----------+----------+
So basically, sales will be the sum of amount, while sales_a and sales_b are the sum of amount when type is a or b respectively.
For sales, I know this could be done like this:
from pyspark.sql import functions as F
df = df.groupBy("ID").agg(F.sum("amount").alias("sales"))
For the others, I'm guessing F.when would be useful but I'm not sure how to go about it.
You could create two columns before the aggregation, based on the value of type:
df.withColumn("sales_a", F.when(col("type") == "a", col("amount"))) \
.withColumn("sales_b", F.when(col("type") == "b", col("amount"))) \
.groupBy("ID") \
.agg(F.sum("amount").alias("sales"),
F.sum("sales_a").alias("sales_a"),
F.sum("sales_b").alias("sales_b"))
from pyspark.sql import functions as F

dfSales = df.groupBy("ID").agg(F.sum("amount").alias("sales"))
dfPivot = df.filter("type is not null").groupBy("ID").pivot("type").agg(F.sum("amount").alias("sales"))
res = dfSales.join(dfPivot, on="ID", how="left")
Then replace null with 0.
This is a generic solution that works irrespective of the values in the type column; if a type c is added to the dataframe, the pivot will create a column for it.
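A sketch of that null-to-zero step (assuming the pivoted columns come out as a and b for this sample data):
res = res.fillna(0, subset=["a", "b"])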

Aggregating List of Dicts in Spark DataFrame

How can I perform aggregations and analysis on a column in a Spark DataFrame that was created from a column containing multiple dictionaries, such as the one below:
rootKey=[Row(key1='value1', key2='value2', key3='value3'), Row(key1='value1', key2='value2', key3='value3'), Row(key1='value1', key2='value2', key3='value3'), Row(key1='value1', key2='value2', key3='value3')]
Here is an example of what the column looks like:
>>> df.select('column').show(20, False)
+-----------------------------------------------------------------+
|column |
+-----------------------------------------------------------------+
|[[1,1,1], [1,2,6], [1,2,13], [1,3,3]] |
|[[2,1,1], [2,3,6], [2,4,10]] |
|[[1,1,1], [1,1,6], [1,2,1], [2,2,2], [2,3,6], [1,3,7], [2,4,10]] |
An example would be to summarize all of the key values and group by a different column.
You need f.explode:
json_file.json:
{"idx":1, "col":[{"k":1,"v1":1,"v2":1},{"k":1,"v1":2,"v2":6},{"k":1,"v1":2,"v2":13},{"k":1,"v1":2,"v2":2}]}
{"idx":2, "col":[{"k":2,"v1":1,"v2":1},{"k":2,"v1":3,"v2":6},{"k":2,"v1":4,"v2":10}]}
from pyspark.sql import functions as f
df = spark.read.load('file:///home/zht/PycharmProjects/test/json_file.json', format='json')
df = df.withColumn('col', f.explode(df['col']))
df = df.groupBy(df['col']['v1']).sum('col.k')
df.show()
# output:
+---------+-----------------+
|col['v1']|sum(col.k AS `k`)|
+---------+-----------------+
| 1| 3|
| 3| 2|
| 2| 3|
| 4| 2|
+---------+-----------------+
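The question also mentions grouping by a different column; with this toy schema, a hedged example would be to group by the row-level idx instead, starting again from the exploded dataframe (before it is reassigned by the final groupBy above):
exploded = spark.read.load('file:///home/zht/PycharmProjects/test/json_file.json', format='json') \
                .withColumn('col', f.explode('col'))
exploded.groupBy('idx').agg(f.sum('col.v2').alias('total_v2')).show()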
