PySpark: Case When Groupby - apache-spark

Consider this MWE:
df = spark.createDataFrame([('A', 5, 0),('A',6, 0),('B',3, 0)], ['id', 'value', 'currentVersion'])
+---+-----+--------------+
| id|value|currentVersion|
+---+-----+--------------+
| A| 5| 0|
| A| 6| 0|
| B| 3| 0|
+---+-----+--------------+
With this expected output
#+---+-----+----------+
#| id|value|currentVersion|
#+---+-----+----------+
#| A| 5| 0|
#| A| 6| 1|
#| B| 0| 0 |
#+---+-----+----------+
How can I get to the expected output while relying on groupby?
This works well for my other purposes, but fails as I need to incorporate groupby:
valueWhenTrue = 1
valueWhenFalse = 0
df = df.withColumn(
"currentVersion",
when(
F.col("TimeStamp") == df.agg({"TimeStamp": "max"}).collect()[0][0],
valueWhenTrue
).otherwise(valueWhenFalse)
)

Found an answer that works for me:
# groupby -- find max time
window_var = Window().partitionBy('TicketNumber')
df = df.withColumn('maxModified', F.max('Modified').over(window_var))
# case when
valueWhenTrue = 1
valueWhenFalse = 0
df = df.withColumn(
"currentVersion",
when(
F.col("maxModified") == F.col('Modified'),
valueWhenTrue
).otherwise(valueWhenFalse)
)

Related

Grouping consecutive rows where date difference is 1 day

I have managed to get the dataframe with these columns:
+----------+----------+--------+
| date| next_date|datediff|
+----------+----------+--------+
|2020-09-25|2020-09-30| 5|
|2020-09-30|2020-10-01| 1|
|2020-10-01|2020-10-02| 1|
|2020-10-02|2020-10-03| 1|
|2020-10-03|2020-10-04| 1|
|2020-10-09|2020-11-23| 45|
|2020-11-23|2020-11-24| 1|
|2020-11-24|2020-11-25| 1|
|2020-11-25|2020-11-26| 1|
+----------+----------+--------+
I got the 'group' column by doing these commands:
w1 = Window.orderBy("date")
df_dates.withColumn(
"dateChange",
(F.col("datediff") != F.lit(1)).cast("int")
)\
.fillna(
0,
subset=["dateChange"]
)\
.withColumn(
"indicator",
(~((F.col("dateChange")==0))).cast("int")
)\
.withColumn(
"group",
F.sum(F.col("indicator")).over(w1.rangeBetween(Window.unboundedPreceding, 0))
)
and finally got these groupings:
+----------+----------+--------+----------+---------+-----+
| date| next_date|datediff|dateChange|indicator|group|
+----------+----------+--------+----------+---------+-----+
|2020-09-25|2020-09-30| 5| 1| 1| 1|
|2020-09-30|2020-10-01| 1| 0| 0| 1|
|2020-10-01|2020-10-02| 1| 0| 0| 1|
|2020-10-02|2020-10-03| 1| 0| 0| 1|
|2020-10-03|2020-10-04| 1| 0| 0| 1|
|2020-10-09|2020-11-23| 45| 1| 1| 2|
|2020-11-23|2020-11-24| 1| 0| 0| 2|
|2020-11-24|2020-11-25| 1| 0| 0| 2|
|2020-11-25|2020-11-26| 1| 0| 0| 2|
+----------+----------+--------+----------+---------+-----+
However, the first row should have its own group. The second row should be group 2 (all incremented by 1).
Then I do the aggregation:
df_dates.groupBy("group")\
.agg(
F.min("next_date").alias("start_time"),
F.max("next_date").alias("end_time")
)\
.drop("group")\
.show()
+----------+----------+
|start_time| end_time|
+----------+----------+
|2020-09-30|2020-10-04|
|2020-11-23|2020-11-26|
+----------+----------+
But I am missing the first group which is 2020-09-25.
The aim for this is to get the ranges for consecutive dates to help me combine HDFS folders with consecutive dates into the same partition.
Rewritten example data as python script:
from pyspark.sql import functions as F, Window as W
df = spark.createDataFrame(
[('2020-09-25', '2020-09-30', 5),
('2020-09-30', '2020-10-01', 1),
('2020-10-01', '2020-10-02', 1),
('2020-10-02', '2020-10-03', 1),
('2020-10-03', '2020-10-04', 1),
('2020-10-09', '2020-11-23', 45),
('2020-11-23', '2020-11-24', 1),
('2020-11-24', '2020-11-25', 1),
('2020-11-25', '2020-11-26', 1)],
["date", "next_date", "datediff"])
The following creates groups using window functions lag and sum:
w = W.orderBy("date")
# _flg is the rule when subgroup inside partition must be created
df = df.withColumn("_flg", F.coalesce(F.when(F.col("datediff") != F.lag("datediff").over(w), 1), F.lit(0)))
df = df.withColumn("_grp", F.sum("_flg").over(w))
df.show()
# +----------+----------+--------+----+----+
# | date| next_date|datediff|_flg|_grp|
# +----------+----------+--------+----+----+
# |2020-09-25|2020-09-30| 5| 0| 0|
# |2020-09-30|2020-10-01| 1| 1| 1|
# |2020-10-01|2020-10-02| 1| 0| 1|
# |2020-10-02|2020-10-03| 1| 0| 1|
# |2020-10-03|2020-10-04| 1| 0| 1|
# |2020-10-09|2020-11-23| 45| 1| 2|
# |2020-11-23|2020-11-24| 1| 1| 3|
# |2020-11-24|2020-11-25| 1| 0| 3|
# |2020-11-25|2020-11-26| 1| 0| 3|
# +----------+----------+--------+----+----+
Lastly, grouping using the created "_grp" column and others when applicable:
df = (df
.groupBy("_grp")
.agg(
F.min("date").alias("start_time"),
F.max("next_date").alias("end_time")
).drop("_grp")
)
df.show()
# +----------+----------+
# |start_time| end_time|
# +----------+----------+
# |2020-09-25|2020-09-30|
# |2020-09-30|2020-10-04|
# |2020-10-09|2020-11-23|
# |2020-11-23|2020-11-26|
# +----------+----------+

How to set the value of a Pyspark column based on two conditions of the value of another column

Say I have a dataframe:
+-----+-----+-----+
|id |foo. |bar. |
+-----+-----+-----+
| 1| baz| 0|
| 2| baz| 0|
| 3| 333| 2|
| 4| 444| 1|
+-----+-----+-----+
I want to set the 'foo' column to a value depending on the value of bar.
If bar is 2: set the value of foo for that row to 'X',
else if bar is 1: set the value of foo for that row to 'Y'
And if neither condition is met, leave the foo value as it is.
pyspark.when seems like the closest method, but it doesn't seem to work based on another column's value.
when can work with other columns. You can use F.col to get the value of the other column and provide an appropriate condition:
import pyspark.sql.functions as F
df2 = df.withColumn(
'foo',
F.when(F.col('bar') == 2, 'X')
.when(F.col('bar') == 1, 'Y')
.otherwise(F.col('foo'))
)
df2.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
We can solve this using when or a UDF in Spark to insert a new column based on a condition.
Create Sample DataFrame:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('AddConditionalColumn').getOrCreate()
data = [(1,"baz",0),(2,"baz",0),(3,"333",2),(4,"444",1)]
columns = ["id","foo","bar"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3|333| 2|
| 4|444| 1|
+---+---+---+
Using When:
from pyspark.sql.functions import when
df2 = df.withColumn("foo", when(df.bar == 2,"X")
.when(df.bar == 1,"Y")
.otherwise(df.foo))
df2.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
Using UDF:
import pyspark.sql.functions as F
from pyspark.sql.types import *
def executeRule(value):
    """Map a rule code to its replacement label.

    Returns 'X' when ``value`` is 2 and 'Y' when ``value`` is 1. Any other
    input is returned unchanged, so rows that match neither rule get back
    exactly what was passed in.
    """
    if value == 2:
        return 'X'
    if value == 1:
        return 'Y'
    # No rule matched: hand the input back untouched.
    return value
# Converting function to UDF
ruleUDF = F.udf(executeRule, StringType())
df3 = df.withColumn("foo", ruleUDF("bar"))
df3.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1| 0| 0|
| 2| 0| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+

Check if a column is consecutive with groupby in pyspark

I have a pyspark dataframe that looks like this:
import pandas as pd
foo = pd.DataFrame({'group': ['a','a','a','b','b','c','c','c'], 'value': [1,2,3,4,5,2,4,5]})
I would like to create a new binary column is_consecutive that indicates if the values in the value column are consecutive by group.
The output should look like this:
foo = pd.DataFrame({'group': ['a','a','a','b','b','c','c','c'], 'value': [1,2,3,4,5,2,4,5],
'is_consecutive': [1,1,1,1,1,0,0,0]})
How could I do that in pyspark?
You can use lag to compare values with the previous row and check if they are consecutive, then use min to determine whether all rows are consecutive in a given group.
from pyspark.sql import functions as F, Window
df2 = df.withColumn(
'consecutive',
F.coalesce(
F.col('value') - F.lag('value').over(Window.partitionBy('group').orderBy('value')) == 1,
F.lit(True)
).cast('int')
).withColumn(
'all_consecutive',
F.min('consecutive').over(Window.partitionBy('group'))
)
df2.show()
+-----+-----+-----------+---------------+
|group|value|consecutive|all_consecutive|
+-----+-----+-----------+---------------+
| c| 2| 1| 0|
| c| 4| 0| 0|
| c| 5| 1| 0|
| b| 4| 1| 1|
| b| 5| 1| 1|
| a| 1| 1| 1|
| a| 2| 1| 1|
| a| 3| 1| 1|
+-----+-----+-----------+---------------+
You can use lead to subtract each value from the next one within the group, then take the max of that difference over the window; finally, add a condition that returns 0 if the max is > 1 and 1 otherwise.
# Ordered window: lead() needs a row order to find each row's successor.
w = Window.partitionBy("group").orderBy(F.monotonically_increasing_id())
# Unordered window: covers the whole group, so max() sees every gap.
# Aggregating over the ordered window would use a running frame (start of
# partition up to the current row) and miss gaps that occur later in the group.
w_group = Window.partitionBy("group")
(foo.withColumn("Diff", F.lead("value").over(w) - F.col("value"))
    .withColumn("is_consecutive", F.when(F.max("Diff").over(w_group) > 1, 0).otherwise(1))
    .drop("Diff")).show()
+-----+-----+--------------+
|group|value|is_consecutive|
+-----+-----+--------------+
| a| 1| 1|
| a| 2| 1|
| a| 3| 1|
| b| 4| 1|
| b| 5| 1|
| c| 2| 0|
| c| 4| 0|
| c| 5| 0|
+-----+-----+--------------+

Pyspark adding a column of repeating values from a list

I have a pyspark dataframe and want to add a column that adds values from a list in a repeating fashion. If this were just python, I would probably use itertools' cycle function. I don't know how to do this in pyspark.
names = ['Julia', 'Tim', 'Zoe']
My dataframe looks like this:
+-----+------+
| id_A| idx_B|
+-----+------+
| a| 0|
| b| 0|
| b| 2|
| b| 2|
| b| 2|
| b| 2|
+-----+------+
I want it to look like this:
+-----+------+--------+
| id_A| idx_B| names |
+-----+------+--------+
| a| 0| Julia|
| b| 0| Tim|
| b| 2| Zoe|
| b| 2| Julia|
| b| 2| Tim|
| b| 2| Zoe|
+-----+------+--------+
Here's one way.
1 - add a unique incremental id for your dataframe:
df = spark.createDataFrame(
df.rdd.zipWithIndex().map(lambda x: Row(*x[0], x[1]))
).toDF("id_A", "idx_B", "id")
df.show()
#+----+-----+---+
#|id_A|idx_B| id|
#+----+-----+---+
#| a| 0| 0|
#| b| 0| 1|
#| b| 2| 2|
#| b| 2| 3|
#| b| 2| 4|
#| b| 2| 5|
#+----+-----+---+
2 - create dataframe from the list of names:
names_df = spark.createDataFrame([(idx, name) for idx, name in enumerate(names)], ["name_id", "names"])
3 - join using modulo 3 (length of names list) in condition:
from pyspark.sql import functions as F
# Cycle through the list: the row with index i is paired with
# names[i % len(names)], so the join keeps working if the list length changes.
result = df.join(
    names_df,
    F.col("id") % len(names) == F.col("name_id")
).orderBy("id").drop("id", "name_id")
result.show()
#+----+-----+-----+
#|id_A|idx_B|names|
#+----+-----+-----+
#| a| 0|Julia|
#| b| 0| Tim|
#| b| 2| Zoe|
#| b| 2|Julia|
#| b| 2| Tim|
#| b| 2| Zoe|
#+----+-----+-----+

how can I create a pyspark udf using multiple columns?

I need to write some custom code using multiple columns within a group of my data.
My custom code is to set a flag if a value is over a threshold, but suppress the flag if it is within a certain time of a previous flag.
Here is some sample code:
df = spark.createDataFrame(
[
("a", 1, 0),
("a", 2, 1),
("a", 3, 1),
("a", 4, 1),
("a", 5, 1),
("a", 6, 0),
("a", 7, 1),
("a", 8, 1),
("b", 1, 0),
("b", 2, 1)
],
["group_col","order_col", "flag_col"]
)
df.show()
+---------+---------+--------+
|group_col|order_col|flag_col|
+---------+---------+--------+
| a| 1| 0|
| a| 2| 1|
| a| 3| 1|
| a| 4| 1|
| a| 5| 1|
| a| 6| 0|
| a| 7| 1|
| a| 8| 1|
| b| 1| 0|
| b| 2| 1|
+---------+---------+--------+
from pyspark.sql.functions import udf, col, asc
from pyspark.sql.window import Window
def _suppress(dates=None, alert_flags=None, window=2):
sup_alert_flag = alert_flag
last_alert_date = None
for i, alert_flag in enumerate(alert_flag):
current_date = dates[i]
if alert_flag == 1:
if not last_alert_date:
sup_alert_flag[i] = 1
last_alert_date = current_date
elif (current_date - last_alert_date) > window:
sup_alert_flag[i] = 1
last_alert_date = current_date
else:
sup_alert_flag[i] = 0
else:
alert_flag = 0
return sup_alert_flag
suppress_udf = udf(_suppress, DoubleType())
df_out = df.withColumn("supressed_flag_col", suppress_udf(dates=col("order_col"), alert_flags=col("flag_col"), window=4).Window.partitionBy(col("group_col")).orderBy(asc("order_col")))
df_out.show()
The above fails, but my expected output is the following:
+---------+---------+--------+------------------+
|group_col|order_col|flag_col|supressed_flag_col|
+---------+---------+--------+------------------+
| a| 1| 0| 0|
| a| 2| 1| 1|
| a| 3| 1| 0|
| a| 4| 1| 0|
| a| 5| 1| 0|
| a| 6| 0| 0|
| a| 7| 1| 1|
| a| 8| 1| 0|
| b| 1| 0| 0|
| b| 2| 1| 1|
+---------+---------+--------+------------------+
Editing answer after more thought.
The general problem seems to be that the result of the current row depends upon result of the previous row. In effect, there is a recurrence relationship. I haven't found a good way to implement a recursive UDF in Spark. There are several challenges that result from the assumed distributed nature of the data in Spark which would make this difficult to achieve. At least in my mind. The following solution should work but may not scale for large data sets.
from pyspark.sql import Row
import pyspark.sql.functions as F
import pyspark.sql.types as T
suppress_flag_row = Row("order_col", "flag_col", "res_flag")
def suppress_flag(date_alert_flags, window_size):
    """Compute a suppressed flag for every (order_col, flag_col) entry.

    Entries are processed in ascending ``order_col``. An entry is marked True
    when its ``flag_col`` is 1 and either no entry has been marked yet or its
    ``order_col`` is more than ``window_size`` after the last marked entry;
    all other entries are marked False.

    Args:
        date_alert_flags: Iterable of items indexable by "order_col" and
            "flag_col".
        window_size: Minimum gap (exclusive) required after a marked entry
            before another entry may be marked.

    Returns:
        A list of ``suppress_flag_row(order_col, flag_col, res_flag)`` values
        in ascending ``order_col`` order.
    """
    sorted_alerts = sorted(date_alert_flags, key=lambda x: x["order_col"])
    res_flags = []
    last_alert_date = None
    for row in sorted_alerts:
        current_date = row["order_col"]
        aflag = row["flag_col"]
        # Compare against None explicitly: an order_col of 0 is a valid
        # "last alert" and must not be treated as "no alert yet".
        keep = aflag == 1 and (
            last_alert_date is None
            or (current_date - last_alert_date) > window_size
        )
        if keep:
            last_alert_date = current_date
        res_flags.append(suppress_flag_row(current_date, aflag, keep))
    return res_flags
# Fields of the struct collected per row: (order_col, flag_col).
in_fields = [
    T.StructField("order_col", T.IntegerType(), nullable=True),
    T.StructField("flag_col", T.IntegerType(), nullable=True),
]
# Build a new list instead of aliasing in_fields, so adding res_flag to the
# output schema does not also grow the input field list.
out_fields = in_fields + [T.StructField("res_flag", T.BooleanType(), nullable=True)]
out_schema = T.StructType(out_fields)
suppress_udf = F.udf(suppress_flag, T.ArrayType(out_schema) )
window_size = 4
tmp = df.groupBy("group_col").agg( F.collect_list( F.struct( F.col("order_col"), F.col("flag_col") ) ).alias("date_alert_flags"))
tmp2 = tmp.select(F.col("group_col"), suppress_udf(F.col("date_alert_flags"), F.lit(window_size)).alias("suppress_res"))
expand_fields = [F.col("group_col")] + [F.col("res_expand")[f.name].alias(f.name) for f in out_fields]
final_df = tmp2.select(F.col("group_col"), F.explode(F.col("suppress_res")).alias("res_expand")).select( expand_fields )
I don't think you need a custom function for this. You can use the rowsBetween option with a window to look at the preceding 5 rows. Please check and let me know if I missed something.
>>> from pyspark.sql import functions as F
>>> from pyspark.sql import Window
>>> w = Window.partitionBy('group_col').orderBy('order_col').rowsBetween(-5,-1)
>>> df = df.withColumn('supr_flag_col',F.when(F.sum('flag_col').over(w) == 0,1).otherwise(0))
>>> df.orderBy('group_col','order_col').show()
+---------+---------+--------+-------------+
|group_col|order_col|flag_col|supr_flag_col|
+---------+---------+--------+-------------+
| a| 1| 0| 0|
| a| 2| 1| 1|
| a| 3| 1| 0|
| b| 1| 0| 0|
| b| 2| 1| 1|
+---------+---------+--------+-------------+

Resources