how can I create a pyspark udf using multiple columns? - apache-spark

I need to write some custom code using multiple columns within a group of my data.
My custom code is to set a flag if a value is over a threshold, but suppress the flag if it is within a certain time of a previous flag.
Here is some sample code:
df = spark.createDataFrame(
    [
        ("a", 1, 0),
        ("a", 2, 1),
        ("a", 3, 1),
        ("a", 4, 1),
        ("a", 5, 1),
        ("a", 6, 0),
        ("a", 7, 1),
        ("a", 8, 1),
        ("b", 1, 0),
        ("b", 2, 1)
    ],
    ["group_col", "order_col", "flag_col"]
)
df.show()
+---------+---------+--------+
|group_col|order_col|flag_col|
+---------+---------+--------+
| a| 1| 0|
| a| 2| 1|
| a| 3| 1|
| a| 4| 1|
| a| 5| 1|
| a| 6| 0|
| a| 7| 1|
| a| 8| 1|
| b| 1| 0|
| b| 2| 1|
+---------+---------+--------+
from pyspark.sql.functions import udf, col, asc
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window

def _suppress(dates=None, alert_flags=None, window=2):
    sup_alert_flags = list(alert_flags)
    last_alert_date = None
    for i, alert_flag in enumerate(alert_flags):
        current_date = dates[i]
        if alert_flag == 1:
            if not last_alert_date:
                sup_alert_flags[i] = 1
                last_alert_date = current_date
            elif (current_date - last_alert_date) > window:
                sup_alert_flags[i] = 1
                last_alert_date = current_date
            else:
                sup_alert_flags[i] = 0
        else:
            sup_alert_flags[i] = 0
    return sup_alert_flags

suppress_udf = udf(_suppress, DoubleType())
df_out = df.withColumn("supressed_flag_col", suppress_udf(dates=col("order_col"), alert_flags=col("flag_col"), window=4).Window.partitionBy(col("group_col")).orderBy(asc("order_col")))
df_out.show()
The above fails, but my expected output is the following:
+---------+---------+--------+------------------+
|group_col|order_col|flag_col|supressed_flag_col|
+---------+---------+--------+------------------+
| a| 1| 0| 0|
| a| 2| 1| 1|
| a| 3| 1| 0|
| a| 4| 1| 0|
| a| 5| 1| 0|
| a| 6| 0| 0|
| a| 7| 1| 1|
| a| 8| 1| 0|
| b| 1| 0| 0|
| b| 2| 1| 1|
+---------+---------+--------+------------------+

Editing answer after more thought.
The general problem is that the result for the current row depends on the result of the previous row; in effect, there is a recurrence relation. I haven't found a good way to implement a recursive UDF in Spark, and the distributed nature of data in Spark makes this difficult to achieve, at least in my mind. The following solution should work but may not scale for large data sets.
from pyspark.sql import Row
import pyspark.sql.functions as F
import pyspark.sql.types as T

suppress_flag_row = Row("order_col", "flag_col", "res_flag")

def suppress_flag(date_alert_flags, window_size):
    sorted_alerts = sorted(date_alert_flags, key=lambda x: x["order_col"])
    res_flags = []
    last_alert_date = None
    for row in sorted_alerts:
        current_date = row["order_col"]
        aflag = row["flag_col"]
        if aflag == 1 and (not last_alert_date or (current_date - last_alert_date) > window_size):
            res = suppress_flag_row(current_date, aflag, True)
            last_alert_date = current_date
        else:
            res = suppress_flag_row(current_date, aflag, False)
        res_flags.append(res)
    return res_flags

in_fields = [T.StructField("order_col", T.IntegerType(), nullable=True)]
in_fields.append(T.StructField("flag_col", T.IntegerType(), nullable=True))

out_fields = list(in_fields)  # copy, so appending below does not also modify in_fields
out_fields.append(T.StructField("res_flag", T.BooleanType(), nullable=True))
out_schema = T.StructType(out_fields)

suppress_udf = F.udf(suppress_flag, T.ArrayType(out_schema))

window_size = 4
tmp = df.groupBy("group_col").agg(F.collect_list(F.struct(F.col("order_col"), F.col("flag_col"))).alias("date_alert_flags"))
tmp2 = tmp.select(F.col("group_col"), suppress_udf(F.col("date_alert_flags"), F.lit(window_size)).alias("suppress_res"))
expand_fields = [F.col("group_col")] + [F.col("res_expand")[f.name].alias(f.name) for f in out_fields]
final_df = tmp2.select(F.col("group_col"), F.explode(F.col("suppress_res")).alias("res_expand")).select(expand_fields)
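If you want the exact supressed_flag_col shown in the expected output, a small follow-up sketch (assuming the final_df produced above) can cast the boolean res_flag back to a 0/1 column:
# hedged follow-up: convert the boolean result into the 0/1 flag from the expected output
df_out = (final_df
          .withColumn("supressed_flag_col", F.col("res_flag").cast("integer"))
          .drop("res_flag")
          .orderBy("group_col", "order_col"))
df_out.show()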

I think you don't need a custom function for this. You can use the rowsBetween option along with a window to get the 5-row range. Please check and let me know if I missed something.
>>> from pyspark.sql import functions as F
>>> from pyspark.sql import Window
>>> w = Window.partitionBy('group_col').orderBy('order_col').rowsBetween(-5,-1)
>>> df = df.withColumn('supr_flag_col',F.when(F.sum('flag_col').over(w) == 0,1).otherwise(0))
>>> df.orderBy('group_col','order_col').show()
+---------+---------+--------+-------------+
|group_col|order_col|flag_col|supr_flag_col|
+---------+---------+--------+-------------+
| a| 1| 0| 0|
| a| 2| 1| 1|
| a| 3| 1| 0|
| b| 1| 0| 0|
| b| 2| 1| 1|
+---------+---------+--------+-------------+

Related

PySpark: Case When Groupby

Consider this MWE:
df = spark.createDataFrame([('A', 5, 0),('A',6, 0),('B',3, 0)], ['id', 'value', 'currentVersion'])
+---+-----+--------------+
| id|value|currentVersion|
+---+-----+--------------+
| A| 5| 0|
| A| 6| 0|
| B| 3| 0|
+---+-----+--------------+
With this expected output
#+---+-----+--------------+
#| id|value|currentVersion|
#+---+-----+--------------+
#|  A|    5|             0|
#|  A|    6|             1|
#|  B|    0|             0|
#+---+-----+--------------+
How can I get to the expected output while relying on groupby?
This works well for my other purposes, but fails as I need to incorporate groupby:
from pyspark.sql import functions as F
from pyspark.sql.functions import when

valueWhenTrue = 1
valueWhenFalse = 0

df = df.withColumn(
    "currentVersion",
    when(
        F.col("TimeStamp") == df.agg({"TimeStamp": "max"}).collect()[0][0],
        valueWhenTrue
    ).otherwise(valueWhenFalse)
)
Found an answer that works for me:
from pyspark.sql import functions as F
from pyspark.sql.functions import when
from pyspark.sql.window import Window

# groupby -- find max time per ticket
window_var = Window().partitionBy('TicketNumber')
df = df.withColumn('maxModified', F.max('Modified').over(window_var))

# case when
valueWhenTrue = 1
valueWhenFalse = 0
df = df.withColumn(
    "currentVersion",
    when(
        F.col("maxModified") == F.col('Modified'),
        valueWhenTrue
    ).otherwise(valueWhenFalse)
)
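Since the question specifically asks about groupby, here is a hedged alternative sketch (same assumed TicketNumber and Modified columns as above) that uses groupBy plus a join instead of a window:
# groupby -- find max time per ticket, then join it back onto the original rows
max_per_ticket = df.groupBy("TicketNumber").agg(F.max("Modified").alias("maxModified"))
df = (df.join(max_per_ticket, on="TicketNumber", how="left")
        .withColumn("currentVersion",
                    when(F.col("Modified") == F.col("maxModified"),
                         valueWhenTrue).otherwise(valueWhenFalse)))
Both approaches should give the same result; the window version just avoids the extra join.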

Grouping consecutive rows where date difference is 1 day

I have managed to get the dataframe with these columns:
+----------+----------+--------+
| date| next_date|datediff|
+----------+----------+--------+
|2020-09-25|2020-09-30| 5|
|2020-09-30|2020-10-01| 1|
|2020-10-01|2020-10-02| 1|
|2020-10-02|2020-10-03| 1|
|2020-10-03|2020-10-04| 1|
|2020-10-09|2020-11-23| 45|
|2020-11-23|2020-11-24| 1|
|2020-11-24|2020-11-25| 1|
|2020-11-25|2020-11-26| 1|
+----------+----------+--------+
I got the 'group' column by doing these commands:
w1 = Window.orderBy("date")
df_dates = df_dates.withColumn(
    "dateChange",
    (F.col("datediff") != F.lit(1)).cast("int")
)\
.fillna(
    0,
    subset=["dateChange"]
)\
.withColumn(
    "indicator",
    (~(F.col("dateChange") == 0)).cast("int")
)\
.withColumn(
    "group",
    F.sum(F.col("indicator")).over(w1.rangeBetween(Window.unboundedPreceding, 0))
)
and finally got these groupings:
+----------+----------+--------+----------+---------+-----+
| date| next_date|datediff|dateChange|indicator|group|
+----------+----------+--------+----------+---------+-----+
|2020-09-25|2020-09-30| 5| 1| 1| 1|
|2020-09-30|2020-10-01| 1| 0| 0| 1|
|2020-10-01|2020-10-02| 1| 0| 0| 1|
|2020-10-02|2020-10-03| 1| 0| 0| 1|
|2020-10-03|2020-10-04| 1| 0| 0| 1|
|2020-10-09|2020-11-23| 45| 1| 1| 2|
|2020-11-23|2020-11-24| 1| 0| 0| 2|
|2020-11-24|2020-11-25| 1| 0| 0| 2|
|2020-11-25|2020-11-26| 1| 0| 0| 2|
+----------+----------+--------+----------+---------+-----+
However, the first row should have its own group. The second row should be group 2 (all incremented by 1).
Then I do the aggregation:
df_dates.groupBy("group")\
    .agg(
        F.min("next_date").alias("start_time"),
        F.max("next_date").alias("end_time")
    )\
    .drop("group")\
    .show()
+----------+----------+
|start_time| end_time|
+----------+----------+
|2020-09-30|2020-10-04|
|2020-11-23|2020-11-26|
+----------+----------+
But I am missing the first group which is 2020-09-25.
The aim for this is to get the ranges for consecutive dates to help me combine HDFS folders with consecutive dates into the same partition.
Rewritten example data as python script:
from pyspark.sql import functions as F, Window as W
df = spark.createDataFrame(
[('2020-09-25', '2020-09-30', 5),
('2020-09-30', '2020-10-01', 1),
('2020-10-01', '2020-10-02', 1),
('2020-10-02', '2020-10-03', 1),
('2020-10-03', '2020-10-04', 1),
('2020-10-09', '2020-11-23', 45),
('2020-11-23', '2020-11-24', 1),
('2020-11-24', '2020-11-25', 1),
('2020-11-25', '2020-11-26', 1)],
["date", "next_date", "datediff"])
The following creates groups using window functions lag and sum:
w = W.orderBy("date")
# _flg marks the rows where a new subgroup must start
df = df.withColumn("_flg", F.coalesce(F.when(F.col("datediff") != F.lag("datediff").over(w), 1), F.lit(0)))
df = df.withColumn("_grp", F.sum("_flg").over(w))
df.show()
# +----------+----------+--------+----+----+
# | date| next_date|datediff|_flg|_grp|
# +----------+----------+--------+----+----+
# |2020-09-25|2020-09-30| 5| 0| 0|
# |2020-09-30|2020-10-01| 1| 1| 1|
# |2020-10-01|2020-10-02| 1| 0| 1|
# |2020-10-02|2020-10-03| 1| 0| 1|
# |2020-10-03|2020-10-04| 1| 0| 1|
# |2020-10-09|2020-11-23| 45| 1| 2|
# |2020-11-23|2020-11-24| 1| 1| 3|
# |2020-11-24|2020-11-25| 1| 0| 3|
# |2020-11-25|2020-11-26| 1| 0| 3|
# +----------+----------+--------+----+----+
Lastly, grouping using the created "_grp" column and others when applicable:
df = (df
      .groupBy("_grp")
      .agg(
          F.min("date").alias("start_time"),
          F.max("next_date").alias("end_time")
      ).drop("_grp")
)
df.show()
# +----------+----------+
# |start_time| end_time|
# +----------+----------+
# |2020-09-25|2020-09-30|
# |2020-09-30|2020-10-04|
# |2020-10-09|2020-11-23|
# |2020-11-23|2020-11-26|
# +----------+----------+
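For context, a hedged sketch of how the date, next_date and datediff columns themselves could be derived from a plain one-column DataFrame of date strings (the raw_dates_df name is an assumption, not from the question):
w0 = W.orderBy("date")
df_dates = (raw_dates_df
            .withColumn("date", F.to_date("date"))
            .withColumn("next_date", F.lead("date").over(w0))
            .withColumn("datediff", F.datediff("next_date", "date"))
            .dropna(subset=["next_date"]))  # the last date has no next_date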

How to set the value of a Pyspark column based on two conditions of the value of another column

Say I have a dataframe:
+---+---+---+
| id|foo|bar|
+---+---+---+
|  1|baz|  0|
|  2|baz|  0|
|  3|333|  2|
|  4|444|  1|
+---+---+---+
I want to set the 'foo' column to a value depending on the value of bar.
If bar is 2, set the value of foo for that row to 'X';
else if bar is 1, set the value of foo for that row to 'Y'.
If neither condition is met, leave the foo value as it is.
pyspark.sql.functions.when seems like the closest method, but it doesn't seem to work based on another column's value.
when can work with other columns. You can use F.col to get the value of the other column and provide an appropriate condition:
import pyspark.sql.functions as F
df2 = df.withColumn(
    'foo',
    F.when(F.col('bar') == 2, 'X')
     .when(F.col('bar') == 1, 'Y')
     .otherwise(F.col('foo'))
)
df2.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
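The same logic can also be written as a single SQL CASE expression via F.expr, which some find easier to read; this is just an equivalent sketch of the chained when above:
df2 = df.withColumn(
    'foo',
    F.expr("CASE WHEN bar = 2 THEN 'X' WHEN bar = 1 THEN 'Y' ELSE foo END")
)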
We can solve this using when or a UDF in Spark to set the new column value based on a condition.
Create Sample DataFrame:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('AddConditionalColumn').getOrCreate()
data = [(1,"baz",0),(2,"baz",0),(3,"333",2),(4,"444",1)]
columns = ["id","foo","bar"]
df = spark.createDataFrame(data = data, schema = columns)
df.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3|333| 2|
| 4|444| 1|
+---+---+---+
Using When:
from pyspark.sql.functions import when
df2 = df.withColumn("foo", when(df.bar == 2,"X")
.when(df.bar == 1,"Y")
.otherwise(df.foo))
df2.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
| 1|baz| 0|
| 2|baz| 0|
| 3| X| 2|
| 4| Y| 1|
+---+---+---+
Using UDF:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

def executeRule(bar, foo):
    if bar == 2:
        return 'X'
    elif bar == 1:
        return 'Y'
    else:
        return foo

# Converting function to UDF; both columns are passed so that
# rows matching neither condition keep their original foo value
ruleUDF = F.udf(executeRule, StringType())

df3 = df.withColumn("foo", ruleUDF("bar", "foo"))
df3.show()
+---+---+---+
| id|foo|bar|
+---+---+---+
|  1|baz|  0|
|  2|baz|  0|
|  3|  X|  2|
|  4|  Y|  1|
+---+---+---+

Extracting value using Window and Partition

I have a dataframe in pyspark
id | value
 1 | 0
 1 | 1
 1 | 0
 2 | 1
 2 | 0
 3 | 0
 3 | 0
 3 | 1
I want to extract all the rows from the first occurrence of 1 in the value column onward, within the same id group. I have created a Window partitioned by id, but I do not know how to get the rows that come after that first 1.
I'm expecting the result to be:
id | value
 1 | 1
 1 | 0
 2 | 1
 2 | 0
 3 | 1
The solution below may be relevant for this (it works perfectly for small data but may cause problems on big data if an id spans multiple partitions).
df = sqlContext.createDataFrame(
    [
        [1, 0],
        [1, 1],
        [1, 0],
        [2, 1],
        [2, 0],
        [3, 0],
        [3, 0],
        [3, 1]
    ],
    ['id', 'Value']
)
df.show()
+---+-----+
| id|Value|
+---+-----+
| 1| 0|
| 1| 1|
| 1| 0|
| 2| 1|
| 2| 0|
| 3| 0|
| 3| 0|
| 3| 1|
+---+-----+
# importing libraries
from pyspark.sql import functions as F
from pyspark.sql.window import Window as W
import sys

# This way we can generate a cumulative sum of the values per id
df.withColumn(
    "sum",
    F.sum("value").over(W.partitionBy(["id"]).rowsBetween(-sys.maxsize, 0))
).show()
+---+-----+---+
| id|Value|sum|
+---+-----+---+
|  1|    0|  0|
|  1|    1|  1|
|  1|    0|  1|
|  3|    0|  0|
|  3|    0|  0|
|  3|    1|  1|
|  2|    1|  1|
|  2|    0|  1|
+---+-----+---+

# Filter all rows having sum > 0
df.withColumn(
    "sum",
    F.sum("value").over(W.partitionBy(["id"]).rowsBetween(-sys.maxsize, 0))
).where("sum > 0").show()
+---+-----+---+
| id|Value|sum|
+---+-----+---+
|  1|    1|  1|
|  1|    0|  1|
|  3|    1|  1|
|  2|    1|  1|
|  2|    0|  1|
+---+-----+---+
Before running this, you must be sure that the data for each id is on a single partition, i.e. no id is spread across two partitions.
Ideally, you would need to:
Create a window partitioned by id and ordered the same way the dataframe already is
Keep only the rows for which there is a "one" before them in the window
AFAIK, there is no lookup function within windows in Spark. Yet, you could follow this idea and work something out. Let's first create the data and import functions and windows.
import pyspark.sql.functions as F
from pyspark.sql.window import Window
l = [(1, 0), (1, 1), (1, 0), (2, 1), (2, 0), (3, 0), (3, 0), (3, 1)]
df = spark.createDataFrame(l, ['id', 'value'])
Then, let's add an index on the dataframe (it's free) to be able to order the windows.
indexedDf = df.withColumn("index", F.monotonically_increasing_id())
Then we create a window that only looks at the values before the current row, ordered by that index and partitioned by id.
w = Window.partitionBy("id").orderBy("index").rowsBetween(Window.unboundedPreceding, 0)
Finally, we use that window to collect the set of preceding values of each row, and filter out the ones that do not contain 1. Optionally, we order back by index because the windowing does not preserve the order by id column.
indexedDf\
.withColumn('set', F.collect_set(F.col('value')).over(w))\
.where(F.array_contains(F.col('set'), 1))\
.orderBy("index")\
.select("id", "value").show()
+---+-----+
| id|value|
+---+-----+
| 1| 1|
| 1| 0|
| 2| 1|
| 2| 0|
| 3| 1|
+---+-----+
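A hedged alternative sketch under the same setup: a running max of value over the same window w marks every row at or after the first 1 in its id group, which avoids collecting a set per row:
indexedDf\
    .withColumn('seen_one', F.max('value').over(w))\
    .where(F.col('seen_one') == 1)\
    .orderBy('index')\
    .select('id', 'value').show()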

How to create data frame from list in pyspark without using for loop?

I have a list like below:
rrr=[[(1,(3,1)),(2, (3,2)),(3, (3, 2)),(1,(4,1)),(2, (4,2))]]
df_input = []
and next I defined header like below:
df_header=['sid', 'tid', 'srank']
Using a for loop, I append the data to the empty list:
for i in rrr:
    for j in i:
        df_input.append((j[0], j[1][0], j[1][1]))
df_input
Output: [(1, 3, 1), (2, 3, 2), (3, 3, 2), (1, 4, 1), (2, 4, 2)]
Then I create the DataFrame like below:
df = spark.createDataFrame(df_input, df_header)
df.show()
+---+---+-----+
|sid|tid|srank|
+---+---+-----+
|  1|  3|    1|
|  2|  3|    2|
|  3|  3|    2|
|  1|  4|    1|
|  2|  4|    2|
+---+---+-----+
Now my question is: how can I create the DataFrame without using an explicit for loop (like above)? The input list contains more than 1 lakh (100,000) records.
Once you realize that your initial list is a nested one, i.e. an actual list as the single element of an outer one, you'll see that the solution comes easily by taking only its first (and only) element into consideration:
spark.version
# u'2.1.1'
from pyspark.sql import Row
# your exact data:
rrr=[[(1,(3,1)),(2, (3,2)),(3, (3, 2)),(1,(4,1)),(2, (4,2))]]
df_header=['sid', 'tid', 'srank']
df = sc.parallelize(rrr[0]).map(lambda x: Row(x[0], x[1][0],x[1][1])).toDF(schema=df_header)
df.show()
# +---+---+-----+
# |sid|tid|srank|
# +---+---+-----+
# | 1| 3| 1|
# | 2| 3| 2|
# | 3| 3| 2|
# | 1| 4| 1|
# | 2| 4| 2|
# +---+---+-----+
Solution one: introduce the toDF() transformation (but with the input modified):
from pyspark.sql import Row
ar=[[1,(3,1)],[2, (3,2)],[3, (3,2)]]
sc.parallelize(ar).map(lambda x: Row(sid=x[0], tid=x[1][0],srank=x[1][1])).toDF().show()
+---+-----+---+
|sid|srank|tid|
+---+-----+---+
| 1| 1| 3|
| 2| 2| 3|
| 3| 2| 3|
+---+-----+---+
Solution 2: with the requested input matrix, use a list comprehension plus numpy flatten and reshape:
import numpy as np
x=[[(1,(3,1)),(2, (3,2)),(3, (3, 2))]]
ar=[[(j[0],j[1][0],j[1][1]) for j in i] for i in x]
flat=np.array(ar).flatten()
flat = flat.reshape(len(flat) // 3, 3)  # integer division so reshape gets an int in Python 3
sc.parallelize(flat).map(lambda x: Row(sid=int(x[0]),tid=int(x[1]),srank=int(x[2]))).toDF().show()
+---+-----+---+
|sid|srank|tid|
+---+-----+---+
| 1| 1| 3|
| 2| 2| 3|
| 3| 2| 3|
+---+-----+---+
#works also with N,M matrix
number_columns=3
x=[[(1,(3,1)),(2, (3,2)),(3, (3, 2))],[(5,(6,7)),(8, (9,10)),(11, (12, 13))]]
ar=[[(j[0],j[1][0],j[1][1]) for j in i] for i in x]
flat=np.array(ar).flatten()
flat=flat.reshape(int(len(flat)/number_columns), number_columns)
sc.parallelize(flat).map(lambda x: Row(sid=int(x[0]),tid=int(x[1]),srank=int(x[2]))).toDF().show()
+---+-----+---+
|sid|srank|tid|
+---+-----+---+
| 1| 1| 3|
| 2| 2| 3|
| 3| 2| 3|
| 5| 7| 6|
| 8| 10| 9|
| 11| 13| 12|
+---+-----+---+
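As a hedged alternative sketch for the original nested rrr, flatMap unrolls the outer list on the executors, so no driver-side loop or NumPy reshaping is needed:
df = (sc.parallelize(rrr)
        .flatMap(lambda inner: inner)
        .map(lambda x: Row(sid=x[0], tid=x[1][0], srank=x[1][1]))
        .toDF())
df.show()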
