Rolling average of max value in sub-group in pyspark - apache-spark

I am trying to calculate a moving average for each group over a window, but I only want to include the max value (over the window) of each sub-group in the calculation. Here is my sample data.
df = spark.createDataFrame(
[(1, 'a', 1, 5.0),
(1, 'a', 2, 10.0),
(1, 'a', 3, 25.0),
(1, 'a', 4, 50.0),
(1, 'a', 5, 75.0),
(1, 'b', 3, 100.0),
(1, 'b', 4, 30.0),
(1, 'b', 5, 60.0),
(1, 'b', 6, 90.0),
(1, 'b', 7, 120.0),
(2, 'c', 1, 200.0),
(2, 'c', 2, 400.0),
(2, 'c', 3, 600.0),
(2, 'c', 4, 800.0),
(2, 'c', 5, 1000.0),
(2, 'c', 6, 1200.0),
(2, 'c', 7, 1300.0),
(2, 'c', 8, 1400.0),
(2, 'd', 5, 150.0),
(2, 'd', 6, 250.0),
(2, 'd', 7, 350.0)],
("group", "sub-group","time", "value"))
I use window function and define window as below
w = Window.partitionBy('group').orderBy('time').rangeBetween(-2, -1)
My expected result is the data frame below. Is there any way to do this calculation?
df = spark.createDataFrame(
[(1, 'a', 1, 5.0, None),
(1, 'a', 2, 10.0, 5.0),
(1, 'a', 3, 25.0, 10.0),
(1, 'a', 4, 50.0, 62.5),
(1, 'a', 5, 75.0, 40.0),
(1, 'b', 3, 100.0, 10.0),
(1, 'b', 4, 30.0, 62.5),
(1, 'b', 5, 60.0, 40.0),
(1, 'b', 6, 90.0, 67.5),
(1, 'b', 7, 120.0, 82.5),
(2, 'c', 1, 200.0, None),
(2, 'c', 2, 400.0, 200.0),
(2, 'c', 3, 600.0, 400.0),
(2, 'c', 4, 800.0, 600.0),
(2, 'c', 5, 1000.0, 800.0),
(2, 'c', 6, 1200.0, 575.0),
(2, 'c', 7, 1300.0, 725.0),
(2, 'c', 8, 1400.0, 825.0),
(2, 'd', 5, 150.0, 800.0),
(2, 'd', 6, 250.0, 575.0),
(2, 'd', 7, 350.0, 725.0)],
("group", "sub-group","time", "value", "avg_max_value"))

I'm not sure I fully understand the whole computation, but I made an attempt (which does not exactly match your manual output):
compute, for each time, the max value for the sub-group
compute the average for each group and time
from pyspark.sql import functions as F, Window
df.withColumn(
"value_1",
F.max("value").over(
Window.partitionBy("group", "sub-group").orderBy("time").rangeBetween(-2, -1)
),
).withColumn(
"value_2", F.avg("value_1").over(Window.partitionBy("group", "time"))
).orderBy(
"group sub-group time".split()
).show()
+-----+---------+----+------+-------+-------+
|group|sub-group|time| value|value_1|value_2|
+-----+---------+----+------+-------+-------+
| 1| a| 1| 5.0| null| null|
| 1| a| 2| 10.0| 5.0| 5.0|
| 1| a| 3| 25.0| 10.0| 10.0|
| 1| a| 4| 50.0| 25.0| 62.5|
| 1| a| 5| 75.0| 50.0| 75.0|
| 1| b| 3| 100.0| null| 10.0|
| 1| b| 4| 30.0| 100.0| 62.5|
| 1| b| 5| 60.0| 100.0| 75.0|
| 1| b| 6| 90.0| 60.0| 60.0|
| 1| b| 7| 120.0| 90.0| 90.0|
| 2| c| 1| 200.0| null| null|
| 2| c| 2| 400.0| 200.0| 200.0|
| 2| c| 3| 600.0| 400.0| 400.0|
| 2| c| 4| 800.0| 600.0| 600.0|
| 2| c| 5|1000.0| 800.0| 800.0|
| 2| c| 6|1200.0| 1000.0| 575.0|
| 2| c| 7|1300.0| 1200.0| 725.0|
| 2| c| 8|1400.0| 1300.0| 1300.0|
| 2| d| 5| 150.0| null| 800.0|
| 2| d| 6| 250.0| 150.0| 575.0|
| 2| d| 7| 350.0| 250.0| 725.0|
+-----+---------+----+------+-------+-------+

Related

How to randomize different numbers for subgroup of rows pyspark

I have a pyspark dataframe. I need to randomize values taken from list for all rows within given condition. I did:
df = df.withColumn('rand_col', f.when(f.col('condition_col') == condition, random.choice(my_list)))
but the effect is, that it randomizes only one value and assigns it to all rows:
How can I randomize separately for each row?
You can:
use rand and floor from pyspark.sql.functions to create a random indexing column to index into your my_list
create a column in which the my_list value is repeated
index into that column using f.col
It would look something like this:
import pyspark.sql.functions as f
my_list = [1, 2, 30]
df = spark.createDataFrame(
[
(1, 0),
(2, 1),
(3, 1),
(4, 0),
(5, 1),
(6, 1),
(7, 0),
],
["id", "condition"]
)
df = df.withColumn('rand_index', f.when(f.col('condition') == 1, f.floor(f.rand() * len(my_list))))\
.withColumn('my_list', f.array([f.lit(x) for x in my_list]))\
.withColumn('rand_value', f.when(f.col('condition') == 1, f.col("my_list")[f.col("rand_index")]))
df.show()
+---+---------+----------+----------+----------+
| id|condition|rand_index| my_list|rand_value|
+---+---------+----------+----------+----------+
| 1| 0| null|[1, 2, 30]| null|
| 2| 1| 0|[1, 2, 30]| 1|
| 3| 1| 2|[1, 2, 30]| 30|
| 4| 0| null|[1, 2, 30]| null|
| 5| 1| 1|[1, 2, 30]| 2|
| 6| 1| 2|[1, 2, 30]| 30|
| 7| 0| null|[1, 2, 30]| null|
+---+---------+----------+----------+----------+

Spark dataframe filter with boolean list comprehension

I want to filter a spark dataframe sdf based on several columns being not null.
Imagine I have:
# One entry per column to test; the original literal had a misplaced quote
# ("B, C") that produced only two elements although labels[2] is indexed.
labels = ["A", "B", "C"]
This would work:
sdf.where(sf.col(labels[0]).isNotNull() | sf.col(labels[1]).isNotNull() | sf.col(labels[2]).isNotNull())
But I would like to do something similar to a list comprehension if the list was much longer:
sdf.where(any([sf.col(l).isNotNull() for l in labels]))
(this does not work, {ValueError}Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.)
How can I achieve this?
You can use reduce from functools to iterate over your list of columns and apply your logic.
In your case, it looks like you want to grab all the rows where any column has a non-null value (so full null value rows should get filtered away).
from functools import reduce
import pyspark.sql.functions as F
labels = ["A", "B", "C"]
df = spark.createDataFrame(
[
(None, 1, "ABC"),
(1, None, "BCD"),
(None, None, None),
(2, 2, None),
(1, 3, "DEF"),
(2, 1, "EFG"),
(None, None, None),
(2, 2, None),
(None, 3, "HIJ"),
(None, None, None),
(2, 2, None),
(3, 1, "EFG"),
(3, 2, None),
(None, None, None),
(2, 2, None),
(3, 3, "HIJ"),
],
["A", "B", "C"]
)
df.filter(reduce(lambda x, y: x | y, (F.col(x).isNotNull() for x in labels))).show()
+----+----+----+
| A| B| C|
+----+----+----+
|null| 1| ABC|
| 1|null| BCD|
| 2| 2|null|
| 1| 3| DEF|
| 2| 1| EFG|
| 2| 2|null|
|null| 3| HIJ|
| 2| 2|null|
| 3| 1| EFG|
| 3| 2|null|
| 2| 2|null|
| 3| 3| HIJ|
+----+----+----+
As you can see, the rows with all null values are correctly filtered away. This is done by OR-ing the isNotNull() conditions.

PySpark max value for multiple columns

I have the below table:
df = spark.createDataFrame(
[('a', 1, 11, 44),
('b', 2, 21, 33),
('a', 2, 10, 40),
('c', 5, 55, 45),
('b', 4, 22, 35),
('a', 3, 9, 45)],
['id', 'left', 'right', 'centre'])
I need to find and display only the max values as shown below:
[![enter image description here](https://i.stack.imgur.com/q8bGq.png)](https://i.stack.imgur.com/q8bGq.png)
Simple groupBy and agg:
from pyspark.sql import functions as F
df = df.groupBy('id').agg(
F.max('left').alias('max_left'),
F.max('right').alias('max_right'),
F.max('centre').alias('max_centre'),
)
df.show()
# +---+--------+---------+----------+
# | id|max_left|max_right|max_centre|
# +---+--------+---------+----------+
# | b| 4| 22| 35|
# | a| 3| 11| 45|
# | c| 5| 55| 45|
# +---+--------+---------+----------+
Or slightly more advanced:
df = df.groupBy('id').agg(
*[F.max(c).alias(f'max_{c}') for c in df.columns if c != 'id']
)

Rank values in Spark on a column based on previous values

I have a dataframe like this:
df = spark.createDataFrame(
[
(dt.datetime(2021, 5, 1, 10, 30, 0), 2.15, "a"),
(dt.datetime(2021, 5, 1, 10, 30, 10), 2.12, "a"),
(dt.datetime(2021, 5, 1, 10, 30, 20), 2.13, "a"),
(dt.datetime(2021, 5, 1, 10, 30, 50), 2.14, "a"),
(dt.datetime(2021, 5, 1, 10, 31, 5), 2.13, "a"),
(dt.datetime(2021, 5, 1, 10, 31, 10), 2.16, "a"),
(dt.datetime(2021, 5, 1, 10, 31, 10), 2.16, "b"),
],
["ts", "value", "group"]
)
I want to get the rank of the value column, using all the previous values (ordered by the timestamp ts). For example:
+-------------------+-----+-----+----+
| ts|value|group|rank|
+-------------------+-----+-----+----+
|2021-05-01 10:30:00| 2.15| a| 1|
|2021-05-01 10:30:10| 2.12| a| 1|
|2021-05-01 10:30:20| 2.13| a| 2|
|2021-05-01 10:30:50| 2.14| a| 3|
|2021-05-01 10:31:05| 2.13| a| 2|
|2021-05-01 10:31:10| 2.16| a| 5|
|2021-05-01 10:31:10| 2.16| b| 1|
+-------------------+-----+-----+----+
I tried the following code:
w = (
Window
.partitionBy("group")
.orderBy("ts")
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df.select(
"*",
f.rank().over(w).alias("rank")
).show()
but it basically ranks the rows only by the timestamp.
Any idea how to do it?
rank function ranks the data by orderBy clause, so you cannot rank it by another column. You can use this as an alternative
df = (df
.withColumn("rank", F.array_sort(F.collect_set('value').over(w)))
.withColumn('rank', F.expr("array_position(rank, value)")))
df.show()
+-------------------+-----+-----+----+
| ts|value|group|rank|
+-------------------+-----+-----+----+
|2021-05-01 10:31:10| 2.16| b| 1|
|2021-05-01 10:30:00| 2.15| a| 1|
|2021-05-01 10:30:10| 2.12| a| 1|
|2021-05-01 10:30:20| 2.13| a| 2|
|2021-05-01 10:30:50| 2.14| a| 3|
|2021-05-01 10:31:05| 2.13| a| 2|
|2021-05-01 10:31:10| 2.16| a| 5|
+-------------------+-----+-----+----+
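Because collect_set drops duplicate values, the result above is numbered like dense_rank (ties leave no gaps). If you want rank-style numbering, with gaps after ties, use collect_list instead.
Change your orderBy() column to value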
import datetime as dt
df = spark.createDataFrame(
[
(dt.datetime(2021, 5, 1, 10, 30, 0), 2.15, "a"),
(dt.datetime(2021, 5, 1, 10, 30, 10), 2.12, "a"),
(dt.datetime(2021, 5, 1, 10, 30, 20), 2.13, "a"),
(dt.datetime(2021, 5, 1, 10, 30, 50), 2.14, "a"),
(dt.datetime(2021, 5, 1, 10, 31, 5), 2.13, "a"),
(dt.datetime(2021, 5, 1, 10, 31, 10), 2.16, "b"),
(dt.datetime(2021, 5, 1, 10, 31, 11), 2.17, "b"),
],
["ts", "value", "group"]
)
w = (
W
.partitionBy("group")
.orderBy("value")
)
df.select(
"*",
F.rank().over(w).alias("rank")
).show()
+-------------------+-----+-----+----+
| ts|value|group|rank|
+-------------------+-----+-----+----+
|2021-05-01 10:30:10| 2.12| a| 1|
|2021-05-01 10:30:20| 2.13| a| 2|
|2021-05-01 10:31:05| 2.13| a| 2|
|2021-05-01 10:30:50| 2.14| a| 4|
|2021-05-01 10:30:00| 2.15| a| 5|
|2021-05-01 10:31:10| 2.16| b| 1|
|2021-05-01 10:31:11| 2.17| b| 2|
+-------------------+-----+-----+----+

Descriptive statistics on values outside of group for each group

I have a Spark DataFrame like this:
edited: each name can appear multiple times, in any org.
df = sqlContext.createDataFrame(
[
('org_1', 'a', 1),
('org_1', 'a', 2),
('org_1', 'a', 3),
('org_1', 'b', 4),
('org_1', 'c', 5),
('org_2', 'a', 7),
('org_2', 'd', 4),
('org_2', 'e', 5),
('org_2', 'e', 10)
],
["org", "name", "value"]
)
I would like to calculate for each org and name: the mean, stddev and count of values from the rest of the names excluding that name within each org. E.g. For org_1, name b, mean = (1+2+3+5)/4
The DataFrame has ~450 million rows. I cannot use vectorized pandas_UDF because my Spark version is 2.2. There is also a constraint of spark.driver.maxResultSize of 4.0 GB.
I tried this on Pandas (filter rows within groups and take mean/std/count) on a DataFrame with only two columns (name and value). I haven't figured out how to do this with two levels of grouped columns (org and name).
def stats_fun(x):
    """Summarize one ``name`` group and the rows outside that group.

    ``x`` is the sub-frame passed by ``groupby('name').apply``; ``x.name``
    is the group key.  The "anti group" figures are taken from the global
    ``df``, restricted to rows whose ``name`` differs from that key.

    Returns a ``pd.Series`` holding mean, standard deviation and count for
    the group itself (``data_*``) and for everything else (``anti_grp_*``).
    """
    own_values = x['value']
    # Every value that belongs to a different name than this group.
    other_values = df[df['name'] != x.name]['value']

    summary = {
        'data_mean': own_values.mean(),
        'data_std': own_values.std(),
        'data_n': own_values.count(),
        'anti_grp_mean': other_values.mean(),
        'anti_grp_std': other_values.std(),
        'anti_grp_n': other_values.count(),
    }
    return pd.Series(summary)
df.groupby('name').apply(stats_fun)
Is there a similar UDF function I can define on Spark? (This function would have to take in multiple columns). Otherwise, what is a more efficient way to do this?
A simple UDF can also work.
import pyspark.sql.functions as F
import numpy as np
from pyspark.sql.types import *
df = sql.createDataFrame(
[
('org_1', 'a', 1),
('org_1', 'a', 2),
('org_1', 'a', 3),
('org_1', 'b', 4),
('org_1', 'c', 5),
('org_2', 'a', 7),
('org_2', 'd', 4),
('org_2', 'e', 5),
('org_2', 'e', 10)
],
["org", "name", "value"]
)
+-----+----+-----+
| org|name|value|
+-----+----+-----+
|org_1| a| 1|
|org_1| a| 2|
|org_1| a| 3|
|org_1| b| 4|
|org_1| c| 5|
|org_2| a| 7|
|org_2| d| 4|
|org_2| e| 5|
|org_2| e| 10|
+-----+----+-----+
After applying groupby and collecting all elements in list, we apply udf to find statistics. After that, columns are exploded and split into multiple columns.
def _find_stats(a,b):
dict_ = zip(a,b)
stats = []
for name in a:
to_cal = [v for k,v in dict_ if k != name]
stats.append((name,float(np.mean(to_cal))\
,float(np.std(to_cal))\
,len(to_cal)))
print stats
return stats
find_stats = F.udf(_find_stats,ArrayType(ArrayType(StringType())))
cols = ['name', 'mean', 'stddev', 'count']
splits = [F.udf(lambda val:val[0],StringType()),\
F.udf(lambda val:val[1],StringType()),\
F.udf(lambda val:val[2],StringType()),\
F.udf(lambda val:val[3],StringType())]
df = df.groupby('org').agg(*[F.collect_list('name').alias('name'), F.collect_list('value').alias('value')])\
.withColumn('statistics', find_stats(F.col('name'), F.col('value')))\
.drop('name').drop('value')\
.select('org', F.explode('statistics').alias('statistics'))\
.select(['org']+[split_('statistics').alias(col_name) for split_,col_name in zip(splits,cols)])\
.dropDuplicates()
df.show()
+-----+----+-----------------+------------------+-----+
| org|name| mean| stddev|count|
+-----+----+-----------------+------------------+-----+
|org_1| c| 2.5| 1.118033988749895| 4|
|org_2| e| 5.5| 1.5| 2|
|org_2| a|6.333333333333333|2.6246692913372702| 3|
|org_2| d|7.333333333333333|2.0548046676563256| 3|
|org_1| a| 4.5| 0.5| 2|
|org_1| b| 2.75| 1.479019945774904| 4|
+-----+----+-----------------+------------------+-----+
If you also want 'value' column, you can insert that in the tuple in udf function and add one split udf.
Also, since there will be duplicates in the dataframe due to repetition of names, you can remove them using dropDuplicates.
Here is a way to do this using only DataFrame functions.
Just join your DataFrame to itself on the org column and use a where clause to specify that the name column should be different. Then we select the distinct rows of ('l.org', 'l.name', 'r.name', 'r.value') - essentially, we ignore the l.value column because we want to avoid double counting for the same (org, name) pair.
For example, this is how you could collect the other values for each ('org', 'name') pair:
import pyspark.sql.functions as f
df.alias('l').join(df.alias('r'), on='org')\
.where('l.name != r.name')\
.select('l.org', 'l.name', 'r.name', 'r.value')\
.distinct()\
.groupBy('l.org', 'l.name')\
.agg(f.collect_list('r.value').alias('other_values'))\
.show()
#+-----+----+------------+
#| org|name|other_values|
#+-----+----+------------+
#|org_1| a| [4, 5]|
#|org_1| b|[1, 2, 3, 5]|
#|org_1| c|[1, 2, 3, 4]|
#|org_2| a| [4, 5, 10]|
#|org_2| d| [7, 5, 10]|
#|org_2| e| [7, 4]|
#+-----+----+------------+
For the descriptive stats, you can use the mean, stddev, and count functions from pyspark.sql.functions:
df.alias('l').join(df.alias('r'), on='org')\
.where('l.name != r.name')\
.select('l.org', 'l.name', 'r.name', 'r.value')\
.distinct()\
.groupBy('l.org', 'l.name')\
.agg(
f.mean('r.value').alias('mean'),
f.stddev('r.value').alias('stddev'),
f.count('r.value').alias('count')
)\
.show()
#+-----+----+-----------------+------------------+-----+
#| org|name| mean| stddev|count|
#+-----+----+-----------------+------------------+-----+
#|org_1| a| 4.5|0.7071067811865476| 2|
#|org_1| b| 2.75| 1.707825127659933| 4|
#|org_1| c| 2.5|1.2909944487358056| 4|
#|org_2| a|6.333333333333333|3.2145502536643185| 3|
#|org_2| d|7.333333333333333|2.5166114784235836| 3|
#|org_2| e| 5.5|2.1213203435596424| 2|
#+-----+----+-----------------+------------------+-----+
Note that pyspark.sql.functions.stddev() returns the unbiased sample standard deviation. If you wanted the population standard deviation, use pyspark.sql.functions.stddev_pop():
# Deduplicate (l.org, l.name, r.name, r.value) before aggregating: each
# repeated left-side row for the same (org, name) pair would otherwise
# multiply the matching right-side rows and inflate the count.
df.alias('l').join(df.alias('r'), on='org')\
    .where('l.name != r.name')\
    .select('l.org', 'l.name', 'r.name', 'r.value')\
    .distinct()\
    .groupBy('l.org', 'l.name')\
    .agg(
        f.mean('r.value').alias('mean'),
        f.stddev_pop('r.value').alias('stddev'),
        f.count('r.value').alias('count')
    )\
    .show()
#+-----+----+-----------------+------------------+-----+
#| org|name| mean| stddev|count|
#+-----+----+-----------------+------------------+-----+
#|org_1| a| 4.5| 0.5| 2|
#|org_1| b| 2.75| 1.479019945774904| 4|
#|org_1| c| 2.5| 1.118033988749895| 4|
#|org_2| a|6.333333333333333|2.6246692913372702| 3|
#|org_2| d|7.333333333333333|2.0548046676563256| 3|
#|org_2| e| 5.5| 1.5| 2|
#+-----+----+-----------------+------------------+-----+
EDIT
As @NaomiHuang mentioned in the comments, you could also reduce l to the distinct org/name pairs before doing the join:
df.select('org', 'name')\
.distinct()\
.alias('l')\
.join(df.alias('r'), on='org')\
.where('l.name != r.name')\
.groupBy('l.org', 'l.name')\
.agg(f.collect_list('r.value').alias('other_values'))\
.show()
#+-----+----+------------+
#| org|name|other_values|
#+-----+----+------------+
#|org_1| a| [5, 4]|
#|org_1| b|[5, 1, 2, 3]|
#|org_1| c|[1, 2, 3, 4]|
#|org_2| a| [4, 5, 10]|
#|org_2| d| [7, 5, 10]|
#|org_2| e| [7, 4]|
#+-----+----+------------+

Resources