Sum of array elements depending on value condition pyspark - apache-spark

I have a pyspark dataframe:
id | column
---|------------------------
1  | [0.2, 2, 3, 4, 3, 0.5]
2  | [7, 0.3, 0.3, 8, 2]
I would like to create 3 columns:
Column 1: contains the sum of the elements < 2
Column 2: contains the sum of the elements > 2
Column 3: contains the sum of the elements = 2 (sometimes I have duplicate values, so I sum them). If there are no such values, I put null.
Expected result:
id | column                  | column<2 | column>2 | column=2
---|-------------------------|----------|----------|---------
1  | [0.2, 2, 3, 4, 3, 0.5]  | [0.7]    | [12]     | null
2  | [7, 0.3, 0.3, 8, 2]     | [0.6]    | [15]     | [2]
Can you help me, please?
Thank you

For Spark 2.4+, you can use aggregate and filter higher-order functions like this:
df.withColumn("column<2", expr("aggregate(filter(column, x -> x < 2), 0D, (x, acc) -> acc + x)")) \
.withColumn("column>2", expr("aggregate(filter(column, x -> x > 2), 0D, (x, acc) -> acc + x)")) \
.withColumn("column=2", expr("aggregate(filter(column, x -> x == 2), 0D, (x, acc) -> acc + x)")) \
.show(truncate=False)
Gives:
+---+------------------------------+--------+--------+--------+
|id |column |column<2|column>2|column=2|
+---+------------------------------+--------+--------+--------+
|1 |[0.2, 2.0, 3.0, 4.0, 3.0, 0.5]|0.7 |10.0 |2.0 |
|2 |[7.0, 0.3, 0.3, 8.0, 2.0] |0.6 |15.0 |2.0 |
+---+------------------------------+--------+--------+--------+
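Note that with 0D as the zero value you get 0.0 rather than the null the question asks for when nothing matches (see column=2 for id 1 in the expected output). A minimal sketch of one way to get null instead, assuming the same df (the cond_sum helper name is just for illustration):
from pyspark.sql.functions import expr

def cond_sum(cond):
    # cond is a SQL predicate over x, e.g. "x < 2"; yields null when no element matches
    return expr(f"""
        CASE WHEN size(filter(column, x -> {cond})) = 0 THEN NULL
             ELSE aggregate(filter(column, x -> {cond}), 0D, (acc, x) -> acc + x)
        END""")

df.withColumn("column<2", cond_sum("x < 2")) \
  .withColumn("column>2", cond_sum("x > 2")) \
  .withColumn("column=2", cond_sum("x == 2")) \
  .show(truncate=False)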

Here's a way you can try:
import pyspark.sql.functions as F
from pyspark.sql import Row

# using map, filter the list and sum based on condition
s = (df
     .select('column')
     .rdd
     .map(lambda x: [[i for i in x.column if i < 2],
                     [i for i in x.column if i > 2],
                     [i for i in x.column if i == 2]])
     .map(lambda x: [Row(round(sum(i), 2)) for i in x])
     .toDF(['col<2', 'col>2', 'col=2']))

# create a dummy id so we can join both data frames
# (this lines up only because both data frames keep the same row order)
df = df.withColumn('mid', F.monotonically_increasing_id())
s = s.withColumn('mid', F.monotonically_increasing_id())

# simple join on the dummy id
df = df.join(s, on='mid').drop('mid')
df.show()
+---+--------------------+-----+------+-----+
| id| column|col<2| col>2|col=2|
+---+--------------------+-----+------+-----+
| 0|[0.2, 2.0, 3.0, 4...|[0.7]|[10.0]|[2.0]|
| 1|[7.0, 0.3, 0.3, 8...|[0.6]|[15.0]|[2.0]|
+---+--------------------+-----+------+-----+

For Spark 2.4+, you can use the aggregate function and do the calculation in one step:
from pyspark.sql.functions import expr

# I adjusted the 2nd array item in id=1 from 2.0 to 2.1 so there is no `2.0` when id=1
df = spark.createDataFrame([(1, [0.2, 2.1, 3., 4., 3., 0.5]), (2, [7., 0.3, 0.3, 8., 2.])], ['id', 'column'])

df.withColumn('data', expr("""
    aggregate(
      /* ArrayType argument */
      column,
      /* zero: set empty array to initialize acc */
      array(),
      /* merge: iterate through `column` and reduce based on the values of y and the array indices of acc */
      (acc, y) ->
        CASE
          WHEN y < 2.0 THEN array(IFNULL(acc[0],0) + y, acc[1], acc[2])
          WHEN y > 2.0 THEN array(acc[0], IFNULL(acc[1],0) + y, acc[2])
          ELSE array(acc[0], acc[1], IFNULL(acc[2],0) + y)
        END,
      /* finish: convert the array into a named_struct */
      acc -> (acc[0] as `column<2`, acc[1] as `column>2`, acc[2] as `column=2`)
    )
""")).selectExpr('id', 'data.*').show()
#+---+--------+--------+--------+
#| id|column<2|column>2|column=2|
#+---+--------+--------+--------+
#| 1| 0.7| 12.1| null|
#| 2| 0.6| 15.0| 2.0|
#+---+--------+--------+--------+
Before Spark 2.4, functional support for ArrayType is limited; you can do it with explode and then groupby + pivot:
from pyspark.sql.functions import sum as fsum, expr

df.selectExpr('id', 'explode_outer(column) as item') \
  .withColumn('g', expr('if(item < 2, "column<2", if(item > 2, "column>2", "column=2"))')) \
  .groupby('id') \
  .pivot('g', ["column<2", "column>2", "column=2"]) \
  .agg(fsum('item')) \
  .show()
#+---+--------+--------+--------+
#| id|column<2|column>2|column=2|
#+---+--------+--------+--------+
#| 1| 0.7| 12.1| null|
#| 2| 0.6| 15.0| 2.0|
#+---+--------+--------+--------+
In case explode is slow (e.g. SPARK-21657, an issue present before Spark 2.3), use a UDF:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField("column>2", DoubleType()),
    StructField("column<2", DoubleType()),
    StructField("column=2", DoubleType())
])

def split_data(arr):
    d = {}
    if arr is None: arr = []
    for y in arr:
        if y > 2:
            d['column>2'] = d.get('column>2', 0) + y
        elif y < 2:
            d['column<2'] = d.get('column<2', 0) + y
        else:
            d['column=2'] = d.get('column=2', 0) + y
    return d

udf_split_data = udf(split_data, schema)

df.withColumn('data', udf_split_data('column')).selectExpr('id', 'data.*').show()

Related

processing of a complex object (array) in pyspark

I am trying to figure out possible ways to process complex objects in pyspark. In the example below one of the columns of the dataframe is an array of integers. The processing is simply adding one to each value. Are these acceptable methods, or is there a better practice?
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, LongType

spark = SparkSession.builder.enableHiveSupport().appName('learn').getOrCreate()

data = [('a', 1, [1, 3, 5]),
        ('b', 2, [4, 6, 9]),
        ('c', 3, [50, 60, 70, 80])]
df = spark.createDataFrame(data, ['nam', 'q', 'compl'])

# process complex object, method 1 using explode and collect_list (dataframe API)
res = df.withColumn('id', f.monotonically_increasing_id()).withColumn('compl_exploded', f.explode(f.col('compl')))
res = res.withColumn('compl_exploded', f.col('compl_exploded') + 1)
res = res.groupby('id').agg(f.first('nam'), f.first('q'), f.collect_list('compl_exploded').alias('compl')).drop('id')
res.show()

# process complex object, method 2 using explode and collect_list (SQL)
df.withColumn('id', f.monotonically_increasing_id()).createOrReplaceTempView('tmp_view')
res = spark.sql("""
    SELECT first(nam) AS nam, first(q) AS q, collect_list(compl_exploded+1) AS compl FROM (
        SELECT *, explode(compl) AS compl_exploded FROM tmp_view
    ) x
    GROUP BY id
""")
res.show()

# process complex object, method 3 using python UDF
from typing import List

def process(x: List[int]) -> List[int]:
    return [_ + 1 for _ in x]

process_udf = f.udf(process, ArrayType(LongType()))
res = df.withColumn('compl', process_udf('compl'))
res.show()
For such operations you can take advantage of built-in functions.
For example, in your use case you can use transform like below:
pyspark <= 3.0
# Option 1
import pyspark.sql.functions as f
df.withColumn('add_one',f.expr('transform(compl, x -> x+1)')).show()
+---+---+----------------+----------------+
|nam| q| compl| add_one|
+---+---+----------------+----------------+
| a| 1| [1, 3, 5]| [2, 4, 6]|
| b| 2| [4, 6, 9]| [5, 7, 10]|
| c| 3|[50, 60, 70, 80]|[51, 61, 71, 81]|
+---+---+----------------+----------------+
# OR the options below; all give the same output
# Option 2
df.select('nam', 'q', 'compl' , f.expr('transform(compl, x -> x+1) as add_one')).show()
# Option 3
df.createOrReplaceTempView('tmp_view')
spark.sql( 'select nam, q, compl , transform(compl, x -> x+1) as add_one from tmp_view').show()
pyspark >= 3.1.0
If you are using a newer version of Spark, this function is available natively in the DataFrame API and you can use it without expr.
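For example, a minimal sketch with the native functions.transform (available since PySpark 3.1), assuming the same df as above:
import pyspark.sql.functions as F

# no SQL string needed: pass a Python lambda over Column expressions
df.withColumn('add_one', F.transform('compl', lambda x: x + 1)).show()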

Append a value after every element in PySpark list Dataframe

I am having a dataframe like this:
Data       ID
[1,2,3,4]  22
I want to create a new column where every entry is each value from the Data field appended with the ID and joined by the ~ symbol, like below:
Data       ID  New_Column
[1,2,3,4]  22  [1|22~2|22~3|22~4|22]
Note: in the Data field the array size is not fixed. It may have no entries, or N entries.
Can anyone please help me solve this?
package spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object DF extends App {

  val spark = SparkSession.builder()
    .master("local")
    .appName("DataFrame-example")
    .getOrCreate()

  import spark.implicits._

  val df = Seq(
    (22, Seq(1,2,3,4)),
    (23, Seq(1,2,3,4,5,6,7,8)),
    (24, Seq())
  ).toDF("ID", "Data")

  val arrUDF = udf((id: Long, array: Seq[Long]) => {
    val r = array.size match {
      case 0 => ""
      case _ => array.map(x => s"$x|$id").mkString("~")
    }
    s"[$r]"
  })

  val resDF = df.withColumn("New_Column", lit(arrUDF('ID, 'Data)))

  resDF.show(false)
  //+---+------------------------+-----------------------------------------+
  //|ID |Data                    |New_Column                               |
  //+---+------------------------+-----------------------------------------+
  //|22 |[1, 2, 3, 4]            |[1|22~2|22~3|22~4|22]                    |
  //|23 |[1, 2, 3, 4, 5, 6, 7, 8]|[1|23~2|23~3|23~4|23~5|23~6|23~7|23~8|23]|
  //|24 |[]                      |[]                                       |
  //+---+------------------------+-----------------------------------------+

}
Spark 2.4+
The PySpark equivalent goes like this:
import pyspark.sql.functions as f

df = spark.createDataFrame([(22, [1,2,3,4]), (23, [1,2,3,4,5,6,7,8]), (24, [])], ['Id','Data'])
df.show()
+---+--------------------+
| Id| Data|
+---+--------------------+
| 22| [1, 2, 3, 4]|
| 23|[1, 2, 3, 4, 5, 6...|
| 24| []|
+---+--------------------+
df.withColumn('ff', f.when(f.size('Data')==0,'').otherwise(f.expr('''concat_ws('~',transform(Data, x->concat(x,'|',Id)))'''))).show(20,False)
+---+------------------------+---------------------------------------+
|Id |Data |ff |
+---+------------------------+---------------------------------------+
|22 |[1, 2, 3, 4] |1|22~2|22~3|22~4|22 |
|23 |[1, 2, 3, 4, 5, 6, 7, 8]|1|23~2|23~3|23~4|23~5|23~6|23~7|23~8|23|
|24 |[] | |
+---+------------------------+---------------------------------------+
If you want the final output as an array:
df.withColumn('ff',f.array(f.when(f.size('Data')==0,'').otherwise(f.expr('''concat_ws('~',transform(Data, x->concat(x,'|',Id)))''')))).show(20,False)
+---+------------------------+-----------------------------------------+
|Id |Data |ff |
+---+------------------------+-----------------------------------------+
|22 |[1, 2, 3, 4] |[1|22~2|22~3|22~4|22] |
|23 |[1, 2, 3, 4, 5, 6, 7, 8]|[1|23~2|23~3|23~4|23~5|23~6|23~7|23~8|23]|
|24 |[] |[] |
+---+------------------------+-----------------------------------------+
Hope this helps
A udf can help:
from pyspark.sql.types import StringType
from pyspark.sql import functions as F

def func(array, suffix):
    return '~'.join([str(x) + '|' + str(suffix) for x in array])

my_udf = F.udf(func, StringType())

df.withColumn("New_Column", my_udf("Data", "ID")).show()
prints
+------------+---+-------------------+
|        Data| ID|         New_Column|
+------------+---+-------------------+
|[1, 2, 3, 4]| 22|1|22~2|22~3|22~4|22|
+------------+---+-------------------+
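If you want the bracketed, single-element array shape shown in the question rather than a plain string, one option (a sketch, assuming the same df and my_udf as above) is to wrap the result with F.array:
# wrap the UDF's string result in a one-element array to match the question's shape
df.withColumn("New_Column", F.array(my_udf("Data", "ID"))).show(truncate=False)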

How to split an array into chunks and find the sum of the chunks and store the output as an array in pyspark

I have a dataframe as shown below:
+-----+------------------------+
|Index| finalArray |
+-----+------------------------+
|1 |[0, 2, 0, 3, 1, 4, 2, 7]|
|2 |[0, 4, 4, 3, 4, 2, 2, 5]|
+-----+------------------------+
I want to break the array into chunks of 2, then find the sum of each chunk and store the resulting array in the column finalArray. It will look like below:
+-----+---------------------+
|Index| finalArray |
+-----+---------------------+
|1 |[2, 3, 5, 9] |
|2 |[4, 7, 6, 7] |
+-----+---------------------+
I am able to do it by creating a UDF, but I am looking for a better and optimised way. Preferably I would like to handle it using withColumn and passing flagArray, without having to write a UDF.
@udf(ArrayType(DoubleType()))
def aggregate(finalArray, chunkSize):
    n = int(chunkSize)
    aggsum = []
    final = [finalArray[i * n:(i + 1) * n] for i in range((len(finalArray) + n - 1) // n)]
    for item in final:
        agg = 0
        for j in item:
            agg += j
        aggsum.append(agg)
    return aggsum
I am not able to use the below expression in the UDF, hence I used loops:
[sum(finalArray[x:x+2]) for x in range(0, len(finalArray), chunkSize)]
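(As an aside, that comprehension can run inside a Python UDF if the chunk size is passed in as a literal column; a minimal sketch, assuming integer arrays as in the sample data, with the chunk_sum name purely illustrative:)
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import ArrayType, LongType

@udf(ArrayType(LongType()))
def chunk_sum(arr, chunk_size):
    n = int(chunk_size)
    # same comprehension as above, wrapped in a UDF
    return [sum(arr[i:i + n]) for i in range(0, len(arr), n)]

# usage: df.withColumn("finalArray", chunk_sum("finalArray", lit(2)))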
For Spark 2.4+, you can try sequence + transform:
from pyspark.sql.functions import expr

df = spark.createDataFrame([
    (1, [0, 2, 0, 3, 1, 4, 2, 7]),
    (2, [0, 4, 4, 3, 4, 2, 2, 5])
], ["Index", "finalArray"])

df.withColumn("finalArray", expr("""
    transform(
      sequence(0, ceil(size(finalArray)/2)-1),
      i -> finalArray[2*i] + ifnull(finalArray[2*i+1], 0))
""")).show(truncate=False)
+-----+------------+
|Index|finalArray |
+-----+------------+
|1 |[2, 3, 5, 9]|
|2 |[4, 7, 6, 7]|
+-----+------------+
For a chunk size of any N, use the aggregate function to compute the sub-totals:
N = 3

sql_expr = """
    transform(
      /* create a sequence from 0 to number_of_chunks-1 */
      sequence(0, ceil(size(finalArray)/{0})-1),
      /* iterate through the above sequence */
      i ->
        /* create a sequence from 0 to chunk_size-1 and
           calculate the sum of the chunk_size items at those indices */
        aggregate(
          sequence(0, {0}-1),
          0L,
          (acc, y) -> acc + ifnull(finalArray[i*{0}+y], 0)
        )
    )
"""

df.withColumn("finalArray", expr(sql_expr.format(N))).show()
+-----+----------+
|Index|finalArray|
+-----+----------+
| 1| [2, 8, 9]|
| 2| [8, 9, 7]|
+-----+----------+
Here is a slightly different version of @jxc's solution, using the slice function together with the transform and aggregate functions.
The logic is: for each element of the array, we check if its index is a multiple of the chunk size and use slice to get a sub-array of chunk size. With aggregate we sum the elements of each sub-array. Finally, we use filter to remove the nulls (corresponding to indexes that do not satisfy i % chunk = 0).
chunk = 2

transform_expr = f"""
    filter(transform(finalArray,
                     (x, i) -> IF(i % {chunk} = 0,
                                  aggregate(slice(finalArray, i+1, {chunk}), 0L, (acc, y) -> acc + y),
                                  null)),
           x -> x is not null)
"""
df.withColumn("finalArray", expr(transform_expr)).show()
#+-----+------------+
#|Index| finalArray|
#+-----+------------+
#| 1|[2, 3, 5, 9]|
#| 2|[4, 7, 6, 7]|
#+-----+------------+

Pyspark UDF to return result similar to groupby().sum() between two columns

I have the following sample dataframe
fruit_list = ['apple', 'apple', 'orange', 'apple']
qty_list = [16, 2, 3, 1]
spark_df = spark.createDataFrame([(101, 'Mark', fruit_list, qty_list)], ['ID', 'name', 'fruit', 'qty'])
and I would like to create another column which contains a result similar to what I would achieve with a pandas groupby('fruit').sum()
        qty
fruits
apple    19
orange    3
The above result could be stored in the new column in any form (either a string, dictionary, list of tuples...).
I've tried an approach similar to the following one which does not work
sum_cols = udf(lambda x: pd.DataFrame({'fruits': x[0], 'qty': x[1]}).groupby('fruits').sum())
spark_df.withColumn('Result', sum_cols(F.struct('fruit', 'qty'))).show()
One example of result dataframe could be
+---+----+--------------------+-------------+-------------------------+
| ID|name| fruit| qty| Result|
+---+----+--------------------+-------------+-------------------------+
|101|Mark|[apple, apple, or...|[16, 2, 3, 1]|[(apple,19), (orange,3)] |
+---+----+--------------------+-------------+-------------------------+
Do you have any suggestion on how I could achieve that?
Thanks
Edit: running on Spark 2.4.3
As @pault mentioned, as of Spark 2.4+ you can use Spark SQL built-in functions to handle your task. Here is one way with array_distinct + transform + aggregate:
from pyspark.sql.functions import expr
# set up data
spark_df = spark.createDataFrame([
      (101, 'Mark', ['apple', 'apple', 'orange', 'apple'], [16, 2, 3, 1])
    , (102, 'Twin', ['apple', 'banana', 'avocado', 'banana', 'avocado'], [5, 2, 11, 3, 1])
    , (103, 'Smith', ['avocado'], [10])
], ['ID', 'name', 'fruit', 'qty'])
>>> spark_df.show(5,0)
+---+-----+-----------------------------------------+----------------+
|ID |name |fruit |qty |
+---+-----+-----------------------------------------+----------------+
|101|Mark |[apple, apple, orange, apple] |[16, 2, 3, 1] |
|102|Twin |[apple, banana, avocado, banana, avocado]|[5, 2, 11, 3, 1]|
|103|Smith|[avocado] |[10] |
+---+-----+-----------------------------------------+----------------+
>>> spark_df.printSchema()
root
 |-- ID: long (nullable = true)
 |-- name: string (nullable = true)
 |-- fruit: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- qty: array (nullable = true)
 |    |-- element: long (containsNull = true)
Set up the SQL statement:
stmt = '''
    transform(array_distinct(fruit), x -> (x, aggregate(
        transform(sequence(0, size(fruit)-1), i -> IF(fruit[i] = x, qty[i], 0))
      , 0
      , (y,z) -> int(y + z)
    ))) AS sum_fruit
'''
>>> spark_df.withColumn('sum_fruit', expr(stmt)).show(10,0)
+---+-----+-----------------------------------------+----------------+----------------------------------------+
|ID |name |fruit |qty |sum_fruit |
+---+-----+-----------------------------------------+----------------+----------------------------------------+
|101|Mark |[apple, apple, orange, apple] |[16, 2, 3, 1] |[[apple, 19], [orange, 3]] |
|102|Twin |[apple, banana, avocado, banana, avocado]|[5, 2, 11, 3, 1]|[[apple, 5], [banana, 5], [avocado, 12]]|
|103|Smith|[avocado] |[10] |[[avocado, 10]] |
+---+-----+-----------------------------------------+----------------+----------------------------------------+
Explanation:
Use array_distinct(fruit) to find all distinct entries in the array fruit
transform this new array (with element x) from x to (x, aggregate(..x..))
the above function aggregate(..x..) takes the simple form of summing up all elements in array_T
aggregate(array_T, 0, (y,z) -> y + z)
where the array_T is from the following transformation:
transform(sequence(0,size(fruit)-1), i -> IF(fruit[i] = x, qty[i], 0))
which iterates through the array fruit: if the value of fruit[i] = x, return the corresponding qty[i], otherwise return 0. For example, for ID=101, when x = 'orange', it returns the array [0, 0, 3, 0].
There may be a fancy way to do this using only the API functions on Spark 2.4+, perhaps with some combination of arrays_zip and aggregate, but I can't think of any that don't involve an explode step followed by a groupBy. With that in mind, using a udf may actually be better for you in this case.
I think creating a pandas DataFrame just for the purpose of calling .groupby().sum() is overkill. Furthermore, even if you did do it that way, you'd need to convert the final output to a different data structure because a udf can't return a pandas DataFrame.
Here's one way with a udf using collections.defaultdict:
from collections import defaultdict
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType

def sum_cols_func(frt, qty):
    d = defaultdict(int)
    for x, y in zip(frt, map(int, qty)):
        d[x] += y
    # materialise to a list so PySpark can convert it to the ArrayType return type
    return list(d.items())

sum_cols = udf(
    lambda x: sum_cols_func(*x),
    ArrayType(
        StructType([StructField("fruit", StringType()), StructField("qty", IntegerType())])
    )
)
Then call this by passing in the fruit and qty columns:
from pyspark.sql.functions import array, col

spark_df.withColumn(
    "Result",
    sum_cols(array([col("fruit"), col("qty")]))
).show(truncate=False)
#+---+----+-----------------------------+-------------+--------------------------+
#|ID |name|fruit |qty |Result |
#+---+----+-----------------------------+-------------+--------------------------+
#|101|Mark|[apple, apple, orange, apple]|[16, 2, 3, 1]|[[orange, 3], [apple, 19]]|
#+---+----+-----------------------------+-------------+--------------------------+
If you have Spark < 2.4, use the following to explode (otherwise check this answer):
df_split = (spark_df.rdd
            .flatMap(lambda row: [(row.ID, row.name, f, q) for f, q in zip(row.fruit, row.qty)])
            .toDF(["ID", "name", "fruit", "qty"]))
df_split.show()
Output:
+---+----+------+---+
| ID|name| fruit|qty|
+---+----+------+---+
|101|Mark| apple| 16|
|101|Mark| apple| 2|
|101|Mark|orange| 3|
|101|Mark| apple| 1|
+---+----+------+---+
Then prepare the result you want. First find the aggregated dataframe:
from pyspark.sql import functions as F

df_aggregated = df_split.groupby('ID', 'fruit').agg(F.sum('qty').alias('qty'))
df_aggregated.show()
Output:
+---+------+---+
| ID| fruit|qty|
+---+------+---+
|101|orange| 3|
|101| apple| 19|
+---+------+---+
And finally change it to the desired format:
df_aggregated.groupby('ID').agg(F.collect_list(F.struct(F.col('fruit'), F.col('qty'))).alias('Result')).show()
Output:
+---+--------------------------+
|ID |Result |
+---+--------------------------+
|101|[[orange, 3], [apple, 19]]|
+---+--------------------------+

Counter function on a ArrayColumn Pyspark

From this data frame
+-----+------------------+
|store|            values|
+-----+------------------+
|    1|[1, 2, 3, 4, 5, 6]|
|    2|            [2, 3]|
+-----+------------------+
I would like to apply the Counter function to get this:
+-----+------------------------------+
|store| values |
+-----+------------------------------+
| 1|{1:1, 2:1, 3:1, 4:1, 5:1, 6:1}|
| 2|{2:1, 3:1} |
+-----+------------------------------+
I got this data frame using the answer to another question:
GroupBy and concat array columns pyspark
So I tried to modify the code that is in the answers like this:
Option 1:
def flatten_counter(val):
    return Counter(reduce(lambda x, y: x + y, val))

udf_flatten_counter = sf.udf(flatten_counter, ty.ArrayType(ty.IntegerType()))
df3 = df2.select("store", flatten_counter("values2").alias("values3"))
df3.show(truncate=False)
Option 2:
df.rdd.map(lambda r: (r.store, r.values)).reduceByKey(lambda x, y: x + y).map(lambda row: Counter(row[1])).toDF(['store', 'values']).show()
but it doesn't work.
Does anybody know how I can do it?
Thank you
You just have to provide the correct data type:
from collections import Counter
import pyspark.sql.functions as sf
import pyspark.sql.types as ty

udf_flatten_counter = sf.udf(
    lambda x: dict(Counter(x)),
    ty.MapType(ty.IntegerType(), ty.IntegerType()))
df = spark.createDataFrame(
    [(1, [1, 2, 3, 4, 5, 6]), (2, [2, 3])], ("store", "values"))
df.withColumn("cnt", udf_flatten_counter("values")).show(2, False)
# +-----+------------------+---------------------------------------------------+
# |store|values |cnt |
# +-----+------------------+---------------------------------------------------+
# |1 |[1, 2, 3, 4, 5, 6]|Map(5 -> 1, 1 -> 1, 6 -> 1, 2 -> 1, 3 -> 1, 4 -> 1)|
# |2 |[2, 3] |Map(2 -> 1, 3 -> 1) |
# +-----+------------------+---------------------------------------------------+
Similarly with RDD
df.rdd.mapValues(Counter).mapValues(dict).toDF(["store", "values"]).show(2, False)
# +-----+---------------------------------------------------+
# |store|values |
# +-----+---------------------------------------------------+
# |1 |Map(5 -> 1, 1 -> 1, 6 -> 1, 2 -> 1, 3 -> 1, 4 -> 1)|
# |2 |Map(2 -> 1, 3 -> 1) |
# +-----+---------------------------------------------------+
Conversion to dict is necessary because apparently Pyrolite cannot handle Counter objects.
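As an aside, on Spark 2.4+ the same count map can also be built without a Python UDF; a hedged sketch using only built-in higher-order functions, assuming the same df as above:
from pyspark.sql.functions import expr

# count occurrences of each distinct value with built-in functions instead of a UDF
df.withColumn("cnt", expr("""
    map_from_entries(
      transform(array_distinct(`values`),
                x -> struct(x, size(filter(`values`, y -> y = x)))))
""")).show(2, False)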
