From this data frame:
+-----+-----------------+
|store| values |
+-----+-----------------+
| 1|[1, 2, 3,4, 5, 6]|
| 2| [2,3]|
+-----+-----------------+
I would like to apply the Counter function to get this:
+-----+------------------------------+
|store| values |
+-----+------------------------------+
| 1|{1:1, 2:1, 3:1, 4:1, 5:1, 6:1}|
| 2|{2:1, 3:1} |
+-----+------------------------------+
I got this data frame using the answer to another question:
GroupBy and concat array columns pyspark
So I tried to modify the code from the answers like this:
Option 1:
def flatten_counter(val):
    return Counter(reduce(lambda x, y: x + y, val))
udf_flatten_counter = sf.udf(flatten_counter, ty.ArrayType(ty.IntegerType()))
df3 = df2.select("store", flatten_counter("values2").alias("values3"))
df3.show(truncate=False)
Option 2:
df.rdd.map(lambda r: (r.store, r.values)).reduceByKey(lambda x, y: x + y).map(lambda row: Counter(row[1])).toDF(['store', 'values']).show()
but it doesn't work.
Does anybody know how I can do it?
Thank you
You just have to provide the correct data type:
from collections import Counter
import pyspark.sql.functions as sf
import pyspark.sql.types as ty

udf_flatten_counter = sf.udf(
    lambda x: dict(Counter(x)),
    ty.MapType(ty.IntegerType(), ty.IntegerType()))
df = spark.createDataFrame(
    [(1, [1, 2, 3, 4, 5, 6]), (2, [2, 3])], ("store", "values"))
df.withColumn("cnt", udf_flatten_counter("values")).show(2, False)
# +-----+------------------+---------------------------------------------------+
# |store|values |cnt |
# +-----+------------------+---------------------------------------------------+
# |1 |[1, 2, 3, 4, 5, 6]|Map(5 -> 1, 1 -> 1, 6 -> 1, 2 -> 1, 3 -> 1, 4 -> 1)|
# |2 |[2, 3] |Map(2 -> 1, 3 -> 1) |
# +-----+------------------+---------------------------------------------------+
Similarly with an RDD:
df.rdd.mapValues(Counter).mapValues(dict).toDF(["store", "values"]).show(2, False)
# +-----+---------------------------------------------------+
# |store|values |
# +-----+---------------------------------------------------+
# |1 |Map(5 -> 1, 1 -> 1, 6 -> 1, 2 -> 1, 3 -> 1, 4 -> 1)|
# |2 |Map(2 -> 1, 3 -> 1) |
# +-----+---------------------------------------------------+
Conversion to dict is necessary because apparently Pyrolite cannot handle Counter objects.
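For completeness, the same counts can also be produced without a UDF, using only the Spark SQL higher-order functions available since Spark 2.4; a minimal sketch, assuming the array column is named values as above:
df.withColumn("cnt", sf.expr("""
    map_from_arrays(
        array_distinct(values),
        transform(array_distinct(values),
                  v -> size(filter(values, x -> x = v)))
    )
""")).show(truncate=False)
Each distinct element becomes a key and its count is the size of the filtered array, so no Counter (and no Pyrolite round trip) is involved.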
Suppose you create a Spark DataFrame with a precise schema:
import pyspark.sql.functions as sf
from pyspark.sql.types import *
dfschema = StructType([
    StructField("_1", ArrayType(IntegerType())),
    StructField("_2", ArrayType(IntegerType())),
])
df = spark.createDataFrame([[[1, 2, 5], [13, 74, 1]],
                            [[1, 2, 3], [77, 23, 15]]
                            ], schema=dfschema)
df = df.select(sf.map_from_arrays("_1", "_2").alias("omap"))
df = df.withColumn("id", sf.lit(1))
The above DataFrame looks like this:
+---------------------------+---+
|omap |id |
+---------------------------+---+
|{1 -> 13, 2 -> 74, 5 -> 1} |1 |
|{1 -> 77, 2 -> 23, 3 -> 15}|1 |
+---------------------------+---+
I would like to perform the following operation:
df.groupby("id").agg(sum_counter("omap")).show(truncate=False)
Could you please help me define a sum_counter function, using only SQL functions from pyspark.sql.functions (so no UDFs), that lets me obtain the following DataFrame as output:
+---+-----------------------------------+
|id |mapsum |
+---+-----------------------------------+
|1 |{1 -> 90, 2 -> 97, 5 -> 1, 3 -> 15}|
+---+-----------------------------------+
I could solve this using applyInPandas:
from pyspark.sql.types import *
from collections import Counter
import pandas as pd
reschema = StructType([
    StructField("id", LongType()),
    StructField("mapsum", MapType(IntegerType(), IntegerType()))
])

def sum_counter(key: tuple, pdf: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame([
        key + (sum([Counter(x) for x in pdf["omap"]], Counter()),)
    ])
df.groupby("id").applyInPandas(sum_counter, reschema).show(truncate=False)
+---+-----------------------------------+
|id |mapsum |
+---+-----------------------------------+
|1 |{1 -> 90, 2 -> 97, 5 -> 1, 3 -> 15}|
+---+-----------------------------------+
However, for performance reasons, I would like to avoid using applyInPandas or UDFs. Any ideas?
You can first explode the omap into individual rows, with the key and value in separate columns, and then aggregate them like so:
exploded_df = df.select("*", sf.explode("omap"))
agg_df = exploded_df.groupBy("id", "key").sum("value")
agg_df.groupBy("id").agg(
    sf.map_from_entries(sf.collect_list(sf.struct("key", "sum(value)"))).alias("mapsum")
).show(truncate=False)
+---+-----------------------------------+
|id |mapsum |
+---+-----------------------------------+
|1 |{2 -> 97, 1 -> 90, 5 -> 1, 3 -> 15}|
+---+-----------------------------------+
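For reference, the same pipeline can be written as one chained expression; a sketch, assuming import pyspark.sql.functions as sf as in the question (mapsum_df is just an illustrative name):
mapsum_df = (
    df.select("id", sf.explode("omap"))                         # one row per (key, value) pair
      .groupBy("id", "key").agg(sf.sum("value").alias("value")) # per-key totals
      .groupBy("id")
      .agg(sf.map_from_entries(sf.collect_list(sf.struct("key", "value"))).alias("mapsum"))
)
mapsum_df.show(truncate=False)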
In the end I solved it like this:
import pyspark.sql.functions as sf
def sum_counter(mapcoln: str):
    dkeys = sf.array_distinct(sf.flatten(sf.collect_list(sf.map_keys(mapcoln))))
    dkeyscount = sf.transform(
        dkeys,
        lambda ukey: sf.aggregate(
            sf.collect_list(mapcoln),
            sf.lit(0),
            lambda acc, mapentry: sf.when(
                ~sf.isnull(sf.element_at(mapentry, ukey)),
                acc + sf.element_at(mapentry, ukey),
            ).otherwise(acc),
        ),
    )
    return sf.map_from_arrays(dkeys, dkeyscount).alias("mapsum")
df.groupby("id").agg(sum_counter("omap")).show(truncate=False)
+---+-----------------------------------+
|id |mapsum |
+---+-----------------------------------+
|1 |{1 -> 90, 2 -> 97, 5 -> 1, 3 -> 15}|
+---+-----------------------------------+
I have a dataframe:
d1 = [({'the town': 1, 'County Council s': 2, 'email':5},2),
({'Mayor': 2, 'Indiana': 2}, 4),
({'Congress': 2, 'Justice': 2,'country': 2, 'veterans':1},6)
]
df1 = spark.createDataFrame(d1, ['dct', 'count'])
df1.show()
ignore_lst = ['County Council s', 'emal','Indiana']
filter_lst = ['Congress','town','Mayor', 'Indiana']
I want to write two functions:
The first function keeps the keys of the dct column that are not in ignore_lst, and the second keeps the keys that are in filter_lst.
The result should be two new columns containing dictionaries whose keys are filtered by ignore_lst and filter_lst respectively.
These two UDFs should be sufficient for your case:
from pyspark.sql.functions import udf, col
d1 = [({'the town': 1, 'County Council s': 2, 'email':5},2),
({'Mayor': 2, 'Indiana': 2}, 4),
({'Congress': 2, 'Justice': 2,'country': 2, 'veterans':1},6)
]
ignore_lst = ['County Council s', 'emal','Indiana']
filter_lst = ['Congress','town','Mayor', 'Indiana']
df1 = spark.createDataFrame(d1, ['dct', 'count'])
@udf
def apply_ignore_lst(dct):
    return {k: v for k, v in dct.items() if k not in ignore_lst}

@udf
def apply_filter_lst(dct):
    return {k: v for k, v in dct.items() if k in filter_lst}
df1.withColumn("apply_ignore_lst", apply_ignore_lst(col("dct"))).withColumn("apply_filter_lst", apply_filter_lst(col("apply_ignore_lst"))).show(truncate=False)
+----------------------------------------------------------+-----+----------------------------------------------+----------------+
|dct |count|apply_ignore_lst |apply_filter_lst|
+----------------------------------------------------------+-----+----------------------------------------------+----------------+
|{the town -> 1, County Council s -> 2, email -> 5} |2 |{the town=1, email=5} |{} |
|{Indiana -> 2, Mayor -> 2} |4 |{Mayor=2} |{Mayor=2} |
|{Justice -> 2, Congress -> 2, country -> 2, veterans -> 1}|6 |{Congress=2, Justice=2, country=2, veterans=1}|{Congress=2} |
+----------------------------------------------------------+-----+----------------------------------------------+----------------+
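Note that with the bare @udf decorator both new columns come back as strings (StringType is the default return type), which is fine for display but not for further map operations. If you need real map columns downstream, a sketch with an explicit return type (apply_ignore_lst_map is a hypothetical name; string keys and integer values are assumed):
from pyspark.sql.functions import udf, col
from pyspark.sql.types import MapType, StringType, IntegerType

@udf(MapType(StringType(), IntegerType()))
def apply_ignore_lst_map(dct):
    # keep only the entries whose key is not in ignore_lst
    return {k: v for k, v in dct.items() if k not in ignore_lst}

df1.withColumn("apply_ignore_lst", apply_ignore_lst_map(col("dct"))).printSchema()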
It can be done as a one-liner using map_filter:
df1 \
.withColumn("ignored", F.map_filter("dct", lambda k, _: ~k.isin(ignore_lst))) \
.withColumn("filtered", F.map_filter("dct", lambda k, _: k.isin(filter_lst)))
Full example:
from pyspark.sql import functions as F

d1 = [({'the town': 1, 'County Council s': 2, 'email': 5}, 2),
      ({'Mayor': 2, 'Indiana': 2}, 4),
      ({'Congress': 2, 'Justice': 2, 'country': 2, 'veterans': 1}, 6)
     ]
df1 = spark.createDataFrame(d1, ['dct', 'count'])
ignore_lst = ['County Council s', 'emal', 'Indiana']
filter_lst = ['Congress', 'town', 'Mayor', 'Indiana']
df1 = df1 \
.withColumn("ignored", F.map_filter("dct", lambda k, _: ~k.isin(ignore_lst))) \
.withColumn("filtered", F.map_filter("dct", lambda k, _: k.isin(filter_lst)))
[Out]:
+----------------------------------------------------------+--------------------------+
|ignored |filtered |
+----------------------------------------------------------+--------------------------+
|{the town -> 1, email -> 5} |{} |
|{Mayor -> 2} |{Indiana -> 2, Mayor -> 2}|
|{Justice -> 2, Congress -> 2, country -> 2, veterans -> 1}|{Congress -> 2} |
+----------------------------------------------------------+--------------------------+
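The Python lambda form of map_filter used above needs a fairly recent PySpark (3.1+, if I remember correctly). On older versions that still ship the SQL function (2.4+, I believe), roughly the same filters can be written as expr strings; a sketch, where ignore_sql and filter_sql are helper strings built here only for illustration:
from pyspark.sql import functions as F

ignore_sql = ", ".join("'{}'".format(k) for k in ignore_lst)
filter_sql = ", ".join("'{}'".format(k) for k in filter_lst)

df1 = df1 \
    .withColumn("ignored", F.expr(
        "map_filter(dct, (k, v) -> NOT array_contains(array({}), k))".format(ignore_sql))) \
    .withColumn("filtered", F.expr(
        "map_filter(dct, (k, v) -> array_contains(array({}), k))".format(filter_sql)))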
I have a dataframe like this:
Data ID
[1,2,3,4] 22
I want to create a new column in which each element of the Data field is appended with the ID using the | symbol, and the results are joined with the ~ symbol, like below:
Data ID New_Column
[1,2,3,4] 22 [1|22~2|22~3|22~4|22]
Note: the array size in the Data field is not fixed; it may be empty or contain any number of entries.
Can anyone please help me solve this?
package spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object DF extends App {

  val spark = SparkSession.builder()
    .master("local")
    .appName("DataFrame-example")
    .getOrCreate()

  import spark.implicits._

  val df = Seq(
    (22, Seq(1, 2, 3, 4)),
    (23, Seq(1, 2, 3, 4, 5, 6, 7, 8)),
    (24, Seq())
  ).toDF("ID", "Data")

  val arrUDF = udf((id: Long, array: Seq[Long]) => {
    val r = array.size match {
      case 0 => ""
      case _ => array.map(x => s"$x|$id").mkString("~")
    }
    s"[$r]"
  })

  val resDF = df.withColumn("New_Column", arrUDF('ID, 'Data))
  resDF.show(false)

  //+---+------------------------+-----------------------------------------+
  //|ID |Data                    |New_Column                               |
  //+---+------------------------+-----------------------------------------+
  //|22 |[1, 2, 3, 4]            |[1|22~2|22~3|22~4|22]                    |
  //|23 |[1, 2, 3, 4, 5, 6, 7, 8]|[1|23~2|23~3|23~4|23~5|23~6|23~7|23~8|23]|
  //|24 |[]                      |[]                                       |
  //+---+------------------------+-----------------------------------------+
}
Spark 2.4+
The PySpark equivalent goes like this:
from pyspark.sql import functions as f

df = spark.createDataFrame([(22, [1,2,3,4]), (23, [1,2,3,4,5,6,7,8]), (24, [])], ['Id', 'Data'])
df.show()
+---+--------------------+
| Id| Data|
+---+--------------------+
| 22| [1, 2, 3, 4]|
| 23|[1, 2, 3, 4, 5, 6...|
| 24| []|
+---+--------------------+
df.withColumn('ff', f.when(f.size('Data')==0,'').otherwise(f.expr('''concat_ws('~',transform(Data, x->concat(x,'|',Id)))'''))).show(20,False)
+---+------------------------+---------------------------------------+
|Id |Data |ff |
+---+------------------------+---------------------------------------+
|22 |[1, 2, 3, 4] |1|22~2|22~3|22~4|22 |
|23 |[1, 2, 3, 4, 5, 6, 7, 8]|1|23~2|23~3|23~4|23~5|23~6|23~7|23~8|23|
|24 |[] | |
+---+------------------------+---------------------------------------+
If you want the final output as an array:
df.withColumn('ff',f.array(f.when(f.size('Data')==0,'').otherwise(f.expr('''concat_ws('~',transform(Data, x->concat(x,'|',Id)))''')))).show(20,False)
+---+------------------------+-----------------------------------------+
|Id |Data |ff |
+---+------------------------+-----------------------------------------+
|22 |[1, 2, 3, 4] |[1|22~2|22~3|22~4|22] |
|23 |[1, 2, 3, 4, 5, 6, 7, 8]|[1|23~2|23~3|23~4|23~5|23~6|23~7|23~8|23]|
|24 |[] |[] |
+---+------------------------+-----------------------------------------+
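If you prefer to stay in the Python column API instead of a SQL expr string, roughly the same thing can be written with f.transform on Spark 3.1+; a sketch under that assumption, reusing the df and the functions import aliased as f above:
df.withColumn(
    'ff',
    f.concat_ws('~', f.transform('Data', lambda x: f.concat(
        x.cast('string'), f.lit('|'), f.col('Id').cast('string'))))
).show(20, False)
concat_ws returns an empty string for an empty array, so the explicit size check is not needed here.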
Hope this helps
A udf can help:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

def func(array, suffix):
    return '~'.join([str(x) + '|' + str(suffix) for x in array])

my_udf = F.udf(func, StringType())
df.withColumn("New_Column", my_udf("Data", "ID")).show()
prints
+------------+---+-------------------+
|        Data| ID|         New_Column|
+------------+---+-------------------+
|[1, 2, 3, 4]| 22|1|22~2|22~3|22~4|22|
+------------+---+-------------------+
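If you also want the bracket-wrapped form shown in the question's expected output, you can wrap the UDF result; a small sketch on top of the answer above:
df.withColumn(
    "New_Column",
    F.concat(F.lit("["), my_udf("Data", "ID"), F.lit("]"))
).show(truncate=False)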
I have a dataframe as shown below:
+-----+------------------------+
|Index| finalArray |
+-----+------------------------+
|1 |[0, 2, 0, 3, 1, 4, 2, 7]|
|2 |[0, 4, 4, 3, 4, 2, 2, 5]|
+-----+------------------------+
I want to break the array into chunks of 2, find the sum of each chunk, and store the resulting array in the column finalArray. It will look like below:
+-----+---------------------+
|Index| finalArray |
+-----+---------------------+
|1 |[2, 3, 5, 9] |
|2 |[4, 7, 6, 7] |
+-----+---------------------+
I am able to do it by creating a UDF, but I am looking for a better and more optimised way. Preferably I would like to handle it using withColumn and passing flagArray, without having to write a UDF.
@udf(ArrayType(DoubleType()))
def aggregate(finalArray, chunkSize):
    n = int(chunkSize)
    aggsum = []
    final = [finalArray[i * n:(i + 1) * n] for i in range((len(finalArray) + n - 1) // n)]
    for item in final:
        agg = 0
        for j in item:
            agg += j
        aggsum.append(agg)
    return aggsum
I was not able to use the expression below inside the UDF, hence I used loops:
[sum(finalArray[x:x+2]) for x in range(0, len(finalArray), chunkSize)]
For Spark 2.4+, you can try sequence + transform:
from pyspark.sql.functions import expr
df = spark.createDataFrame([
(1, [0, 2, 0, 3, 1, 4, 2, 7]),
(2, [0, 4, 4, 3, 4, 2, 2, 5])
], ["Index", "finalArray"])
df.withColumn("finalArray", expr("""
transform(
sequence(0,ceil(size(finalArray)/2)-1),
i -> finalArray[2*i] + ifnull(finalArray[2*i+1],0))
""")).show(truncate=False)
+-----+------------+
|Index|finalArray |
+-----+------------+
|1 |[2, 3, 5, 9]|
|2 |[4, 7, 6, 7]|
+-----+------------+
For a chunk size of any N, use the aggregate function to compute the sub-totals:
N = 3
sql_expr = """
transform(
/* create a sequence from 0 to number_of_chunks-1 */
sequence(0,ceil(size(finalArray)/{0})-1),
/* iterate the above sequence */
i ->
/* create a sequence from 0 to chunk_size-1
calculate the sum of values containing every chunk_size items by their indices
*/
aggregate(
sequence(0,{0}-1),
0L,
(acc, y) -> acc + ifnull(finalArray[i*{0}+y],0)
)
)
"""
df.withColumn("finalArray", expr(sql_expr.format(N))).show()
+-----+----------+
|Index|finalArray|
+-----+----------+
| 1| [2, 8, 9]|
| 2| [8, 9, 7]|
+-----+----------+
Here is a slightly different version of @jxc's solution, using the slice function with the transform and aggregate functions.
The logic is: for each element of the array, check whether its index is a multiple of the chunk size; if it is, use slice to take a sub-array of chunk size and aggregate to sum its elements, otherwise emit null. Finally, filter removes the nulls (which correspond to the indices that do not satisfy i % chunk = 0).
chunk = 2

transform_expr = f"""
  filter(transform(finalArray,
           (x, i) -> IF(i % {chunk} = 0,
                        aggregate(slice(finalArray, i+1, {chunk}), 0L, (acc, y) -> acc + y),
                        null)
         ),
         x -> x is not null)
"""
df.withColumn("finalArray", expr(transform_expr)).show()
#+-----+------------+
#|Index| finalArray|
#+-----+------------+
#| 1|[2, 3, 5, 9]|
#| 2|[4, 7, 6, 7]|
#+-----+------------+
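For what it's worth, on Spark 3.1+ (where transform, aggregate and slice accept Python lambdas and Column arguments) the same chunked sums can be sketched with the DataFrame API instead of an expr string; N and the column name are taken from the examples above:
from pyspark.sql import functions as F

N = 2
df.withColumn(
    "finalArray",
    F.transform(
        # one entry per chunk: 1 .. ceil(size/N)
        F.sequence(F.lit(1), F.ceil(F.size("finalArray") / N).cast("int")),
        # sum the N elements of the i-th chunk (slice is 1-based)
        lambda i: F.aggregate(
            F.slice("finalArray", (i - 1) * N + 1, N),
            F.lit(0).cast("long"),
            lambda acc, x: acc + x,
        ),
    ),
).show(truncate=False)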
I have a pyspark dataframe:
id | column
---|-----------------------
1  | [0.2, 2, 3, 4, 3, 0.5]
2  | [7, 0.3, 0.3, 8, 2]
I would like to create 3 columns:
Column 1: contains the sum of the elements < 2
Column 2: contains the sum of the elements > 2
Column 3: contains the sum of the elements = 2 (sometimes I have duplicate values, so I sum them). If there are no such values, I put null.
Expected result:
id | column                 | column<2 | column>2 | column=2
---|------------------------|----------|----------|---------
1  | [0.2, 2, 3, 4, 3, 0.5] | [0.7]    | [12]     | null
2  | [7, 0.3, 0.3, 8, 2]    | [0.6]    | [15]     | [2]
Can you help me please ?
Thank you
For Spark 2.4+, you can use the aggregate and filter higher-order functions like this:
from pyspark.sql.functions import expr

df.withColumn("column<2", expr("aggregate(filter(column, x -> x < 2), 0D, (acc, x) -> acc + x)")) \
  .withColumn("column>2", expr("aggregate(filter(column, x -> x > 2), 0D, (acc, x) -> acc + x)")) \
  .withColumn("column=2", expr("aggregate(filter(column, x -> x == 2), 0D, (acc, x) -> acc + x)")) \
  .show(truncate=False)
Gives:
+---+------------------------------+--------+--------+--------+
|id |column |column<2|column>2|column=2|
+---+------------------------------+--------+--------+--------+
|1 |[0.2, 2.0, 3.0, 4.0, 3.0, 0.5]|0.7 |10.0 |2.0 |
|2 |[7.0, 0.3, 0.3, 8.0, 2.0] |0.6 |15.0 |2.0 |
+---+------------------------------+--------+--------+--------+
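On Spark 3.1+ the same idea can also be written with the Python API (functions.aggregate and functions.filter) instead of expr strings; a sketch, assuming the column names from the question (sum_where is just an illustrative helper):
from pyspark.sql import functions as F

def sum_where(cond):
    # sum of the elements of `column` that satisfy cond (0.0 when none match)
    return F.aggregate(F.filter("column", cond), F.lit(0.0), lambda acc, x: acc + x)

df.withColumn("column<2", sum_where(lambda x: x < 2)) \
  .withColumn("column>2", sum_where(lambda x: x > 2)) \
  .withColumn("column=2", sum_where(lambda x: x == 2)) \
  .show(truncate=False)
Like the expr version above, this yields 0.0 rather than null when no element matches.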
Here's a way you can try:
import pyspark.sql.functions as F
from pyspark.sql import Row

# using the rdd, split the list by condition and sum each part
s = (df
     .select('column')
     .rdd
     .map(lambda x: [[i for i in x.column if i < 2],
                     [i for i in x.column if i > 2],
                     [i for i in x.column if i == 2]])
     .map(lambda x: [Row(round(sum(i), 2)) for i in x])
     .toDF(['col<2', 'col>2', 'col=2']))

# create a dummy id so we can join both data frames
df = df.withColumn('mid', F.monotonically_increasing_id())
s = s.withColumn('mid', F.monotonically_increasing_id())

# simple join on the dummy id
df = df.join(s, on='mid').drop('mid')
df.show()
+---+--------------------+-----+------+-----+
| id| column|col<2| col>2|col=2|
+---+--------------------+-----+------+-----+
| 0|[0.2, 2.0, 3.0, 4...|[0.7]|[10.0]|[2.0]|
| 1|[7.0, 0.3, 0.3, 8...|[0.6]|[15.0]|[2.0]|
+---+--------------------+-----+------+-----+
For Spark 2.4+, you can use the aggregate function and do the calculation in one step:
from pyspark.sql.functions import expr
# I adjusted the 2nd array-item in id=1 from 2.0 to 2.1 so there is no `2.0` when id=1
df = spark.createDataFrame([(1,[0.2, 2.1, 3., 4., 3., 0.5]),(2,[7., 0.3, 0.3, 8., 2.,])],['id','column'])
df.withColumn('data', expr("""
  aggregate(
    /* ArrayType argument */
    column,
    /* zero: an empty array to initialize acc */
    array(),
    /* merge: iterate through `column` and reduce based on the value of y and the array indices of acc */
    (acc, y) ->
      CASE
        WHEN y < 2.0 THEN array(IFNULL(acc[0],0) + y, acc[1], acc[2])
        WHEN y > 2.0 THEN array(acc[0], IFNULL(acc[1],0) + y, acc[2])
        ELSE array(acc[0], acc[1], IFNULL(acc[2],0) + y)
      END,
    /* finish: convert the array into a named_struct */
    acc -> (acc[0] as `column<2`, acc[1] as `column>2`, acc[2] as `column=2`)
  )
""")).selectExpr('id', 'data.*').show()
#+---+--------+--------+--------+
#| id|column<2|column>2|column=2|
#+---+--------+--------+--------+
#| 1| 0.7| 12.1| null|
#| 2| 0.6| 15.0| 2.0|
#+---+--------+--------+--------+
Before Spark 2.4, functional support for ArrayType is limited; you can do it with explode and then groupby + pivot:
from pyspark.sql.functions import sum as fsum, expr
df.selectExpr('id', 'explode_outer(column) as item') \
.withColumn('g', expr('if(item < 2, "column<2", if(item > 2, "column>2", "column=2"))')) \
.groupby('id') \
.pivot('g', ["column<2", "column>2", "column=2"]) \
.agg(fsum('item')) \
.show()
#+---+--------+--------+--------+
#| id|column<2|column>2|column=2|
#+---+--------+--------+--------+
#| 1| 0.7| 12.1| null|
#| 2| 0.6| 15.0| 2.0|
#+---+--------+--------+--------+
In case explode is slow (e.g. SPARK-21657, seen before Spark 2.3), use a UDF:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, DoubleType
schema = StructType([
    StructField("column>2", DoubleType()),
    StructField("column<2", DoubleType()),
    StructField("column=2", DoubleType())
])

def split_data(arr):
    d = {}
    if arr is None:
        arr = []
    for y in arr:
        if y > 2:
            d['column>2'] = d.get('column>2', 0) + y
        elif y < 2:
            d['column<2'] = d.get('column<2', 0) + y
        else:
            d['column=2'] = d.get('column=2', 0) + y
    return d

udf_split_data = udf(split_data, schema)
df.withColumn('data', udf_split_data('column')).selectExpr('id', 'data.*').show()