Subset one array column with another (boolean) array column - apache-spark

I have a Dataframe like this (in Pyspark 2.3.1):
from pyspark.sql import Row
my_data = spark.createDataFrame([
Row(a=[9, 3, 4], b=['a', 'b', 'c'], mask=[True, False, False]),
Row(a=[7, 2, 6, 4], b=['w', 'x', 'y', 'z'], mask=[True, False, True, False])
])
my_data.show(truncate=False)
#+------------+------------+--------------------------+
#|a |b |mask |
#+------------+------------+--------------------------+
#|[9, 3, 4] |[a, b, c] |[true, false, false] |
#|[7, 2, 6, 4]|[w, x, y, z]|[true, false, true, false]|
#+------------+------------+--------------------------+
Now I'd like to use the mask column in order to subset the a and b columns:
my_desired_output = spark.createDataFrame([
Row(a=[9], b=['a']),
Row(a=[7, 6], b=['w', 'y'])
])
my_desired_output.show(truncate=False)
#+------+------+
#|a |b |
#+------+------+
#|[9] |[a] |
#|[7, 6]|[w, y]|
#+------+------+
What's the "idiomatic" way to achieve this? The current solution I have involves map-ing over the underlying RDD and subsetting with Numpy, which seems inelegant:
import numpy as np
def subset_with_mask(row):
mask = np.asarray(row.mask)
a_masked = np.asarray(row.a)[mask].tolist()
b_masked = np.asarray(row.b)[mask].tolist()
return Row(a=a_masked, b=b_masked)
my_desired_output = spark.createDataFrame(my_data.rdd.map(subset_with_mask))
Is this the best way to go, or is there something better (less verbose and/or more efficient) I can do using Spark SQL tools?

One option is to use a UDF, which you can optionally specialize by the data type in the array:
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T
def _mask_list(lst, mask):
return np.asarray(lst)[mask].tolist()
mask_array_int = F.udf(_mask_list, T.ArrayType(T.IntegerType()))
mask_array_str = F.udf(_mask_list, T.ArrayType(T.StringType()))
my_desired_output = my_data
my_desired_output = my_desired_output.withColumn(
'a', mask_array_int(F.col('a'), F.col('mask'))
)
my_desired_output = my_desired_output.withColumn(
'b', mask_array_str(F.col('b'), F.col('mask'))
)

UDFs mentioned in the previous answer is probably the way to go prior to the array functions added in Spark 2.4. For the sake of completeness, here is a "pure SQL" implementation before 2.4.
from pyspark.sql.functions import *
df = my_data.withColumn("row", monotonically_increasing_id())
df1 = df.select("row", posexplode("a").alias("pos", "a"))
df2 = df.select("row", posexplode("b").alias("pos", "b"))
df3 = df.select("row", posexplode("mask").alias("pos", "mask"))
df1\
.join(df2, ["row", "pos"])\
.join(df3, ["row", "pos"])\
.filter("mask")\
.groupBy("row")\
.agg(collect_list("a").alias("a"), collect_list("b").alias("b"))\
.select("a", "b")\
.show()
Output:
+------+------+
| a| b|
+------+------+
|[7, 6]|[w, y]|
| [9]| [a]|
+------+------+

A better way to do this is to use pyspark.sql.functions.expr, filter, and transform:
import pandas as pd
from pyspark.sql import (
functions as F,
SparkSession
)
spark = SparkSession.builder.master('local[4]').getOrCreate()
bool_df = pd.DataFrame([
['a', [0, 1, 2, 3, 4], [True]*4 + [False]],
['b', [5, 6, 7, 8, 9], [False, True, False, True, False]]
], columns=['id', 'int_arr', 'bool_arr'])
bool_sdf = spark.createDataFrame(bool_df)
def filter_with_mask(in_col, mask_col, out_name="masked_arr"):
filt_input = f'arrays_zip({in_col}, {mask_col})'
filt_func = f'x -> x.{mask_col}'
trans_func = f'x -> x.{in_col}'
result = F.expr(f'''transform(
filter({filt_input}, {filt_func}), {trans_func}
)''').alias
return result
Using the function:
bool_sdf.select(
'*', filter_with_mask('int_arr', 'bool_arr', bool_sdf)
).toPandas()
Results in:
id int_arr bool_arr masked_arr
a [0, 1, 2, 3, 4] [True, True, True, True, False] [0, 1, 2, 3]
b [5, 6, 7, 8, 9] [False, True, False, True, False] [6, 8]
This should be possible with pyspark >= 2.4.0 and python >= 3.6.

Related

PySpark RDD: Manipulating Inner Array

I have a dataset (for example)
sc = SparkContext()
x = [(1, [2, 3, 4, 5]), (2, [2, 7, 8, 10])]
y = sc.parallelize(x)
print(y.take(1))
The print statement returns [(1, [2, 3, 4, 5])]
I now need to multiply everything in the sub-array by 2 across the RDD. Since I have already parallelized, I can't further break down "y.take(1)" to multiply [2, 3, 4, 5] by 2.
How can I essentially isolate the inner array across my worker nodes to then do the multiplication?
I think you can use map with a lambda function:
y = sc.parallelize(x).map(lambda x: (x[0], [2*t for t in x[1]]))
Then y.take(2) returns:
[(1, [4, 6, 8, 10]), (2, [4, 14, 16, 20])]
It will be more efficient if you use DataFrame API instead of RDDs - in this case all your processing will happen without serialization to Python that happens when you use RDD APIs.
For example you can use the transform function to apply transformation to array values:
import pyspark.sql.functions as F
df = spark.createDataFrame([(1, [2, 3, 4, 5]), (2, [2, 7, 8, 10])],
schema="id int, arr array<int>")
df2 = df.select("id", F.transform("arr", lambda x: x*2).alias("arr"))
df2.show()
will give you desired:
+---+---------------+
| id| arr|
+---+---------------+
| 1| [4, 6, 8, 10]|
| 2|[4, 14, 16, 20]|
+---+---------------+

processing of a complex object (array) in pyspark

I am trying to figure out possible ways to process complex objects in pyspark. In the example below one of the columns of the dataframe is an array of integers. The processing is simply adding one to each value. Are these acceptable methods or there is a better practice?
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
spark = SparkSession.builder.enableHiveSupport().appName('learn').getOrCreate()
data = [('a', 1, [1, 3, 5]),
('b', 2, [4, 6, 9]),
('c', 3, [50, 60, 70, 80])]
df = spark.createDataFrame(data, ['nam', 'q', 'compl'])
# process complex object, method 1 using explode and collect_list (dataframe API)
res = df.withColumn('id', f.monotonically_increasing_id()).withColumn('compl_exploded', f.explode(f.col('compl')))
res = res.withColumn('compl_exploded', f.col('compl_exploded')+1)
res = res.groupby('id').agg(f.first('nam'), f.first('q'), f.collect_list('compl_exploded').alias('compl')).drop('id')
res.show()
# process complex object, method 2 using explode and collect_list (SQL)
df.withColumn('id', f.monotonically_increasing_id()).createOrReplaceTempView('tmp_view')
res = spark.sql("""
SELECT first(nam) AS nam, first(q) AS q, collect_list(compl_exploded+1) AS compl FROM (
SELECT *, explode(compl) AS compl_exploded FROM tmp_view
) x
GROUP BY id
""")
res.show()
# process complex object, method 3 using python UDF
from typing import List
def process(x: List[int]) -> List[int]:
return [_+1 for _ in x]
process_udf = f.udf(process, ArrayType(LongType()))
res = df.withColumn('compl', process_udf('compl'))
res.show()
For such operation you can take advantage of in build functions.
For e.g in your usecase you can use transform like below :
pyspark<=3.0
# Option 1
import pyspark.sql.functions as f
df.withColumn('add_one',f.expr('transform(compl, x -> x+1)')).show()
+---+---+----------------+----------------+
|nam| q| compl| add_one|
+---+---+----------------+----------------+
| a| 1| [1, 3, 5]| [2, 4, 6]|
| b| 2| [4, 6, 9]| [5, 7, 10]|
| c| 3|[50, 60, 70, 80]|[51, 61, 71, 81]|
+---+---+----------------+----------------+
# OR below options , all will give same output
# Option 2
df.select('nam', 'q', 'compl' , f.expr('transform(compl, x -> x+1) as add_one')).show()
# Option 3
df.createOrReplaceTempView('tmp_view')
spark.sql( 'select nam, q, compl , transform(compl, x -> x+1) as add_one from tmp_view').show()
pyspark>=3.1.0
If you are using newer version of spark then this function is easily available and you can use without expr.

Alternative of groupby in Pyspark to improve performance of Pyspark code

My Pyspark data frame looks like this. I have to remove group by function from pyspark code to increase the performance of the code. I have to perform operations on 100k data.
[Initial Data]
To create Dataframe
df = spark.createDataFrame([
(0, ['-9.53', '-9.35', '0.18']),
(1, ['-7.77', '-7.61', '0.16']),
(2, ['-5.80', '-5.71', '0.10']),
(0, ['1', '2', '3']),
(1, ['4', '5', '6']),
(2, ['8', '98', '32'])
], ["id", "Array"])
And the expected output is produced using this code.
import pyspark.sql.functions as f
df.groupBy('id').agg(f.collect_list(f.col("Array")).alias('Array')).\
select("id",f.flatten("Array")).show()
I have to achieve the output in this format. The above code is giving me this output. I have to achieve the same by removing the groupby function.
+---+-------------------------------+
|id |flatten(Array) |
+---+-------------------------------+
|0 |[-9.53, -9.35, 0.18, 1, 2, 3] |
|1 |[-7.77, -7.61, 0.16, 4, 5, 6] |
|2 |[-5.80, -5.71, 0.10, 8, 98, 32]|
+---+-------------------------------+
If you don't want to do group by, you can use window functions:
import pyspark.sql.functions as f
from pyspark.sql.window import Window
df2 = df.select(
"id",
f.flatten(f.collect_list(f.col("Array")).over(Window.partitionBy("id"))).alias("Array")
).distinct()
df2.show(truncate=False)
+---+-------------------------------+
|id |Array |
+---+-------------------------------+
|0 |[-9.53, -9.35, 0.18, 1, 2, 3] |
|1 |[-7.77, -7.61, 0.16, 4, 5, 6] |
|2 |[-5.80, -5.71, 0.10, 8, 98, 32]|
+---+-------------------------------+
You can also try
df.select(
'id',
f.explode('Array').alias('Array')
).groupBy('id').agg(f.collect_list('Array').alias('Array'))
Although I'm not sure if it'll be faster.

Ffill and interpolate koalas dataframe

Is it possible to interpolate and ffill different columns in a Koalas dataframe something like this?
%%spark -s sparkenv2
kdf = ks.DataFrame({
'id':[1,2,3,4],
'A': [None, 3, None, None],
'B': [2, 4, None, 3],
'C': [99, None, None, 1],
'D': [0, 1, 5, 4]
},
columns=['id','A', 'B', 'C', 'D'])
kdf['A']=kdf['A'].ffill()
kdf['B']=kdf['B'].interpolate()
For ffill, this is taken from John Paton's blog
from pyspark.sql import Window
from pyspark.sql.functions import last
spark_df = kdf.to_spark()
# define the window
window = Window.orderBy('id').rowsBetween(-sys.maxsize, 0)
# define the forward-filled column
filled_column = last(spark_df['A'], ignorenulls=True).over(window)
# do the fill
spark_df_filled = spark_df.withColumn('A_filled', filled_column)
I have no answer for interpolate - still trying to find it myself.
PS - You can switch to backfill, by changing rowsBetween(0, max.size) and using first() rather than last().

Python Pandas - Update row with dictionary based on index, column

I have a dataframe with empty columns and a corresponding dictionary which I would like to update the empty columns with based on index, column:
import pandas as pd
import numpy as np
dataframe = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [4, 6, 2], [3, 4, 1]])
dataframe.columns = ['x', 'y', 'z']
additional_cols = ['a', 'b', 'c']
for col in additional_cols:
dataframe[col] = np.nan
x y z a b c
0 1 2 3
1 4 5 6
2 7 8 9
3 4 6 2
4 3 4 1
for row, column in x.iterrows():
#caluclations to return dictionary y
y = {"a": 5, "b": 6, "c": 7}
df.loc[row, :].map(y)
Basically after performing the calculations using columns x, y, z I would like to update columns a, b, c for that same row :)
I could use a function as such but as far as the pandas library and a method for the DataFrame object I am not sure...
def update_row_with_dict(dictionary, dataframe, index):
for key in dictionary.keys():
dataframe.loc[index, key] = dictionary.get(key)
The above answer with correct indent
def update_row_with_dict(df,d,idx):
for key in d.keys():
df.loc[idx, key] = d.get(key)
more short would be
def update_row_with_dict(df,d,idx):
df.loc[idx,d.keys()] = d.values()
for your code snipped the syntax would be:
import pandas as pd
import numpy as np
dataframe = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9], [4, 6, 2], [3, 4, 1]])
dataframe.columns = ['x', 'y', 'z']
additional_cols = ['a', 'b', 'c']
for col in additional_cols:
dataframe[col] = np.nan
for idx in dataframe.index:
y = {'a':1,'b':2,'c':3}
update_row_with_dict(dataframe,y,idx)

Resources