PySpark replace less frequent items with most frequent items - apache-spark

I have a categorical column in a data frame with several levels, and I would like to replace the less frequent levels (those whose frequency, as a percentage of the total, falls below a specified threshold) with the most frequent level. How can I do that in an elegant and compact way?
Below is an example. If I set the threshold to 0.3, then level "c" should be replaced by level "a", since its frequency is only 1/6, which is below 0.3.
from pyspark.sql import Row
row = Row("foo")
df = sc.parallelize([ row("a"), row("b"), row("c"), row("a"), row("a"), row("b") ]).toDF()

from pyspark.sql import Row
import pyspark.sql.functions as f

# sample data
row = Row("foo")
df = sc.parallelize([row("a"), row("b"), row("c"), row("a"), row("a"), row("b")]).toDF()

df_temp = df.groupBy('foo').agg((f.count(f.lit(1)) / df.count()).alias("frequency"))
most_frequent_foo = df_temp.sort(f.col('frequency').desc()).select('foo').first()[0]
df_temp = df_temp.withColumn('foo_replaced',
    f.when(f.col("frequency") < 0.3, f.lit(most_frequent_foo)).otherwise(f.col('foo')))
df_final = df.join(df_temp, df.foo == df_temp.foo, 'left').drop(df_temp.foo).drop("frequency")
df_final.show()
Output is:
+---+------------+
|foo|foo_replaced|
+---+------------+
| c| a|
| b| b|
| b| b|
| a| a|
| a| a|
| a| a|
+---+------------+
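If you only need the single cleaned-up column, a possible follow-up on the result above (same names as in the answer):
df_final.select(f.col('foo_replaced').alias('foo')).show()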

Related

Explode multiple array columns in pyspark [duplicate]

I have a dataframe whose columns contain lists, similar to the following. The lists in the different columns are not all the same length.
Name Age Subjects Grades
[Bob] [16] [Maths,Physics,Chemistry] [A,B,C]
I want to explode the dataframe in such a way that I get the following output:
Name Age Subjects Grades
Bob 16 Maths A
Bob 16 Physics B
Bob 16 Chemistry C
How can I achieve this?
PySpark has added an arrays_zip function in 2.4, which eliminates the need for a Python UDF to zip the arrays.
import pyspark.sql.functions as F
from pyspark.sql.types import *
df = sql.createDataFrame(
[(['Bob'], [16], ['Maths','Physics','Chemistry'], ['A','B','C'])],
['Name','Age','Subjects', 'Grades'])
df = df.withColumn("new", F.arrays_zip("Subjects", "Grades"))\
.withColumn("new", F.explode("new"))\
.select("Name", "Age", F.col("new.Subjects").alias("Subjects"), F.col("new.Grades").alias("Grades"))
df.show()
+-----+----+---------+------+
| Name| Age| Subjects|Grades|
+-----+----+---------+------+
|[Bob]|[16]| Maths| A|
|[Bob]|[16]| Physics| B|
|[Bob]|[16]|Chemistry| C|
+-----+----+---------+------+
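Note that Name and Age are still single-element arrays in this output; assuming they always hold exactly one value, one way to flatten them is to take the first element:
df.select(F.col("Name")[0].alias("Name"), F.col("Age")[0].alias("Age"), "Subjects", "Grades").show()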
This works,
import pyspark.sql.functions as F
from pyspark.sql.types import *
df = sql.createDataFrame(
[(['Bob'], [16], ['Maths','Physics','Chemistry'], ['A','B','C'])],
['Name','Age','Subjects', 'Grades'])
df.show()
+-----+----+--------------------+---------+
| Name| Age| Subjects| Grades|
+-----+----+--------------------+---------+
|[Bob]|[16]|[Maths, Physics, ...|[A, B, C]|
+-----+----+--------------------+---------+
Use a udf with zip. The columns that need to be exploded have to be merged before exploding.
combine = F.udf(lambda x, y: list(zip(x, y)),
ArrayType(StructType([StructField("subs", StringType()),
StructField("grades", StringType())])))
df = df.withColumn("new", combine("Subjects", "Grades"))\
.withColumn("new", F.explode("new"))\
.select("Name", "Age", F.col("new.subs").alias("Subjects"), F.col("new.grades").alias("Grades"))
df.show()
+-----+----+---------+------+
| Name| Age| Subjects|Grades|
+-----+----+---------+------+
|[Bob]|[16]| Maths| A|
|[Bob]|[16]| Physics| B|
|[Bob]|[16]|Chemistry| C|
+-----+----+---------+------+
Arriving late to the party :-)
The simplest way to go is to use inline, which doesn't have a Python API but is supported by selectExpr.
df.selectExpr('Name[0] as Name','Age[0] as Age','inline(arrays_zip(Subjects,Grades))').show()
+----+---+---------+------+
|Name|Age| Subjects|Grades|
+----+---+---------+------+
| Bob| 16| Maths| A|
| Bob| 16| Physics| B|
| Bob| 16|Chemistry| C|
+----+---+---------+------+
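Newer Spark releases (3.4+, I believe) also expose inline in the Python API, so a rough equivalent without selectExpr would be:
from pyspark.sql import functions as F
df.select(F.col('Name')[0].alias('Name'), F.col('Age')[0].alias('Age'), F.inline(F.arrays_zip('Subjects', 'Grades'))).show()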
Have you tried this?
df.select(explode(col("Subjects")).alias("Subjects")).show()
(If Subjects were a comma-separated string rather than an array, you would first split it, e.g. explode(split(col("Subjects"), ",")).)
You can also convert the data frame to an RDD and use a flatMap function to separate the Subjects.
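A minimal sketch of that RDD route, assuming the sample df above (single-element Name/Age arrays and equal-length Subjects/Grades lists):
from pyspark.sql import Row
rows = df.rdd.flatMap(
    lambda r: [Row(Name=r.Name[0], Age=r.Age[0], Subjects=s, Grades=g)
               for s, g in zip(r.Subjects, r.Grades)])
rows.toDF().show()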
A copy/paste function for when you need to repeat this quickly across a large number of columns in a dataset:
import pyspark.sql.functions as f

cols = ["word", "stem", "pos", "ner"]

def explode_cols(data, cols):
    data = data.withColumn('exp_combo', f.arrays_zip(*cols))
    data = data.withColumn('exp_combo', f.explode('exp_combo'))
    for col in cols:
        data = data.withColumn(col, f.col('exp_combo.' + col))
    return data.drop('exp_combo')

result = explode_cols(data, cols)
You're welcome :)
When exploding multiple columns, the above solution comes in handy only when the arrays have the same length. If they do not, it is better to explode them separately and take distinct values each time.
df = sql.createDataFrame(
[(['Bob'], [16], ['Maths','Physics','Chemistry'], ['A','B','C'])],
['Name','Age','Subjects', 'Grades'])
df = df.withColumn('Subjects',F.explode('Subjects')).select('Name','Age','Subjects', 'Grades').distinct()
df = df.withColumn('Grades',F.explode('Grades')).select('Name','Age','Subjects', 'Grades').distinct()
df.show()
+----+---+---------+------+
|Name|Age| Subjects|Grades|
+----+---+---------+------+
| Bob| 16| Maths| A|
| Bob| 16| Physics| B|
| Bob| 16|Chemistry| C|
+----+---+---------+------+
Thanks @nasty for saving the day.
Just small tweaks to get the code working.
from pyspark.sql.functions import arrays_zip, explode, col

def explode_cols(df, cl):
    df = df.withColumn('exp_combo', arrays_zip(*cl))
    df = df.withColumn('exp_combo', explode('exp_combo'))
    for colm in cl:
        df = df.withColumn(colm, col('exp_combo.' + colm))
    return df.drop('exp_combo')

How to replace negative values with 0 in pyspark dataframe

I want to replace all negative values with 0 and NaN values with 0 in a pyspark dataframe with integer columns. I tried
df[df < 0] = 0
but I get an error.
You can do this with a combination of reduce and when.
to_convert contains the list of columns you wish to convert to 0.
Data Preparation
input_str = """
|-1|100
|10|-10
|200|-300
|-500|300
""".split("|")
input_values = list(map(lambda x: int(x.strip()), input_str[1:]))
input_list = [(x, y) for x, y in zip(input_values[0::2], input_values[1::2])]
sparkDF = sql.createDataFrame(input_list, ["a", "b"])
sparkDF.show()
+----+----+
| a| b|
+----+----+
| -1| 100|
| 10| -10|
| 200|-300|
|-500| 300|
+----+----+
Reduce and When
from functools import reduce
import pyspark.sql.functions as F

to_convert = set(['a'])
sparkDF = reduce(
lambda df, x: df.withColumn(x, F.when(F.col(x) < 0, 0).otherwise(F.col(x))),
to_convert,
sparkDF,
)
sparkDF.show()
+---+----+
| a| b|
+---+----+
| 0| 100|
| 10| -10|
|200|-300|
| 0| 300|
+---+----+
You can replace null values with 0 (or any value of your choice) across all columns with the df.fillna(0) method. However, to replace negative values across columns, I don't think there is any direct approach, except using a case/when on each column, as below.
from pyspark.sql import functions as F
df.withColumn(
"col1",
F.when(df["col1"] < 0, 0).when(F.col("col1").isNull(), 0).otherwise(F.col("col1")),
)
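If you need the same treatment on several integer columns, a small sketch that loops the expression over a hypothetical list of column names:
from pyspark.sql import functions as F
cols_to_fix = ["col1", "col2"]  # hypothetical list of columns to clean
df = df.fillna(0, subset=cols_to_fix)  # nulls (and NaNs in float columns) become 0
for c in cols_to_fix:
    df = df.withColumn(c, F.when(F.col(c) < 0, 0).otherwise(F.col(c)))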

Dot Products of Rows of a Dataframe with a Fixed Vector in Spark

I have a dataframe (df1) with m rows and n columns in Spark. I have another dataframe (df2) with 1 row and n columns. How can I efficiently compute the dot product of each row of df1 with the single row of df2?
We can use VectorAssembler to do the dot product.
Create sample DataFrames:
from pyspark.ml.linalg import Vectors, DenseVector
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import *
v = [('a', 1,2,3),
('b', 4,5,6),
('c', 9,8,7)]
df1 = spark.createDataFrame(v, ['id', 'v1', 'v2', 'v3'])
df2 = spark.createDataFrame([('d',3,2,1)], ['id', 'v1', 'v2', 'v3'])
df1.show()
df2.show()
They look like this:
+---+---+---+---+
| id| v1| v2| v3|
+---+---+---+---+
| a| 1| 2| 3|
| b| 4| 5| 6|
| c| 9| 8| 7|
+---+---+---+---+
+---+---+---+---+
| id| v1| v2| v3|
+---+---+---+---+
| d| 3| 2| 1|
+---+---+---+---+
Use VectorAssembler to convert the columns to Vector
vecAssembler = VectorAssembler(inputCols=["v1", "v2", "v3"], outputCol="values")
dfv1 = vecAssembler.transform(df1)
dfv2 = vecAssembler.transform(df2)
dfv1.show()
dfv2.show()
Now they look like this:
+---+---+---+---+-------------+
| id| v1| v2| v3| values|
+---+---+---+---+-------------+
| a| 1| 2| 3|[1.0,2.0,3.0]|
| b| 4| 5| 6|[4.0,5.0,6.0]|
| c| 9| 8| 7|[9.0,8.0,7.0]|
+---+---+---+---+-------------+
+---+---+---+---+-------------+
| id| v1| v2| v3| values|
+---+---+---+---+-------------+
| d| 3| 2| 1|[3.0,2.0,1.0]|
+---+---+---+---+-------------+
Define a udf to do the dot product
# Get the fixed vector from DataFrame dfv2
vm = Vectors.dense(dfv2.take(1)[0]['values'])
dot_prod_udf = F.udf(lambda v: float(v.dot(vm)), FloatType())
dfv1 = dfv1.withColumn('dot_prod', dot_prod_udf('values'))
dfv1.show()
The final result is:
+---+---+---+---+-------------+--------+
| id| v1| v2| v3| values|dot_prod|
+---+---+---+---+-------------+--------+
| a| 1| 2| 3|[1.0,2.0,3.0]| 10.0|
| b| 4| 5| 6|[4.0,5.0,6.0]| 28.0|
| c| 9| 8| 7|[9.0,8.0,7.0]| 50.0|
+---+---+---+---+-------------+--------+
You can simply form a matrix from the first data frame and another matrix from the second data frame and multiply them. Here is a code snippet (I'm using a block matrix here, since I assume your data frame cannot be stored on your local machine).
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

v = [('a', [1, 2, 3]),
     ('b', [4, 5, 6]),
     ('c', [9, 8, 7])]
df1 = spark.createDataFrame(v, ['id', 'vec'])
df2 = spark.createDataFrame([('d', [3, 2, 1])], ['id', 'vec'])

# matdf1/matdf2: distributed matrices built from the vector columns (one possible construction)
matdf1 = IndexedRowMatrix(df1.rdd.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0]['vec'])))
matdf2 = IndexedRowMatrix(df2.rdd.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0]['vec'])))

m1 = matdf1.toBlockMatrix(100, 100)
m2 = matdf2.toBlockMatrix(100, 100)
m1.multiply(m2.transpose())
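The product is itself a distributed BlockMatrix; if the result is small enough, it can be collected for inspection:
result = m1.multiply(m2.transpose())
print(result.toLocalMatrix())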
Since Python udfs are not very performant, it's probably better to implement it in Scala. But if you want a pure Python implementation, you can try the following hack.
A dot product is equivalent to a linear prediction. So you can leverage LinearRegressionModel.transform if you create one with the desired coefficients. You can do that by "estimating" a regression with the features as an identity matrix and the labels as your desired coefficients. See the following implementation:
import numpy as np

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.sql import Row, DataFrame


class DotProduct:
    _regressors_col = 'regressors'
    _dot_product_col = 'dot_product'
    _coef_col = 'coef'
    _index_col = 'index'

    def __init__(self, coefficients: np.ndarray):
        self._coefficients = coefficients

    def transform(self, dataframe: DataFrame) -> DataFrame:
        coef_df = self._convert_coefficients_to_dataframe()
        w_identity_df = self._add_identity_matrix_as_regressors(coef_df)
        linear_reg = LinearRegression(
            featuresCol=self._regressors_col,
            labelCol=self._coef_col,
            predictionCol=self._dot_product_col,
            fitIntercept=False,
            standardization=False
        )
        model = linear_reg.fit(w_identity_df)
        return model.transform(dataset=dataframe)

    def _convert_coefficients_to_dataframe(self) -> DataFrame:
        rows = [
            Row(**{self._index_col: index, self._coef_col: float(coef)})
            for index, coef in enumerate(self._coefficients)
        ]
        # spark_session is assumed to be an existing SparkSession
        return spark_session.createDataFrame(rows)

    def _add_identity_matrix_as_regressors(self, coefficients_df: DataFrame) -> DataFrame:
        encoder = OneHotEncoder(
            inputCols=[self._index_col],
            outputCols=[self._regressors_col],
            dropLast=False
        )
        model = encoder.fit(coefficients_df)
        return model.transform(coefficients_df)
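A rough usage sketch, assuming spark_session is an existing SparkSession and that the input rows already carry the 'regressors' vector column (built here with VectorAssembler):
from pyspark.ml.feature import VectorAssembler
data = spark_session.createDataFrame(
    [('a', 1.0, 2.0, 3.0), ('b', 4.0, 5.0, 6.0)], ['id', 'v1', 'v2', 'v3'])
data = VectorAssembler(inputCols=['v1', 'v2', 'v3'], outputCol='regressors').transform(data)
dot = DotProduct(coefficients=np.array([3.0, 2.0, 1.0]))
dot.transform(data).select('id', 'dot_product').show()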

Explode 2 columns (2 lists) in the same time in pyspark [duplicate]

I have a dataframe which has one row, and several columns. Some of the columns are single values, and others are lists. All list columns are the same length. I want to split each list column into a separate row, while keeping any non-list column as is.
Sample DF:
from pyspark import Row
from pyspark.sql import SQLContext
from pyspark.sql.functions import explode
sqlc = SQLContext(sc)
df = sqlc.createDataFrame([Row(a=1, b=[1,2,3],c=[7,8,9], d='foo')])
# +---+---------+---------+---+
# | a| b| c| d|
# +---+---------+---------+---+
# | 1|[1, 2, 3]|[7, 8, 9]|foo|
# +---+---------+---------+---+
What I want:
+---+---+----+------+
| a| b| c | d |
+---+---+----+------+
| 1| 1| 7 | foo |
| 1| 2| 8 | foo |
| 1| 3| 9 | foo |
+---+---+----+------+
If I only had one list column, this would be easy by just doing an explode:
df_exploded = df.withColumn('b', explode('b'))
# >>> df_exploded.show()
# +---+---+---------+---+
# | a| b| c| d|
# +---+---+---------+---+
# | 1| 1|[7, 8, 9]|foo|
# | 1| 2|[7, 8, 9]|foo|
# | 1| 3|[7, 8, 9]|foo|
# +---+---+---------+---+
However, if I try to also explode the c column, I end up with a dataframe with a length the square of what I want:
df_exploded_again = df_exploded.withColumn('c', explode('c'))
# >>> df_exploded_again.show()
# +---+---+---+---+
# | a| b| c| d|
# +---+---+---+---+
# | 1| 1| 7|foo|
# | 1| 1| 8|foo|
# | 1| 1| 9|foo|
# | 1| 2| 7|foo|
# | 1| 2| 8|foo|
# | 1| 2| 9|foo|
# | 1| 3| 7|foo|
# | 1| 3| 8|foo|
# | 1| 3| 9|foo|
# +---+---+---+---+
What I want is - for each column, take the nth element of the array in that column and add that to a new row. I've tried mapping an explode across all columns in the dataframe, but that doesn't seem to work either:
df_split = df.rdd.map(lambda col: df.withColumn(col, explode(col))).toDF()
Spark >= 2.4
You can replace the zip_ udf with the arrays_zip function:
from pyspark.sql.functions import arrays_zip, col, explode
(df
.withColumn("tmp", arrays_zip("b", "c"))
.withColumn("tmp", explode("tmp"))
.select("a", col("tmp.b"), col("tmp.c"), "d"))
Spark < 2.4
With DataFrames and UDF:
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, udf, explode
zip_ = udf(
lambda x, y: list(zip(x, y)),
ArrayType(StructType([
# Adjust types to reflect data types
StructField("first", IntegerType()),
StructField("second", IntegerType())
]))
)
(df
.withColumn("tmp", zip_("b", "c"))
# UDF output cannot be directly passed to explode
.withColumn("tmp", explode("tmp"))
.select("a", col("tmp.first").alias("b"), col("tmp.second").alias("c"), "d"))
With RDDs:
(df
.rdd
.flatMap(lambda row: [(row.a, b, c, row.d) for b, c in zip(row.b, row.c)])
.toDF(["a", "b", "c", "d"]))
Both solutions are inefficient due to Python communication overhead. If data size is fixed you can do something like this:
from functools import reduce
from pyspark.sql import DataFrame
# Length of array
n = 3
# For legacy Python you'll need a separate function
# in place of method accessor
reduce(
DataFrame.unionAll,
(df.select("a", col("b").getItem(i), col("c").getItem(i), "d")
for i in range(n))
).toDF("a", "b", "c", "d")
or even:
from pyspark.sql.functions import array, struct
# SQL level zip of arrays of known size
# followed by explode
tmp = explode(array(*[
struct(col("b").getItem(i).alias("b"), col("c").getItem(i).alias("c"))
for i in range(n)
]))
(df
.withColumn("tmp", tmp)
.select("a", col("tmp").getItem("b"), col("tmp").getItem("c"), "d"))
This should be significantly faster compared to UDF or RDD. Generalized to support an arbitrary number of columns:
# This uses keyword only arguments
# If you use legacy Python you'll have to change signature
# Body of the function can stay the same
def zip_and_explode(*colnames, n):
return explode(array(*[
struct(*[col(c).getItem(i).alias(c) for c in colnames])
for i in range(n)
]))
df.withColumn("tmp", zip_and_explode("b", "c", n=3))
You'd need to use flatMap, not map, as you want to make multiple output rows out of each input row.
from pyspark.sql import Row

def dualExplode(r):
    rowDict = r.asDict()
    bList = rowDict.pop('b')
    cList = rowDict.pop('c')
    for b, c in zip(bList, cList):
        newDict = dict(rowDict)
        newDict['b'] = b
        newDict['c'] = c
        yield Row(**newDict)

df_split = sqlContext.createDataFrame(df.rdd.flatMap(dualExplode))
One liner (for Spark >= 2.4.0):
df.withColumn("bc", arrays_zip("b", "c"))\
  .select("a", explode("bc").alias("tbc"))\
  .select("a", col("tbc.b"), col("tbc.c")).show()
Imports required:
from pyspark.sql.functions import arrays_zip, explode, col
Steps:
Create a column bc which is an arrays_zip of columns b and c.
Explode bc to get a struct tbc.
Select the required columns a, b and c (all exploded as required).
Output:
> df.withColumn("bc", arrays_zip("b", "c")).select("a", explode("bc").alias("tbc")).select("a", col("tbc.b"), col("tbc.c")).show()
+---+---+---+
| a| b| c|
+---+---+---+
| 1| 1| 7|
| 1| 2| 8|
| 1| 3| 9|
+---+---+---+

