Pyspark eval or expr - Concatenating multiple dataframe columns using when statement - python-3.x

I am trying to concatenate multiple dataframe columns, but I am not able to get pyspark eval or expr to work on the when statement below inside concat_ws.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import concat_ws,concat,when,col,expr
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("foo", "bar"), ("ba z", None)],
                           ('a', 'b'))
keys = ['a','b']
key_val = ''
for key in keys:
    key_val = key_val + 'when(df["{0}"].isNull(), lit("_")).otherwise(df["{0}"]),'.format(key)
key_val_exp = key_val.rsplit(',', 1)[0]
spaceDeleteUDF = udf(lambda s: str(s).replace(" ", "_").strip(), StringType())
df=df.withColumn("unique_id", spaceDeleteUDF(concat_ws("-",eval(key_val_exp))))
Error:
"TypeError: Invalid argument, not a string or column: (Column<b'CASE WHEN (a IS NULL) THEN _ ELSE a END'>, Column<b'CASE WHEN (b IS NULL) THEN _ ELSE b END'>) of type <class 'tuple'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function."
Expected output:
+----+----+---------+
|   a|   b|unique_id|
+----+----+---------+
| foo| bar|  foo-bar|
|ba z|null|   ba_z-_|
+----+----+---------+
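The error comes from eval returning a tuple of two Column objects, which concat_ws then receives as a single (invalid) argument. A minimal fix that keeps the eval approach is to unpack that tuple (a sketch; building the columns as a plain Python list is cleaner, as the answer below shows):
# unpack the tuple produced by eval so concat_ws sees individual columns
df = df.withColumn("unique_id", spaceDeleteUDF(concat_ws("-", *eval(key_val_exp))))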

Check this out:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("foo", "bar"), ("ba z", None)],
                           ('a', 'b'))
df.show()
# +----+----+
# |   a|   b|
# +----+----+
# | foo| bar|
# |ba z|null|
# +----+----+
df1 = df.select(
    *[F.col(column) for column in df.columns],
    *[F.when(F.col(column).isNull(), F.lit('_')).otherwise(F.col(column)).alias(column + '_mod')
      for column in df.columns]
)
df2 = df1.select(
    *[F.col(column) for column in df1.columns if '_mod' not in column],
    *[F.regexp_replace(column, r'\s', '_').alias(column)
      for column in df1.columns if '_mod' in column]
)
df3 = df2.select(
    *[F.col(column) for column in df1.columns if '_mod' not in column],
    F.concat_ws('-', *[F.col(column) for column in df2.columns if '_mod' in column]).alias('unique_id')
)
df3.show()
# +----+----+---------+
# |   a|   b|unique_id|
# +----+----+---------+
# | foo| bar|  foo-bar|
# |ba z|null|   ba_z-_|
# +----+----+---------+
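The same unique_id can also be built in a single pass, without the intermediate _mod columns; a compact sketch using a list comprehension over df.columns:
from pyspark.sql import functions as F

# replace nulls with '_', replace whitespace with '_', then join with '-'
unique_id = F.concat_ws('-', *[
    F.regexp_replace(F.when(F.col(c).isNull(), F.lit('_')).otherwise(F.col(c)), r'\s', '_')
    for c in df.columns
])
df.withColumn('unique_id', unique_id).show()
# +----+----+---------+
# |   a|   b|unique_id|
# +----+----+---------+
# | foo| bar|  foo-bar|
# |ba z|null|   ba_z-_|
# +----+----+---------+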

Related

Chain several WHEN conditions in a scalable way in PySpark

I have a dictionary (variable pats) containing many when arguments: conditions (regex patterns) and the values to assign.
from pyspark.sql import functions as F
df = spark.createDataFrame([("ė",), ("2",), ("",), ("#",)], ["col1"])
pats = {
    r"^\d$": "digit",
    r"^\p{L}$": "letter",
    r"^[\p{P}\p{S}]$": "spec_char",
    r"^$": "empty"
}
whens = (
    F.when(F.col("col1").rlike(list(pats.keys())[0]), pats[list(pats.keys())[0]])
    .when(F.col("col1").rlike(list(pats.keys())[1]), pats[list(pats.keys())[1]])
    .when(F.col("col1").rlike(list(pats.keys())[2]), pats[list(pats.keys())[2]])
    .when(F.col("col1").rlike(list(pats.keys())[3]), pats[list(pats.keys())[3]])
    .otherwise(F.col("col1"))
)
df = df.withColumn("col2", whens)
df.show()
# +----+---------+
# |col1|     col2|
# +----+---------+
# |   ė|   letter|
# |   2|    digit|
# |    |    empty|
# |   #|spec_char|
# +----+---------+
I'm looking for a scalable way to chain all the when conditions, so I wouldn't need to write a line for every key.
Without reduce
# Start from the functions module: the first .when is F.when,
# every later .when chains on the Column it returned.
whens = F
for k, v in pats.items():
    whens = whens.when(F.col("col1").rlike(k), v)
whens = whens.otherwise(F.col("col1"))
Full code:
from pyspark.sql import functions as F
df = spark.createDataFrame([("ė",), ("2",), ("",), ("#",)], ["col1"])
pats = {
    r"^\d$": "digit",
    r"^\p{L}$": "letter",
    r"^[\p{P}\p{S}]$": "spec_char",
    r"^$": "empty"
}
whens = F
for k, v in pats.items():
    whens = whens.when(F.col("col1").rlike(k), v)
whens = whens.otherwise(F.col("col1"))
df = df.withColumn("col2", whens)
df.show()
# +----+---------+
# |col1|     col2|
# +----+---------+
# |   ė|   letter|
# |   2|    digit|
# |    |    empty|
# |   #|spec_char|
# +----+---------+
Using reduce
from functools import reduce
whens = reduce(
    lambda acc, p: acc.when(F.col("col1").rlike(p), pats[p]),
    pats.keys(),
    F
).otherwise(F.col("col1"))
Full code:
from pyspark.sql import functions as F
from functools import reduce
df = spark.createDataFrame([("ė",), ("2",), ("",), ("#",)], ["col1"])
pats = {
    r"^\d$": "digit",
    r"^\p{L}$": "letter",
    r"^[\p{P}\p{S}]$": "spec_char",
    r"^$": "empty"
}
whens = reduce(
    lambda acc, p: acc.when(F.col("col1").rlike(p), pats[p]),
    pats.keys(),
    F
).otherwise(F.col("col1"))
df = df.withColumn("col2", whens)
df.show()
# +----+---------+
# |col1|     col2|
# +----+---------+
# |   ė|   letter|
# |   2|    digit|
# |    |    empty|
# |   #|spec_char|
# +----+---------+
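If the same chaining is needed for several columns, the reduce version is easy to wrap in a small helper (the name chained_when is just for illustration):
from functools import reduce
from pyspark.sql import functions as F

def chained_when(col_name, mapping):
    # Build one chained CASE WHEN column from a {pattern: label} dict,
    # falling back to the original column value.
    whens = reduce(lambda acc, p: acc.when(F.col(col_name).rlike(p), mapping[p]), mapping, F)
    return whens.otherwise(F.col(col_name))

# usage: df = df.withColumn("col2", chained_when("col1", pats))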

Efficient way to add UUID in pyspark [duplicate]

This question was closed as a duplicate of "Pyspark add sequential and deterministic index to dataframe".
I have a DataFrame to which I want to add a column of distinct uuid4() values. My code:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StringType
from uuid import uuid4
spark_session = SparkSession.builder.getOrCreate()
df = spark_session.createDataFrame([
    [1, 1, 'teste'],
    [2, 2, 'teste'],
    [3, 0, 'teste'],
    [4, 5, 'teste'],
], list('abc'))
df = df.withColumn("_tmp", f.lit(1))
uuids = [str(uuid4()) for _ in range(df.count())]
df1 = spark_session.createDataFrame(uuids, StringType())
df1 = df1.withColumn("_tmp", f.lit(1))
df2 = df.join(df1, "_tmp", "inner").drop("_tmp")
df2.show()
But I've got this ERROR:
Py4JJavaError: An error occurred while calling o1571.showString.
: org.apache.spark.sql.AnalysisException: Detected implicit cartesian product for INNER join between logical plans
I already tried aliasing and using monotonically_increasing_id as the join column, but I see
here that I cannot trust monotonically_increasing_id as a merge column.
I'm expecting:
+---+---+-----+------+
|  a|  b|    c| value|
+---+---+-----+------+
|  1|  1|teste| uuid4|
|  2|  2|teste| uuid4|
|  3|  0|teste| uuid4|
|  4|  5|teste| uuid4|
+---+---+-----+------+
What's the correct approach in this case?
I used row_number as @Tetlanesh suggested. I had to create an ID column to ensure that row_number counts every row of the Window.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from uuid import uuid4
from pyspark.sql.window import Window
from pyspark.sql.types import StringType
from pyspark.sql.functions import row_number
spark_session = SparkSession.builder.getOrCreate()
df = spark_session.createDataFrame([
    [1, 1, 'teste'],
    [1, 2, 'teste'],
    [2, 0, 'teste'],
    [2, 5, 'teste'],
], list('abc'))
df = df.alias("_tmp")
df.registerTempTable("_tmp")
df2 = spark_session.sql("select *, uuid() as uuid from _tmp")
df2.show()
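The same built-in uuid() expression also works directly through withColumn, so the temp table is optional (a sketch, assuming Spark 2.3+ where the SQL uuid() function is available):
from pyspark.sql import functions as f

# uuid() generates a random UUID string per row
df_with_uuid = df.withColumn("uuid", f.expr("uuid()"))
df_with_uuid.show(truncate=False)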
Another approach is using windows, but it's not as efficient as the first one, since a Window.orderBy without a partitionBy moves all rows into a single partition:
df = df.withColumn("_id", f.lit(1))
df = df.withColumn("_tmp", row_number().over(Window.orderBy('_id')))
uuids = [(str(uuid4()), 1) for _ in range(df.count())]
df1 = spark_session.createDataFrame(uuids, ['uuid', '_id'])
df1 = df1.withColumn("_tmp", row_number().over(Window.orderBy('_id')))
df2 = df.join(df1, "_tmp", "inner").drop('_id')
df2.show()
Both approaches output:
+---+---+-----+------+
|  a|  b|    c|  uuid|
+---+---+-----+------+
|  1|  1|teste| uuid4|
|  2|  2|teste| uuid4|
|  3|  0|teste| uuid4|
|  4|  5|teste| uuid4|
+---+---+-----+------+

How to remove words that have less than three letters in PySpark?

I have a 'text' column in which arrays of tokens are stored. How to filter all these arrays so that the tokens are at least three letters long?
from pyspark.sql.functions import regexp_replace, col
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.getOrCreate()
columns = ['id', 'text']
vals = [
    (1, ['I', 'am', 'good']),
    (2, ['You', 'are', 'ok']),
]
df = spark.createDataFrame(vals, columns)
df.show()
# Had tried this but have TypeError: Column is not iterable
# df_clean = df.select('id', regexp_replace('text', [len(word) >= 3 for word
# in col('text')], ''))
# df_clean.show()
I expect to see:
id | text
1 | [good]
2 | [You, are]
This does it. You can decide whether to exclude the row or not; I added an extra column and then filtered it out, but the options are yours:
from pyspark.sql import functions as f
columns = ['id', 'text']
vals = [
    (1, ['I', 'am', 'good']),
    (2, ['You', 'are', 'ok']),
    (3, ['ok'])
]
df = spark.createDataFrame(vals, columns)
#df.show()
df2 = df.withColumn("text_left_over", f.expr("filter(text, x -> not(length(x) < 3))"))
df2.show()
# This is the actual piece of logic you are looking for.
df3 = df.withColumn("text_left_over", f.expr("filter(text, x -> not(length(x) < 3))")).where(f.size(f.col("text_left_over")) > 0).drop("text")
df3.show()
returns:
+---+--------------+--------------+
| id|          text|text_left_over|
+---+--------------+--------------+
|  1| [I, am, good]|        [good]|
|  2|[You, are, ok]|    [You, are]|
|  3|          [ok]|            []|
+---+--------------+--------------+
+---+--------------+
| id|text_left_over|
+---+--------------+
|  1|        [good]|
|  2|    [You, are]|
+---+--------------+
This is the solution:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType

filter_length_udf = udf(lambda row: [x for x in row if len(x) >= 3], ArrayType(StringType()))
df_final_words = df_stemmed.withColumn('words_filtered', filter_length_udf(col('words')))
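On Spark 3.1+ the same filtering can be done without a UDF, using the native higher-order filter function from pyspark.sql.functions (a sketch against the question's df, keeping only tokens of at least three letters):
from pyspark.sql import functions as F

# keep only array elements whose length is >= 3
df_clean = df.withColumn('text', F.filter('text', lambda x: F.length(x) >= 3))
df_clean.show()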

How to change case of whole pyspark dataframe to lower or upper

I am trying to apply the pyspark sql functions hash algorithm to every row of two dataframes to identify the differences. The hash algorithm is case sensitive, i.e. if a column contains 'APPLE' and 'Apple' they are considered two different values, so I want to change the case of both dataframes to either upper or lower. I am able to achieve this only for the dataframe headers but not for the dataframe values. Please help.
#Code for Dataframe column headers
self.df_db1 =self.df_db1.toDF(*[c.lower() for c in self.df_db1.columns])
Assuming df is your dataframe, this should do the trick:
from pyspark.sql import functions as F
for col in df.columns:
    df = df.withColumn(col, F.lower(F.col(col)))
Both answers seem to be OK, with one exception: if you have a numeric column, it will be converted to a string column. To avoid this, try:
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
val fields = df.schema.fields
val stringFields = df.schema.fields.filter(f => f.dataType == StringType)
val nonStringFields = df.schema.fields.filter(f => f.dataType != StringType).map(f => f.name).map(f => col(f))
val stringFieldsTransformed = stringFields.map(f => f.name).map(f => upper(col(f)).as(f))
val df = sourceDF.select(stringFieldsTransformed ++ nonStringFields: _*)
Now the types are also correct when you have non-string fields, i.e. numeric fields.
If you know that each column is of String type, use one of the other answers - they are correct in that case :)
Python code in PySpark:
from pyspark.sql.functions import *
from pyspark.sql.types import *
sourceDF = spark.createDataFrame([(1, "a")], ['n', 'n1'])
fields = sourceDF.schema.fields
stringFields = filter(lambda f: isinstance(f.dataType, StringType), fields)
nonStringFields = map(lambda f: col(f.name), filter(lambda f: not isinstance(f.dataType, StringType), fields))
stringFieldsTransformed = map(lambda f: upper(col(f.name)), stringFields)
allFields = [*stringFieldsTransformed, *nonStringFields]
df = sourceDF.select(allFields)
You can generate an expression using list comprehension:
from pyspark.sql import functions as psf
expression = [ psf.lower(psf.col(x)).alias(x) for x in df.columns ]
And then just call it over your existing dataframe
>>> df.show()
+---+---+---+---+
| c1| c2| c3| c4|
+---+---+---+---+
|  A|  B|  C|  D|
+---+---+---+---+
>>> df.select(*expression).show()
+---+---+---+---+
| c1| c2| c3| c4|
+---+---+---+---+
|  a|  b|  c|  d|
+---+---+---+---+
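Coming back to the original goal of hashing rows to diff two dataframes: a small sketch (the helper name normalized_hash is made up here) that lower-cases only the string columns, leaves other types untouched, and then hashes every column so the comparison is case-insensitive:
from pyspark.sql import functions as F

def normalized_hash(df):
    # lower-case string columns only, keep other types as they are
    lowered = df.select([
        F.lower(F.col(c)).alias(c) if t == 'string' else F.col(c)
        for c, t in df.dtypes
    ])
    # hash across all columns for a row-level, case-insensitive comparison
    return lowered.withColumn("row_hash", F.hash(*lowered.columns))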

How to explode a column which is of ArrayType in spark dataframe which contains nulls and empty arrays.

I have a dataframe made up of the following data:
val df = List(
  (1, "wwe", List(1, 2, 3)),
  (2, "dsad", List.empty),
  (3, "dfd", null)).toDF("id", "name", "value")
df.show
+---+----+---------+
| id|name|    value|
+---+----+---------+
|  1| wwe|[1, 2, 3]|
|  2|dsad|       []|
|  3| dfd|     null|
+---+----+---------+
In order to explode the array column values I used the following logic:
def explodeWithNull(f: StructField): Column = {
  explode(
    when(
      col(f.name).isNotNull, col(f.name)
    ).otherwise(
      f.dataType.asInstanceOf[ArrayType].elementType match {
        case StringType => array(lit(""))
        case DoubleType => array(lit(0.0))
        case IntegerType => array(lit(0))
        case _ => array(lit(""))
      }
    )
  )
}
def explodeAllArraysColumns(dataframe: DataFrame): DataFrame = {
  val schema: StructType = dataframe.schema
  val arrayFields: Seq[StructField] = schema.filter(f => f.dataType.typeName == "array")
  arrayFields.foldLeft(dataframe) {
    (df: DataFrame, f: StructField) => df.withColumn(f.name, explodeWithNull(f))
  }
}
explodeAllArraysColumns(df).show
+---+----+-----+
| id|name|value|
+---+----+-----+
|  1| wwe|    1|
|  1| wwe|    2|
|  1| wwe|    3|
|  3| dfd|    0|
+---+----+-----+
Exploding this way, I'm missing the row that has an empty array in df. Ideally I don't want to miss that row; I either want a null or a default value for that column in the exploded dataframe. How can I achieve this?
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql import Row
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import *
from functools import reduce
def explode_outer(df, columns_to_explode):
    array_fields = dict([(field.name, field.dataType)
                         for field in df.schema.fields
                         if type(field.dataType) == ArrayType])
    return reduce(lambda df_with_explode, column:
                  df_with_explode.withColumn(column, explode(
                      when(size(df_with_explode[column]) != 0, df_with_explode[column])
                      .otherwise(array(lit(None).cast(array_fields[column].elementType))))),
                  columns_to_explode, df)
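Worth noting: PySpark itself ships an explode_outer function (in pyspark.sql.functions since Spark 2.3) that keeps rows whose array is null or empty by producing a null value, which is the behaviour asked for; the helper above reuses that name, so the built-in is shadowed after the def. A minimal sketch on a dataframe shaped like the question's (id, name, value):
from pyspark.sql import functions as F

# rows where 'value' is null or [] are kept, with value = null
exploded = df.withColumn("value", F.explode_outer("value"))
exploded.show()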
from pyspark.sql.functions import *
def flatten_df(nested_df):
    flat_cols = [c[0] for c in nested_df.dtypes if c[1][:6] != 'struct']
    nested_cols = [c[0] for c in nested_df.dtypes if c[1][:6] == 'struct']
    flat_df = nested_df.select(flat_cols +
                               [col(nc + '.' + c).alias(nc + '_' + c)
                                for nc in nested_cols
                                for c in nested_df.select(nc + '.*').columns])
    print("flatten_df_count :", flat_df.count())
    return flat_df

def explode_df(nested_df):
    flat_cols = [c[0] for c in nested_df.dtypes if c[1][:6] != 'struct' and c[1][:5] != 'array']
    array_cols = [c[0] for c in nested_df.dtypes if c[1][:5] == 'array']
    for array_col in array_cols:
        schema = nested_df.select(array_col).dtypes[0][1]
        nested_df = nested_df.withColumn(array_col, when(col(array_col).isNotNull(), col(array_col)).otherwise(array(lit(None)).cast(schema)))
    nested_df = nested_df.withColumn("tmp", arrays_zip(*array_cols)).withColumn("tmp", explode("tmp")).select([col("tmp." + c).alias(c) for c in array_cols] + flat_cols)
    print("explode_dfs_count :", nested_df.count())
    return nested_df
new_df = flatten_df(myDf)
while True:
    array_cols = [c[0] for c in new_df.dtypes if c[1][:5] == 'array']
    if len(array_cols):
        new_df = flatten_df(explode_df(new_df))
    else:
        break
new_df.printSchema()
I used arrays_zip and explode to solve this issue.
