How to find count of Null and Nan values for each column in a PySpark dataframe efficiently? - apache-spark

import numpy as np
data = [
(1, 1, None),
(1, 2, float(5)),
(1, 3, np.nan),
(1, 4, None),
(1, 5, float(10)),
(1, 6, float("nan")),
(1, 6, float("nan")),
]
df = spark.createDataFrame(data, ("session", "timestamp1", "id2"))
Expected output
dataframe with count of nan/null for each column
Note:
The previous questions I found on Stack Overflow only check for null, not NaN.
That's why I have created a new question.
I know I can use isnull() function in Spark to find number of Null values in Spark column but how to find Nan values in Spark dataframe?

You can use the method shown here and replace isNull with isnan:
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()
+-------+----------+---+
|session|timestamp1|id2|
+-------+----------+---+
| 0| 0| 3|
+-------+----------+---+
or
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()
+-------+----------+---+
|session|timestamp1|id2|
+-------+----------+---+
| 0| 0| 5|
+-------+----------+---+

For null values in the dataframe of pyspark
Dict_Null = {col:df.filter(df[col].isNull()).count() for col in df.columns}
Dict_Null
# The output in dict where key is column name and value is null values in that column
{'#': 0,
'Name': 0,
'Type 1': 0,
'Type 2': 386,
'Total': 0,
'HP': 0,
'Attack': 0,
'Defense': 0,
'Sp_Atk': 0,
'Sp_Def': 0,
'Speed': 0,
'Generation': 0,
'Legendary': 0}

To make sure it does not fail for string, date and timestamp columns:
import pyspark.sql.functions as F
def count_missings(spark_df,sort=True):
    """
    Tally the null and NaN entries of each column of a Spark DataFrame.

    Columns typed 'timestamp', 'string' or 'date' are left out of the tally.
    The counts are collected into a pandas DataFrame. When ``sort`` is truthy
    the result is transposed (one row per column, under a 'count' header) and
    ordered by descending count; otherwise the single-row frame is returned.
    If the collected frame has no rows, a message is printed and None is
    returned.
    """
    skipped_types = ('timestamp', 'string', 'date')
    missing_exprs = []
    for name, dtype in spark_df.dtypes:
        if dtype in skipped_types:
            continue
        is_missing = F.isnan(name) | F.isnull(name)
        # Alias each aggregate back to the column's own name.
        missing_exprs.append(F.count(F.when(is_missing, name)).alias(name))
    counts = spark_df.select(missing_exprs).toPandas()
    if len(counts) == 0:
        print("There are no any missing values!")
        return None
    if not sort:
        return counts
    by_column = counts.rename(index={0: 'count'}).T
    return by_column.sort_values("count",ascending=False)
If you want to see the columns sorted based on the number of nans and nulls in descending:
count_missings(spark_df)
# | Col_A | 10 |
# | Col_C | 2 |
# | Col_B | 1 |
If you don't want ordering and see them as a single row:
count_missings(spark_df, False)
# | Col_A | Col_B | Col_C |
# | 10 | 1 | 2 |

An alternative to the already provided ways is to simply filter on the column like so
import pyspark.sql.functions as F
df = df.where(F.col('columnNameHere').isNull())
This has the added benefit that you don't have to add another column to do the filtering and it's quick on larger data sets.

Here is my one liner.
Here 'c' is the name of the column
# `isnull` (lower-case) is the function exported by pyspark.sql.functions;
# `isNull` is a Column method. The expression below also needs the `F` alias.
from pyspark.sql.functions import isnan, when, count, col, isnull
import pyspark.sql.functions as F
df.select('c').withColumn('isNull_c',F.col('c').isNull()).where('isNull_c = True').count()

I prefer this solution:
df = spark.table(selected_table).filter(condition)
counter = df.count()
df = df.select([(counter - count(c)).alias(c) for c in df.columns])

Use the following code to identify the null values in every column using pyspark.
def check_nulls(dataframe):
    '''
    Count the null entries of every column and return them as a pandas DataFrame.
    INPUT: Spark Dataframe
    OUTPUT: pandas DataFrame with one row per input column and a single
            'Null Values' column holding the counts
    '''
    column_names = dataframe.columns
    # One aggregate per column, aliased back to the column's own name.
    null_counters = [count(when(isnull(name), name)).alias(name) for name in column_names]
    collected = dataframe.select(null_counters).collect()
    # Rows come back wide (one field per column); transpose to one row per column.
    nulls_check = pd.DataFrame(collected, columns=column_names).transpose()
    nulls_check.columns = ['Null Values']
    return nulls_check
#Check null values
null_df = check_nulls(raw_df)
null_df

from pyspark.sql import DataFrame
import pyspark.sql.functions as fn
# compatible with fn.isnan. Sourced from
# https://github.com/apache/spark/blob/13fd272cd3/python/pyspark/sql/functions.py#L4818-L4836
NUMERIC_DTYPES = (
    'decimal',
    'double',
    'float',
    'int',
    'bigint',
    'smallint',
    'tinyint',
)


def count_nulls(df: DataFrame) -> DataFrame:
    """Count the missing values of every column of ``df``.

    Columns whose base type is listed in NUMERIC_DTYPES are counted as
    missing when they are null or NaN; every other column is counted on
    null alone, since fn.isnan is not applied to it.

    Args:
        df: the Spark DataFrame to inspect.

    Returns:
        A DataFrame produced by a single select of count aggregates, one per
        input column, aliased to the column name and listed in the same
        order as ``df.dtypes``.
    """
    def _missing_condition(name: str, dtype: str):
        # Compare the base type name exactly, dropping any "(precision,scale)"
        # suffix such as in "decimal(10,2)". A prefix test would also let
        # "interval..." types through via "int".
        if dtype.split('(', 1)[0] in NUMERIC_DTYPES:
            return fn.isnan(name) | fn.isnull(name)
        return fn.isnull(name)

    # Iterate df.dtypes directly so the output keeps the input column order.
    return df.select(
        [fn.count(fn.when(_missing_condition(c, t), c)).alias(c) for (c, t) in df.dtypes]
    )
Builds off of gench and user8183279's answers, but checks via only isnull for columns where isnan is not possible, rather than just ignoring them.
The source code of pyspark.sql.functions seemed to have the only documentation I could really find enumerating these names — if others know of some public docs I'd be delighted.

if you are writing spark sql, then the following will also work to find null value and count subsequently.
spark.sql('select * from table where isNULL(column_value)')

Yet another alternative (improved upon Vamsi Krishna's solutions above):
def check_for_null_or_nan(df):
    """Print one line per column of ``df`` that contains NaN or null values.

    Each printed line reads '<column> has <count> nans/nulls'. Columns whose
    count is zero are omitted. Nothing is returned.
    """
    report = []
    for name in df.columns:
        # Evaluate the count a single time and reuse it for both the
        # zero test and the message.
        missing = df.filter(isnan(name) | isnull(name)).count()
        if missing != 0:
            report.append(f'{name} has {missing} nans/nulls')
    print(*report, sep='\n')
check_for_null_or_nan(df)
id2 has 5 nans/nulls

Here is a readable solution because code is for people as much as computers ;-)
# Replace <col_name> with the name of the column to inspect.
df.selectExpr('sum(int(isnull(<col_name>) or isnan(<col_name>))) as null_or_nan_count')

Related

transform function in pyspark

I was reading the official documentation of PySpark API reference for dataframe and below code snippet for transform function over a dataframe have me confused. I can't figure out why * is placed before sorted function in sort_columns_asc function defined below
from pyspark.sql.functions import col
df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
# Select every column of input_df, each cast to "int".
def cast_all_to_int(input_df):
return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
# Select the columns of input_df in sorted (ascending) name order.
def sort_columns_asc(input_df):
return input_df.select(*sorted(input_df.columns))
df.transform(cast_all_to_int).transform(sort_columns_asc).show()
+-----+---+
|float|int|
+-----+---+
| 1| 1|
| 2| 2|
+-----+---+
Please help me clarify the confusion.
The * operator unpacks a collection: each of its elements is passed to the function as a separate positional argument, which removes one level of nesting.
# 1D Array
collection1 = [1,2,3,4]
print(*collection1)
1 2 3 4
# 2D Array
collection2 = [[1,2,3,4]]
print(*collection2)
[1, 2, 3, 4]
In your example you are unpacking the names of the column names from
example = ["int", "float"]
to
print(*sorted(example))
float int
Check out this for further information.

How to concat two ArrayType(StringType()) columns element-wise in Pyspark?

I have two ArrayType(StringType()) columns in a spark dataframe, and I want to concatenate the two columns element-wise:
input:
+-------------+-------------+
|col1 |col2 |
+-------------+-------------+
|['a','b'] |['c','d'] |
|['a','b','c']|['e','f','g']|
+-------------+-------------+
output:
+-------------+-------------+----------------+
|col1 |col2 |col3 |
+-------------+-------------+----------------+
|['a','b'] |['c','d'] |['ac', 'bd'] |
|['a','b','c']|['e','f','g']|['ae','bf','cg']|
+-------------+-------------+----------------+
Thanks.
For Spark 2.4+, you can use zip_with function:
zip_with(left, right, func) - Merges the two given arrays,
element-wise, into a single array using function
df.withColumn("col3", expr("zip_with(col1, col2, (x, y) -> concat(x, y))")).show()
#+------+------+--------+
#| col1| col2| col3|
#+------+------+--------+
#|[a, b]|[c, d]|[ac, bd]|
#+------+------+--------+
Another way using transform function like this:
df.withColumn("col3", expr("transform(col1, (x, i) -> concat(x, col2[i]))"))
The transform function takes as parameters the first array column col1, iterates over its elements and applies a lambda function (x, i) -> concat(x, col2[i]) where x the actual element and i its index used to get the corresponding element from array col2.
Here is an alternative answer that can be used for the updated non-original question. Uses array and array_except to demonstrate the use of such methods. The accepted answer is more elegant.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Arbitrary max number of elements to apply array over, need not broadcast such a small amount of data afaik.
max_entries = 5
# Gen in this case numeric data, etc. 3 rows with 2 arrays of varying length,but per row constant length.
dfA = spark.createDataFrame([ ( list([x,x+1,4, x+100]), 4, list([x+100,x+200,999,x+500]) ) for x in range(3)], ['array1', 'value1', 'array2'] ).withColumn("s",size(col("array1")))
dfB = spark.createDataFrame([ ( list([x,x+1]), 4, list([x+100,x+200]) ) for x in range(5)], ['array1', 'value1', 'array2'] ).withColumn("s",size(col("array1")))
df = dfA.union(dfB)
# concat the array elements which are variable in size up to a max amount.
df2 = df.select(( [concat(col("array1")[i], col("array2")[i]) for i in range(max_entries)]))
df3 = df2.withColumn("res", array(df2.schema.names))
# Get results but strip out null entires from array.
df3.select(array_except(df3.res, array(lit(None)))).show(truncate=False)
Could not get the s value of column to be passed into range.
This returns:
+------------------------------+
|array_except(res, array(NULL))|
+------------------------------+
|[0100, 1200, 4999, 100500] |
|[1101, 2201, 4999, 101501] |
|[2102, 3202, 4999, 102502] |
|[0100, 1200] |
|[1101, 2201] |
|[2102, 3202] |
|[3103, 4203] |
|[4104, 5204] |
+------------------------------+
It wouldn't really scale, but you could get the 0th and 1st entries in each array and then say col3 is a[0] + b[0] and then a[1] + b[1].
Make all 4 entries separate values and then output them combined.
Here is a generic answer. Just look at res for the result. 2 equally sized arrays, thus n elements for both.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Gen in this case numeric data, etc. 3 rows with 2 arrays of varying length, but both the same length as in your example
df = spark.createDataFrame([ ( list([x,x+1,4, x+100]), 4, list([x+100,x+200,999,x+500]) ) for x in range(3)], ['array1', 'value1', 'array2'] )
num_array_elements = len(df.select("array1").first()[0])
# concat
df2 = df.select(([ concat(col("array1")[i], col("array2")[i]) for i in range(num_array_elements)]))
df2.withColumn("res", array(df2.schema.names)).show(truncate=False)
returns:

Building derived column using Spark transformations

I got a table record as stated below.
Id Indicator Date
1 R 2018-01-20
1 R 2018-10-21
1 P 2019-01-22
2 R 2018-02-28
2 P 2018-05-22
2 P 2019-03-05
I need to pick the Ids that had more than two R indicator in the last one year and derive a new column called Marked_Flag as Y otherwise N. So the expected output should look like below,
Id Marked_Flag
1 Y
2 N
So what I did so far, I took the records in a dataset and then again build another dataset from that. The code looks like below.
Dataset<row> getIndicators = spark.sql("select id, count(indicator) as indi_count from source group by id having indicator = 'R'");
Dataset<row>getFlag = spark.sql("select id, case when indi_count > 1 then 'Y' else 'N' end as Marked_Flag" from getIndicators");
But my lead wants this to be done using a single dataset and using Spark transformations. I am pretty new to Spark, so any guidance or code snippet in this regard would be highly helpful.
Created two Datasets one to get the aggregation and another used the aggregated value to derive the new column.
Dataset<row> getIndicators = spark.sql("select id, count(indicator) as indi_count from source group by id having indicator = 'R'");
Dataset<row>getFlag = spark.sql("select id, case when indi_count > 1 then 'Y' else 'N' end as Marked_Flag" from getIndicators");
Input
Expected output
Try out the following. Note that I am using pyspark DataFrame here
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([
[1, "R", "2018-01-20"],
[1, "R", "2018-10-21"],
[1, "P", "2019-01-22"],
[2, "R", "2018-02-28"],
[2, "P", "2018-05-22"],
[2, "P", "2019-03-05"]], ["Id", "Indicator","Date"])
# Count the "R" rows per Id without filtering first, so that Ids with no "R"
# rows at all still appear in the result (flagged "N") instead of being dropped.
r_count = F.sum(F.when(F.col("Indicator") == "R", 1).otherwise(0))
gr = df.groupBy("Id").agg(r_count.alias("r_count"))
gr = gr.withColumn("Marked_Flag", F.when(F.col("r_count") > 1, "Y").otherwise('N')).drop("r_count")
gr.show()
# +---+-----------+
# | Id|Marked_Flag|
# +---+-----------+
# | 1| Y|
# | 2| N|
# +---+-----------+
#

How to detect null column in pyspark

I have a dataframe defined with some null values. Some Columns are fully null values.
>> df.show()
+---+---+---+----+
| A| B| C| D|
+---+---+---+----+
|1.0|4.0|7.0|null|
|2.0|5.0|7.0|null|
|3.0|6.0|5.0|null|
+---+---+---+----+
In my case, I want to return a list of the column names that are filled with null values. My idea was to detect the constant columns (as the whole column contains the same null value).
this is how I did it:
nullCoulumns = [c for c, const in df.select([(min(c) == max(c)).alias(c) for c in df.columns]).first().asDict().items() if const]
but this does not consider null columns as constant; it works only with values.
How should I then do it ?
Extend the condition to
from pyspark.sql.functions import min, max
((min(c).isNull() & max(c).isNull()) | (min(c) == max(c))).alias(c)
or use eqNullSafe (PySpark 2.3):
(min(c).eqNullSafe(max(c))).alias(c)
One way would be to do it implicitly: select each column, count its NULL values, and then compare this with the total number of rows. With your data, this would be:
spark.version
# u'2.2.0'
from pyspark.sql.functions import col
nullColumns = []
numRows = df.count()
for k in df.columns:
nullRows = df.where(col(k).isNull()).count()
if nullRows == numRows: # i.e. if ALL values are NULL
nullColumns.append(k)
nullColumns
# ['D']
But there is a simpler way: it turns out that the function countDistinct, when applied to a column with all NULL values, returns zero (0):
from pyspark.sql.functions import countDistinct
df.agg(countDistinct(df.D).alias('distinct')).collect()
# [Row(distinct=0)]
So the for loop now can be:
nullColumns = []
for k in df.columns:
if df.agg(countDistinct(df[k])).collect()[0][0] == 0:
nullColumns.append(k)
nullColumns
# ['D']
UPDATE (after comments): It seems possible to avoid collect in the second solution; since df.agg returns a dataframe with only one row, replacing collect with take(1) will safely do the job:
nullColumns = []
for k in df.columns:
if df.agg(countDistinct(df[k])).take(1)[0][0] == 0:
nullColumns.append(k)
nullColumns
# ['D']
How about this? In order to guarantee that a column is all nulls, two properties must be satisfied:
(1) The min value is equal to the max value
(2) The min or max is null
Or, equivalently
(1) The min AND max are both equal to None
Note that if property (2) is not satisfied, the case where column values are [null, 1, null, 1] would be incorrectly reported since the min and max will be 1.
import pyspark.sql.functions as F
def get_null_column_names(df):
    """Return the names of the columns of ``df`` that contain only nulls.

    A column is reported when both its minimum and its maximum are None.

    Args:
        df: the Spark DataFrame to inspect.

    Returns:
        A list of column names, in the order they appear in ``df.columns``.
    """
    col_names = df.columns
    if not col_names:
        # Nothing to aggregate; avoid issuing a select with no expressions.
        return []
    # Gather every min and max in one select so a single query covers all
    # columns, rather than two queries per column.
    aggregates = []
    for col_name in col_names:
        aggregates.append(F.min(col_name))
        aggregates.append(F.max(col_name))
    extremes = df.select(aggregates).first()
    # Aggregates were appended as (min, max) pairs, so column i sits at
    # positions 2*i and 2*i + 1 of the result row.
    return [
        col_name
        for i, col_name in enumerate(col_names)
        if extremes[2 * i] is None and extremes[2 * i + 1] is None
    ]
Here's an example in practice:
>>> rows = [(None, 18, None, None),
(1, None, None, None),
(1, 9, 4.0, None),
(None, 0, 0., None)]
>>> schema = "a: int, b: int, c: float, d:int"
>>> df = spark.createDataFrame(data=rows, schema=schema)
>>> df.show()
+----+----+----+----+
| a| b| c| d|
+----+----+----+----+
|null| 18|null|null|
| 1|null|null|null|
| 1| 9| 4.0|null|
|null| 0| 0.0|null|
+----+----+----+----+
>>> get_null_column_names(df)
['d']

SparkSQL — collect_set and sort_array does not sort integer column properly

I want to generate a sorted, collected set in SparkSQL, like so:
spark.sql("SELECT id, col_2, sort_array(collect_set(value)) AS collected
FROM my_table GROUP BY id, col_2").show()
where value is an integer.
But it fails to sort the array in proper numeric order — and does something rather ad hoc (sort on beginning of the first number in the value instead? Is sort_array operating on a string?).
So instead of:
+----+-------+------------+
| id | col_2 | collected |
+----+-------+------------+
| 1 | 2 | [456,1234]|
+----+-------+------------+
I get:
+----+-------+------------+
| id | col_2 | collected |
+----+-------+------------+
| 1 | 2 | [1234,456]|
+----+-------+------------+
EDIT:
Looking at what spark.sql(…) returns it is obvious that this query returns strings instead:
DataFrame[id: string, col_2: string, collected: array<string>]
How can that be when the original dataframe is all integers.
EDIT 2:
This seems to be a problem related to pyspark, as I'm not experiencing the problem with spark-shell and writing the same stuff in scala
I tested with Apache Spark 2.0.0.
It works for me. To make sure I tested with data [(1, 2, 1234), (1, 2, 456)] and [(1, 2, 456), (1, 2, 1234)]. The result is same.
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame([(1, 2, 1234), (1, 2, 456)], ['id', 'col_2', 'value'])
# test with reversed order, too
#df = sqlContext.createDataFrame([(1, 2, 456), (1, 2, 1234)], ['id', 'col_2', 'value'])
df.createOrReplaceTempView("my_table")
sqlContext.sql("SELECT id, col_2, sort_array(collect_set(value)) AS collected FROM my_table GROUP BY id, col_2").show()
Result
+---+-----+-----------+
| id|col_2| collected|
+---+-----+-----------+
| 1| 2|[456, 1234]|
+---+-----+-----------+
Some observations
when a value is None it appears as null e.g. [null, 456, 1234]
when there is a string value, Spark throws error "TypeError: Can not merge type LongType and StringType"
I think the problem is not the SQL but in the earlier steps where DataFrame was created.

Resources