PySpark: boolean previous values with conditions

I have data like this:
data = [("110125","James","2021-12-05","NY","PA",60000),("110125","James","2021-12-07","NY","PA",3000),("110125","James","2021-12-07","NY","AT",3000),
("5225","Michael","2021-12-25","LA","AT",60000),("5225","Michael","2021-12-17","LA","PA",15000),("5225","Michael","2021-12-17","LA","PA",65000)]
columns = ["id","Name","Date","Local","Office","salary"]
df = spark.createDataFrame(data = data, schema = columns)
Input:
+--------+--------+----------+-----+------+------+
| id |Name |Date |Local|Office|salary|
+--------+--------+----------+-----+------+------+
| 110125| James |2021-12-05|NY |PA | 60000|
| 110125| James |2021-12-07|NY |PA | 3000 |
| 110125| James |2021-12-07|NY |AT | 3000 |
| 5225 | Michael|2021-12-25|LA |AT | 60000|
| 5225 | Michael|2021-12-17|LA |PA | 15000|
| 5225 | Michael|2021-12-17|LA |PA | 65000|
+--------+--------+----------+-----+------+------+
I want a new column 'Check' that is True when any of the four values (Date, Local, Office, salary) differs from the previous row for the same id and Name.
Output:
+--------+--------+----------+-----+------+------+-----+
| id |Name |Date |Local|Office|salary|Check|
+--------+--------+----------+-----+------+------+-----+
| 110125| James |2021-12-05|NY |PA | 60000| |
| 110125| James |2021-12-07|NY |PA | 3000 | True|
| 110125| James |2021-12-07|NY |AT | 3000 | True|
| 5225 | Michael|2021-12-25|LA |AT | 60000| |
| 5225 | Michael|2021-12-17|LA |PA | 15000| True|
| 5225 | Michael|2021-12-17|LA |PA | 65000| True|
+--------+--------+----------+-----+------+------+-----+
My PySpark code:
df.groupby("ID", "Name").withColumn("Check", F.when((F.col('Local') == F.lag('Local')) |(F.col('Office') == F.lag('Office'))|
(F.col('Date') == F.lag('Date'))|(F.col('salary') == F.lag('salary')), False ).otherwise(True))
AttributeError: 'GroupedData' object has no attribute 'withColumn'

groupBy returns a GroupedData object, which only supports aggregations, hence the AttributeError. To compare each row with the previous one per (id, Name) group, you want to use a window:
from pyspark.sql import Window, functions as F
w = Window.partitionBy("id", "Name").orderBy("Date")

df = df.withColumn(
    "Check",
    ~((F.col("Local") == F.lag("Local").over(w))
      & (F.col("Office") == F.lag("Office").over(w))
      & (F.col("Date") == F.lag("Date").over(w))
      & (F.col("salary") == F.lag("salary").over(w)))
)
df.show()
#+------+-------+----------+-----+------+------+-----+
#| id| Name| Date|Local|Office|salary|Check|
#+------+-------+----------+-----+------+------+-----+
#|110125| James|2021-12-05| NY| PA| 60000| null|
#|110125| James|2021-12-07| NY| PA| 3000| true|
#|110125| James|2021-12-07| NY| AT| 3000| true|
#| 5225|Michael|2021-12-17| LA| PA| 15000| null|
#| 5225|Michael|2021-12-17| LA| PA| 65000| true|
#| 5225|Michael|2021-12-25| LA| AT| 60000| true|
#+------+-------+----------+-----+------+------+-----+
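The first row of each group has no previous row, so lag returns null there and Check comes out as null rather than the blank/False value shown in the expected output. If a plain False is preferred for those rows, an optional follow-up step (a small sketch, not part of the answer above) could be:
df = df.withColumn("Check", F.coalesce(F.col("Check"), F.lit(False)))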

Related

PySpark percentile for multiple columns

I want to convert multiple numeric columns of a PySpark dataframe into their percentile values, without changing the row order.
E.g., given an array of column names arr = ['Salary', 'Age', 'Bonus'], convert those columns into percentiles.
Input
+----------+-------------+---------+--------+-----+-------+
| Empl. No | Dept | Pincode | Salary | Age | Bonus |
+----------+-------------+---------+--------+-----+-------+
| 1 | HR | 111 | 1000 | 45 | 100 |
| 2 | Sales | 596 | 500 | 30 | 50 |
| 3 | Manufacture | 895 | 600 | 50 | 400 |
| 4 | HR | 212 | 700 | 26 | 60 |
| 5 | Business | 754 | 350 | 18 | 22 |
+----------+-------------+---------+--------+-----+-------+
Expected output
+----------+-------------+---------+--------+-----+-------+
| Empl. No | Dept | Pincode | Salary | Age | Bonus |
+----------+-------------+---------+--------+-----+-------+
| 1 | HR | 111 | 100 | 80 | 80 |
| 2 | Sales | 596 | 40 | 60 | 40 |
| 3 | Manufacture | 895 | 60 | 100 | 100 |
| 4 | HR | 212 | 80 | 40 | 60 |
| 5 | Business | 754 | 20 | 20 | 20 |
+----------+-------------+---------+--------+-----+-------+
The formula for the percentile of a given element x in the list is: (number of elements less than x / total number of elements) * 100.
You can use percentile_approx for this, in conjunction with groupBy on the desired columns for which you want the percentile to be calculated.
Built-in for Spark 3.x:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

input_list = [
    (1, "HR", 111, 1000, 45, 100),
    (2, "Sales", 112, 500, 30, 50),
    (3, "Manufacture", 127, 600, 50, 500),
    (4, "Hr", 821, 700, 26, 60),
    (5, "Business", 754, 350, 18, 22),
]

sparkDF = spark.createDataFrame(input_list, ['emp_no', 'dept', 'pincode', 'salary', 'age', 'bonus'])

sparkDF.groupBy(['emp_no', 'dept']).agg(
    F.first(F.col('pincode')).alias('pincode'),
    *[F.percentile_approx(F.col(c), 0.95).alias(c) for c in ['salary', 'age', 'bonus']]
).show()
+------+-----------+-------+------+---+-----+
|emp_no| dept|pincode|salary|age|bonus|
+------+-----------+-------+------+---+-----+
| 3|Manufacture| 127| 600| 50| 500|
| 1| HR| 111| 1000| 45| 100|
| 2| Sales| 112| 500| 30| 50|
| 5| Business| 754| 350| 18| 22|
| 4| Hr| 821| 700| 26| 60|
+------+-----------+-------+------+---+-----+
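The F.percentile_approx helper only appeared in pyspark.sql.functions around Spark 3.1; on earlier versions the same SQL aggregate can be reached through F.expr. A minimal sketch, reusing the column list above:
sparkDF.groupBy(['emp_no', 'dept']).agg(
    F.first('pincode').alias('pincode'),
    *[F.expr(f"percentile_approx({c}, 0.95)").alias(c) for c in ['salary', 'age', 'bonus']]
).show()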
Spark has a window function for calculating percentiles which is called percent_rank.
Test df:
from pyspark.sql import SparkSession, functions as F, Window as W
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "HR", 111, 1000, 45, 100),
     (2, "Sales", 596, 500, 30, 50),
     (3, "Manufacture", 895, 600, 50, 400),
     (4, "HR", 212, 700, 26, 60),
     (5, "Business", 754, 350, 18, 22)],
    ['Empl_No', 'Dept', 'Pincode', 'Salary', 'Age', 'Bonus'])
df.show()
# +-------+-----------+-------+------+---+-----+
# |Empl_No| Dept|Pincode|Salary|Age|Bonus|
# +-------+-----------+-------+------+---+-----+
# | 1| HR| 111| 1000| 45| 100|
# | 2| Sales| 596| 500| 30| 50|
# | 3|Manufacture| 895| 600| 50| 400|
# | 4| HR| 212| 700| 26| 60|
# | 5| Business| 754| 350| 18| 22|
# +-------+-----------+-------+------+---+-----+
percent_rank works in a way that the smallest value gets percentile 0 and the biggest value gets 1.
arr = ['Salary', 'Age', 'Bonus']
df = df.select(
    *[c for c in df.columns if c not in arr],
    *[F.percent_rank().over(W.orderBy(c)).alias(c) for c in arr]
).sort('Empl_No')
df.show()
# +-------+-----------+-------+------+----+-----+
# |Empl_No| Dept|Pincode|Salary| Age|Bonus|
# +-------+-----------+-------+------+----+-----+
# | 1| HR| 111| 1.0|0.75| 0.75|
# | 2| Sales| 596| 0.25| 0.5| 0.25|
# | 3|Manufacture| 895| 0.5| 1.0| 1.0|
# | 4| HR| 212| 0.75|0.25| 0.5|
# | 5| Business| 754| 0.0| 0.0| 0.0|
# +-------+-----------+-------+------+----+-----+
However, your expectation is somewhat different. You expect it to assume 0 as the smallest value even though it does not exist in the columns.
To solve this, I will add a row of 0 values and delete it afterwards.
arr = ['Salary', 'Age', 'Bonus']
# Adding a row containing 0 values
df = df.limit(1).withColumn('Dept', F.lit('_tmp')).select(
    *[c for c in df.columns if c not in arr],
    *[F.lit(0).alias(c) for c in arr]
).union(df)

# Calculating percentiles
df = df.select(
    *[c for c in df.columns if c not in arr],
    *[F.percent_rank().over(W.orderBy(c)).alias(c) for c in arr]
).sort('Empl_No')

# Removing the fake row
df = df.filter("Dept != '_tmp'")
df.show()
# +-------+-----------+-------+------+---+-----+
# |Empl_No| Dept|Pincode|Salary|Age|Bonus|
# +-------+-----------+-------+------+---+-----+
# | 1| HR| 111| 1.0|0.8| 0.8|
# | 2| Sales| 596| 0.4|0.6| 0.4|
# | 3|Manufacture| 895| 0.6|1.0| 1.0|
# | 4| HR| 212| 0.8|0.4| 0.6|
# | 5| Business| 754| 0.2|0.2| 0.2|
# +-------+-----------+-------+------+---+-----+
You can multiply the percentile by 100 if you like:
*[(100 * F.percent_rank().over(W.orderBy(c))).alias(c) for c in arr]
Then you get...
+-------+-----------+-------+------+-----+-----+
|Empl_No| Dept|Pincode|Salary| Age|Bonus|
+-------+-----------+-------+------+-----+-----+
| 1| HR| 111| 100.0| 80.0| 80.0|
| 2| Sales| 596| 40.0| 60.0| 40.0|
| 3|Manufacture| 895| 60.0|100.0|100.0|
| 4| HR| 212| 80.0| 40.0| 60.0|
| 5| Business| 754| 20.0| 20.0| 20.0|
+-------+-----------+-------+------+-----+-----+
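As a side note, the numbers in the expected output are exactly the fraction of rows less than or equal to each value, times 100, which is what cume_dist computes. Assuming it is applied to the original test df with the same arr list, a variant without the temporary row might look like this:
df = df.select(
    *[c for c in df.columns if c not in arr],
    *[(100 * F.cume_dist().over(W.orderBy(c))).alias(c) for c in arr]
).sort('Empl_No')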

Dynamically add padding Zeros

mock_data = [('TYCO', ' 1303','13'),('EMC', ' 120989 ','123'), ('VOLVO ', '102329 ','1234'),('BMW', '1301571345 ',' '),('FORD', '004','21212')]
df = spark.createDataFrame(mock_data, ['col1', 'col2','col3'])
+-------+------------+-----+
| col1 | col2| col3|
+-------+------------+-----+
| TYCO| 1303| 13|
| EMC| 120989 | 123|
|VOLVO | 102329 | 1234|
| BMW|1301571345 | |
| FORD| 004|21212|
+-------+------------+-----+
Trim col2 and, based on the remaining length (10 - length of trimmed col2), dynamically left-pad col3 with zeros, then concatenate col2 and col3.
from pyspark.sql.functions import length, trim

df2 = df.withColumn('length_col2', 10 - length(trim(df.col2)))
+-------+------------+-----+-----------+
| col1| col2| col3|length_col2|
+-------+------------+-----+-----------+
| TYCO| 1303| 13| 6|
| EMC| 120989 | 123| 4|
|VOLVO | 102329 | 1234| 4|
| BMW|1301571345 | | 0|
| FORD| 004|21212| 7|
+-------+------------+-----+-----------+
expected output
+-------+------------+-----+-----------+
| col1  | col2       | col3| output    |
+-------+------------+-----+-----------+
| TYCO  | 1303       |   13| 1303000013|
| EMC   | 120989     |  123| 1209890123|
| VOLVO | 102329     | 1234| 1023291234|
| BMW   | 1301571345 |     | 1301571345|
| FORD  | 004        |21212| 0040021212|
+-------+------------+-----+-----------+
What you are looking for is the rpad function (also available as pyspark.sql.functions.rpad), listed here => https://spark.apache.org/docs/2.3.0/api/sql/index.html
See the solution below:
%pyspark
mock_data = [('TYCO', ' 1303','13'),('EMC', ' 120989 ','123'), ('VOLVO ', '102329 ','1234'),('BMW', '1301571345 ',' '),('FORD', '004','21212')]
df = spark.createDataFrame(mock_data, ['col1', 'col2','col3'])
df.createOrReplaceTempView("input_df")
spark.sql("SELECT *, concat(rpad(trim(col2),10,'0') , col3) as OUTPUT from input_df").show(20,False)
And the result:
+-------+------------+-----+---------------+
|col1 |col2 |col3 |OUTPUT |
+-------+------------+-----+---------------+
|TYCO | 1303 |13 |130300000013 |
|EMC | 120989 |123 |1209890000123 |
|VOLVO |102329 |1234 |10232900001234 |
|BMW |1301571345 | |1301571345 |
|FORD |004 |21212|004000000021212|
+-------+------------+-----+---------------+
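Note that rpad pads col2 to 10 characters and then appends col3, so the strings above end up longer than 10 characters. If the goal is the exact 10-character expected output, a variant that left-pads col3 to the remaining width instead might look like this (a sketch against the same temp view):
spark.sql("""
    SELECT *,
           concat(trim(col2), lpad(trim(col3), 10 - length(trim(col2)), '0')) AS output
    FROM input_df
""").show(20, False)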

Pyspark: pivot month level records into a single column based quarter level record

I have a dataframe in the form:
+---------+-------+------------+------------------+-------+-------+
| quarter | month | month_rank | unique_customers | units | sales |
+---------+-------+------------+------------------+-------+-------+
| 1       | 1     | 1          | 15               | 30    | 1000  |
| 1       | 2     | 2          | 20               | 35    | 1200  |
| 1       | 3     | 3          | 18               | 40    | 1500  |
| 2       | 4     | 1          | 10               | 25    | 800   |
| 2       | 5     | 2          | 25               | 50    | 2000  |
| 2       | 6     | 3          | 28               | 45    | 1800  |
...
I am trying to group on quarter and track the monthly sales in a columnar fashion such as the following:
+---------+-------------+------------------------+-------------+-------------+-------------+------------------------+-------------+-------------+-------------+------------------------+-------------+-------------+
| quarter | month_rank1 | rank1_unique_customers | rank1_units | rank1_sales | month_rank2 | rank2_unique_customers | rank2_units | rank2_sales | month_rank3 | rank3_unique_customers | rank3_units | rank3_sales |
+---------+-------------+------------------------+-------------+-------------+-------------+------------------------+-------------+-------------+-------------+------------------------+-------------+-------------+
| 1       | 1           | 15                     | 30          | 1000        | 2           | 20                     | 35          | 1200        | 3           | 18                     | 40          | 1500        |
| 2       | 4           | 10                     | 25          | 800         | 5           | 25                     | 50          | 2000        | 6           | 28                     | 45          | 1800        |
+---------+-------------+------------------------+-------------+-------------+-------------+------------------------+-------------+-------------+-------------+------------------------+-------------+-------------+
Is this achievable with multiple pivots? I have had no luck creating multiple columns from a pivot. I am thinking I might be able to achieve this result with windowing, but if anyone has run into a similar problem, any suggestions would be greatly appreciated. Thank you!
Use pivot on the month_rank column, then agg the other columns.
Example:
df = spark.createDataFrame(
    [(1, 1, 1, 15, 30, 1000), (1, 2, 2, 20, 35, 1200), (1, 3, 3, 18, 40, 1500),
     (2, 4, 1, 10, 25, 800), (2, 5, 2, 25, 50, 2000), (2, 6, 3, 28, 45, 1800)],
    ["quarter", "month", "month_rank", "unique_customers", "units", "sales"])
df.show()
#+-------+-----+----------+----------------+-----+-----+
#|quarter|month|month_rank|unique_customers|units|sales|
#+-------+-----+----------+----------------+-----+-----+
#| 1| 1| 1| 15| 30| 1000|
#| 1| 2| 2| 20| 35| 1200|
#| 1| 3| 3| 18| 40| 1500|
#| 2| 4| 1| 10| 25| 800|
#| 2| 5| 2| 25| 50| 2000|
#| 2| 6| 3| 28| 45| 1800|
#+-------+-----+----------+----------------+-----+-----+
from pyspark.sql.functions import first, col

df1 = df.\
    groupBy("quarter").\
    pivot("month_rank").\
    agg(first(col("month")), first(col("unique_customers")), first(col("units")), first(col("sales")))

cols = ["quarter", "month_rank1", "rank1_unique_customers", "rank1_units", "rank1_sales",
        "month_rank2", "rank2_unique_customers", "rank2_units", "rank2_sales",
        "month_rank3", "rank3_unique_customers", "rank3_units", "rank3_sales"]

df1.toDF(*cols).show()
#+-------+-----------+----------------------+-----------+-----------+-----------+----------------------+-----------+-----------+-----------+----------------------+-----------+-----------+
#|quarter|month_rank1|rank1_unique_customers|rank1_units|rank1_sales|month_rank2|rank2_unique_customers|rank2_units|rank2_sales|month_rank3|rank3_unique_customers|rank3_units|rank3_sales|
#+-------+-----------+----------------------+-----------+-----------+-----------+----------------------+-----------+-----------+-----------+----------------------+-----------+-----------+
#| 1| 1| 15| 30| 1000| 2| 20| 35| 1200| 3| 18| 40| 1500|
#| 2| 4| 10| 25| 800| 5| 25| 50| 2000| 6| 28| 45| 1800|
#+-------+-----------+----------------------+-----------+-----------+-----------+----------------------+-----------+-----------+-----------+----------------------+-----------+-----------+
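One optional refinement, assuming the ranks are always 1 to 3: passing the pivot values explicitly keeps the column order deterministic and saves Spark the extra job it otherwise runs to discover them.
df1 = df.\
    groupBy("quarter").\
    pivot("month_rank", [1, 2, 3]).\
    agg(first(col("month")), first(col("unique_customers")), first(col("units")), first(col("sales")))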

Expand last value of string column to groupby Pandas Dataframe

I have the following Pandas dataframe:
+--------+----+
|id |name|
+--------+----+
| 1| |
| 1| |
| 1| |
| 1|Carl|
| 2| |
| 2| |
| 2|John|
+--------+----+
What I want to achieve is to expand the last value of each group to the rest of the group:
+--------+----+
|id |name|
+--------+----+
| 1|Carl|
| 1|Carl|
| 1|Carl|
| 1|Carl|
| 2|John|
| 2|John|
| 2|John|
+--------+----+
It looks pretty easy, but I am struggling to achieve it because of the column's type.
What I've tried so far is:
df['name'] = df.groupby('id')['name'].transform('last')
This works for int or float columns, but not for string columns.
I am getting the following error:
No numeric types to aggregate
Thanks in advance.
Edit
bfill() is not valid because I can have the following:
+--------+----+
|id |name|
+--------+----+
| 1| |
| 1| |
| 1| |
| 1|Carl|
| 2| |
| 2| |
| 2| |
| 3| |
| 3| |
| 3|John|
+--------+----+
In this case, I want id = 2 to remain NaN, but with bfill() it would end up as John, which is incorrect. The desired output would be:
+--------+----+
|id |name|
+--------+----+
| 1|Carl|
| 1|Carl|
| 1|Carl|
| 1|Carl|
| 2| |
| 2| |
| 2| |
| 3|John|
| 3|John|
| 3|John|
+--------+----+
If the empty values are NaN, you could try backfilling:
df['name'] = df['name'].bfill()
If not, replace the empty strings with NaN first.
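That said, a plain bfill crosses group boundaries (see the edit above). A minimal sketch that backfills within each id only, assuming the empty strings stand for missing values:
import numpy as np

df['name'] = df['name'].replace('', np.nan)
# backfill within each id; groups with no name at all stay NaN
df['name'] = df.groupby('id')['name'].bfill()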
Try this.
import pandas as pd
import numpy as np

dff = pd.DataFrame({"id": [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],
                    "name": ["", "", "", "car1", "", "", "", "", "", "john"]})
dff = dff.replace(r'', np.NaN)

def c(x):
    # If the group has at least one non-null value, broadcast that string to the whole group;
    # otherwise return empty strings for the whole group.
    if sum(pd.isnull(x)) != np.size(x):
        l = [v for v in x if type(v) == str]
        return [l[0]] * np.size(x)
    else:
        return [""] * np.size(x)

df = dff.groupby('id')["name"].apply(lambda x: c(list(x)))
df = df.to_frame().reset_index()
df = df.set_index('id').name.apply(pd.Series).stack().reset_index(level=0).rename(columns={0: 'name'})
output
id name
0 1 car1
1 1 car1
2 1 car1
3 1 car1
0 2
1 2
2 2
0 3 john
1 3 john
2 3 john

PySpark - Select rows where the column has non-consecutive values after grouping

I have a dataframe of the form:
|user_id| action | day |
------------------------
| d25as | AB | 2 |
| d25as | AB | 3 |
| d25as | AB | 5 |
| m3562 | AB | 1 |
| m3562 | AB | 7 |
| m3562 | AB | 9 |
| ha42a | AB | 3 |
| ha42a | AB | 4 |
| ha42a | AB | 5 |
I want to filter out users that are only seen on consecutive days, i.e. users that never appear on at least one nonconsecutive day. The resulting dataframe should be:
|user_id| action | day |
------------------------
| d25as | AB | 2 |
| d25as | AB | 3 |
| d25as | AB | 5 |
| m3562 | AB | 1 |
| m3562 | AB | 7 |
| m3562 | AB | 9 |
where the last user has been removed, since they appeared only on consecutive days.
Does anyone know how this can be done in Spark?
This uses Spark SQL window functions and no UDFs. The trick is that row_number() over (partition by user_id order by day) minus day is constant within a run of consecutive days, so a user whose rows produce more than one distinct difference has at least one gap. The df construction is done in Scala, but the SQL part is the same in Python. Check this out:
val df = Seq(("d25as","AB",2),("d25as","AB",3),("d25as","AB",5),("m3562","AB",1),("m3562","AB",7),("m3562","AB",9),("ha42a","AB",3),("ha42a","AB",4),("ha42a","AB",5)).toDF("user_id","action","day")
df.createOrReplaceTempView("qubix")
spark.sql(
""" with t1( select user_id, action, day, row_number() over(partition by user_id order by day)-day diff from qubix),
t2( select user_id, action, day, collect_set(diff) over(partition by user_id) diff2 from t1)
select user_id, action, day from t2 where size(diff2) > 1
""").show(false)
Results:
+-------+------+---+
|user_id|action|day|
+-------+------+---+
|d25as |AB |2 |
|d25as |AB |3 |
|d25as |AB |5 |
|m3562 |AB |1 |
|m3562 |AB |7 |
|m3562 |AB |9 |
+-------+------+---+
PySpark version:
>>> from pyspark.sql.functions import *
>>> values = [('d25as','AB',2),('d25as','AB',3),('d25as','AB',5),
... ('m3562','AB',1),('m3562','AB',7),('m3562','AB',9),
... ('ha42a','AB',3),('ha42a','AB',4),('ha42a','AB',5)]
>>> df = spark.createDataFrame(values,['user_id','action','day'])
>>> df.show()
+-------+------+---+
|user_id|action|day|
+-------+------+---+
| d25as| AB| 2|
| d25as| AB| 3|
| d25as| AB| 5|
| m3562| AB| 1|
| m3562| AB| 7|
| m3562| AB| 9|
| ha42a| AB| 3|
| ha42a| AB| 4|
| ha42a| AB| 5|
+-------+------+---+
>>> df.createOrReplaceTempView("qubix")
>>> spark.sql(
... """ with t1( select user_id, action, day, row_number() over(partition by user_id order by day)-day diff from qubix),
... t2( select user_id, action, day, collect_set(diff) over(partition by user_id) diff2 from t1)
... select user_id, action, day from t2 where size(diff2) > 1
... """).show()
+-------+------+---+
|user_id|action|day|
+-------+------+---+
| d25as| AB| 2|
| d25as| AB| 3|
| d25as| AB| 5|
| m3562| AB| 1|
| m3562| AB| 7|
| m3562| AB| 9|
+-------+------+---+
>>>
Read the comments in between; the code will then be self-explanatory.
from pyspark.sql.functions import udf, collect_list, explode, col
from pyspark.sql.types import BooleanType
#Creating the DataFrame
values = [('d25as','AB',2),('d25as','AB',3),('d25as','AB',5),
('m3562','AB',1),('m3562','AB',7),('m3562','AB',9),
('ha42a','AB',3),('ha42a','AB',4),('ha42a','AB',5)]
df = sqlContext.createDataFrame(values,['user_id','action','day'])
df.show()
+-------+------+---+
|user_id|action|day|
+-------+------+---+
| d25as| AB| 2|
| d25as| AB| 3|
| d25as| AB| 5|
| m3562| AB| 1|
| m3562| AB| 7|
| m3562| AB| 9|
| ha42a| AB| 3|
| ha42a| AB| 4|
| ha42a| AB| 5|
+-------+------+---+
# Grouping together the days in one list.
df = df.groupby(['user_id','action']).agg(collect_list('day'))
df.show()
+-------+------+-----------------+
|user_id|action|collect_list(day)|
+-------+------+-----------------+
| ha42a| AB| [3, 4, 5]|
| m3562| AB| [1, 7, 9]|
| d25as| AB| [2, 3, 5]|
+-------+------+-----------------+
# Creating a UDF to check whether the days are consecutive. Only keep the rows where it is False.
check_consecutive = udf(lambda row: sorted(row) == list(range(min(row), max(row) + 1)), BooleanType())
df = df.withColumn('consecutive', check_consecutive(col('collect_list(day)')))\
       .where(col('consecutive') == False)
df.show()
+-------+------+-----------------+-----------+
|user_id|action|collect_list(day)|consecutive|
+-------+------+-----------------+-----------+
| m3562| AB| [1, 7, 9]| false|
| d25as| AB| [2, 3, 5]| false|
+-------+------+-----------------+-----------+
# Finally, exploding the DataFrame from above to get the result.
df = df.withColumn("day", explode(col('collect_list(day)')))\
       .drop('consecutive', 'collect_list(day)')
df.show()
+-------+------+---+
|user_id|action|day|
+-------+------+---+
| m3562| AB| 1|
| m3562| AB| 7|
| m3562| AB| 9|
| d25as| AB| 2|
| d25as| AB| 3|
| d25as| AB| 5|
+-------+------+---+
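For completeness, the row_number()-minus-day idea from the first answer can also be written with the DataFrame API, avoiding both the UDF and the groupby/explode round trip. A minimal sketch, assuming it starts from the original df (before the groupby above):
from pyspark.sql import Window, functions as F

w = Window.partitionBy("user_id").orderBy("day")

result = (df
    .withColumn("diff", F.row_number().over(w) - F.col("day"))
    .withColumn("n_diffs", F.size(F.collect_set("diff").over(Window.partitionBy("user_id"))))
    .where(F.col("n_diffs") > 1)
    .drop("diff", "n_diffs"))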
