How to delete redundant values based on other value? - apache-spark

In the dataframe below, there are several apartments with different jobs:
+---+---------+------+
|id |apartment|job |
+---+---------+------+
|1 |Ap1 |dev |
|2 |Ap1 |anyl |
|3 |Ap2 |dev |
|4 |Ap2 |anyl |
|5 |Ap2 |anyl |
|6 |Ap2 |dev |
|7 |Ap2 |dev |
|8 |Ap2 |dev |
|9 |Ap3 |anyl |
|10 |Ap3 |dev |
|11 |Ap3 |dev |
+---+---------+------+
For each apartment, the number of rows with job='dev' should be equal to the number of rows with job='anyl' (as is the case for Ap1). How can I delete the redundant 'dev' rows in all the apartments?
The expected result:
+---+---------+------+
|id |apartment|job |
+---+---------+------+
|1 |Ap1 |dev |
|2 |Ap1 |anyl |
|3 |Ap2 |dev |
|4 |Ap2 |anyl |
|5 |Ap2 |anyl |
|6 |Ap2 |dev |
|9 |Ap3 |anyl |
|10 |Ap3 |dev |
+---+---------+------+
I guess I should use Window functions to deal with that, but I couldn't figure it out.

I think you first need to find out how many 'anyl' rows you have for every 'apartment' and then use that to delete all the excess 'dev' rows. So: first an aggregation, then a join, and then the window function row_number before you can filter out what you don't need.
Setup:
from pyspark.sql import functions as F, Window as W
df = spark.createDataFrame(
    [(1, 'Ap1', 'dev'),
     (2, 'Ap1', 'anyl'),
     (3, 'Ap2', 'dev'),
     (4, 'Ap2', 'anyl'),
     (5, 'Ap2', 'anyl'),
     (6, 'Ap2', 'dev'),
     (7, 'Ap2', 'dev'),
     (8, 'Ap2', 'dev'),
     (9, 'Ap3', 'anyl'),
     (10, 'Ap3', 'dev'),
     (11, 'Ap3', 'dev')],
    ['id', 'apartment', 'job']
)
Script:
# number of 'anyl' rows per apartment
df_grp = df.filter(F.col('job') == 'anyl').groupBy('apartment').count()
df = df.join(df_grp, 'apartment', 'left')
# rank rows within each (apartment, job) group and keep at most `count` of them
w = W.partitionBy('apartment', 'job').orderBy('id')
df = df.withColumn('_rn', F.row_number().over(w))
df = df.filter('_rn <= count')
df = df.select('id', 'apartment', 'job')
df.show()
# +---+---------+----+
# | id|apartment| job|
# +---+---------+----+
# | 2| Ap1|anyl|
# | 1| Ap1| dev|
# | 4| Ap2|anyl|
# | 5| Ap2|anyl|
# | 3| Ap2| dev|
# | 6| Ap2| dev|
# | 9| Ap3|anyl|
# | 10| Ap3| dev|
# +---+---------+----+

Using a left semi join instead of the groupBy+join+filter combo suggested by @ZygD might be more efficient:
>>> from pyspark.sql import Window
>>> from pyspark.sql.functions import col, row_number
>>> df1 = df.withColumn('rn', row_number().over(Window.partitionBy('apartment', 'job').orderBy('id')))
>>> dfa = df1.where("job = 'anyl'").alias('dfa')
>>> df2 = df1.alias('dfb').join(dfa, (col('dfb.apartment') == col('dfa.apartment')) & (col('dfb.rn') == col('dfa.rn')), 'leftsemi')
>>> df2.show(truncate=False)
+---+---------+----+---+
|id |apartment|job |rn |
+---+---------+----+---+
|1 |Ap1 |dev |1 |
|2 |Ap1 |anyl|1 |
|3 |Ap2 |dev |1 |
|4 |Ap2 |anyl|1 |
|5 |Ap2 |anyl|2 |
|6 |Ap2 |dev |2 |
|9 |Ap3 |anyl|1 |
|10 |Ap3 |dev |1 |
+---+---------+----+---+
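For completeness, a join-free sketch of the same idea (not from either answer above, so treat it as an assumption): count the 'anyl' rows per apartment with an unordered window instead of an aggregation+join, then apply the same row_number filter.
from pyspark.sql import functions as F, Window as W
# whole-partition count of 'anyl' rows per apartment (window has no orderBy)
w_cnt = W.partitionBy('apartment')
# position of each row within its (apartment, job) group
w_rn = W.partitionBy('apartment', 'job').orderBy('id')
result = (df
          .withColumn('n_anyl', F.count(F.when(F.col('job') == 'anyl', 1)).over(w_cnt))
          .withColumn('rn', F.row_number().over(w_rn))
          .filter('rn <= n_anyl')
          .select('id', 'apartment', 'job'))
result.show()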

Related

pyspark applying odm mapping on column level

I have the two data frames below, and I would like to apply a similar condition and return the values in PySpark data frames.
df1.show()
+---+-------+--------+
|id |tr_type|nominal |
+---+-------+--------+
|1 |K |2.0 |
|2 |ZW |7.0 |
|3 |V |12.5 |
|4 |VW |9.0 |
|5 |CI |5.0 |
+---+-------+--------+
One dimensional mapping:
*abcefgh
+-------+------------+------------+-----------+
|odm_id |return_value|odm_relation|input_value|
+-------+------------+------------+-----------+
|abcefgh|B |EQ |K |
|abcefgh|B |EQ |ZW |
|abcefgh|S |EQ |V |
|abcefgh|S |EQ |VW |
|abcefgh|I |EQ |CI |
+-------+------------+------------+-----------+
I need to apply the below condition: the nominal volume is negated when there is a sell transaction.
IF (tr_type, $abcefgh.) == 'S' THEN ;
nominal = -nominal ;
The expected output:
+---+-------+-------+-----------+
|id |tr_type|nominal|nominal_new|
+---+-------+-------+-----------+
|1 |K |2.0 |2.0 |
|2 |ZW |7.0 |7.0 |
|3 |V |12.5 |-12.5 |
|4 |VW |9.0 |-9.0 |
|5 |CI |5.0 |5.0 |
+---+-------+-------+-----------+
You could join the 2 dataframes on tr_type == input_value and use a when().otherwise() to create the new column.
See the example below using your samples:
from pyspark.sql import functions as func
# data_sdf / odm_sdf are the question's two dataframes (transactions and mapping)
data_sdf. \
    join(odm_sdf.selectExpr('return_value', 'input_value as tr_type').dropDuplicates(),
         ['tr_type'],
         'left'). \
    withColumn('nominal_new',
               func.when(func.col('return_value') == 'S', func.col('nominal') * -1).
               otherwise(func.col('nominal'))). \
    drop('return_value'). \
    show()
# +-------+---+-------+-----------+
# |tr_type| id|nominal|nominal_new|
# +-------+---+-------+-----------+
# | K| 1| 2.0| 2.0|
# | CI| 5| 5.0| 5.0|
# | V| 3| 12.5| -12.5|
# | VW| 4| 9.0| -9.0|
# | ZW| 2| 7.0| 7.0|
# +-------+---+-------+-----------+
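If the mapping table is small, a hedged alternative sketch (reusing the data_sdf/odm_sdf names from the answer above): collect the sell codes to the driver and replace the join with an isin check.
from pyspark.sql import functions as func
# gather the handful of input_value codes that map to 'S' (sell)
sell_codes = [r['input_value'] for r in
              odm_sdf.filter("return_value = 'S'").select('input_value').distinct().collect()]
result = data_sdf.withColumn(
    'nominal_new',
    func.when(func.col('tr_type').isin(sell_codes), -func.col('nominal'))
        .otherwise(func.col('nominal')))
result.show()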

Creating Total and percentage of total columns in Pyspark

Here is my test data
test = spark.createDataFrame([
    ("2018-06-03", 2, 4, 4),
    ("2018-06-04", 4, 3, 3),
    ("2018-06-03", 8, 1, 1),
    ("2018-06-01", 3, 1, 1),
    ("2018-06-05", 3, 2, 0),
])\
    .toDF("transactiondate", "SalesA", "SalesB", "SalesC")
test.show()
I would like to add a row-wise total column and a percent-of-total column for each sales category (A, B and C).
Desired Output:
+---------------+------+------+------+----------+------+------+------+
|transactiondate|SalesA|SalesB|SalesC|TotalSales|Perc_A|Perc_B|Perc_C|
+---------------+------+------+------+----------+------+------+------+
| 2018-06-03| 2| 4| 4| 10| 0.2| 0.4| 0.4|
| 2018-06-04| 4| 3| 3| 10| 0.4| 0.3| 0.3|
| 2018-06-03| 8| 1| 1| 10| 0.8| 0.1| 0.1|
| 2018-06-01| 3| 1| 1| 5| 0.6| 0.2| 0.2|
| 2018-06-05| 3| 2| 0| 5| 0.6| 0.4| 0.0|
+---------------+------+------+------+----------+------+------+------+
How can I do it in pyspark?
Edit: I want the code to be adaptable even if I add more items, i.e. if I have one more column salesD, the code should still create the total and percentage columns (i.e. the column names shouldn't be hardcoded).
You can use selectExpr and do simple arithmetic SQL operations for each added column:
test = test.selectExpr("*",
"SalesA+SalesB+SalesC as TotalSales",
"SalesA/(SalesA+SalesB+SalesC) as Perc_A",
"SalesB/(SalesA+SalesB+SalesC) as Perc_B",
"SalesC/(SalesA+SalesB+SalesC) as Perc_C"
)
or use a more flexible solution
from pyspark.sql.functions import col, expr
# columns to be included in TotalSales calculation
cols = ['SalesA', 'SalesB', 'SalesC']
test = (test
.withColumn('TotalSales', expr('+'.join(cols)))
.select(col('*'),
*[expr('{0}/TotalSales {1}'.format(c,'Perc_'+c)) for c in cols]))
One option is to use several withColumn statements
import pyspark.sql.functions as F
test\
.withColumn('TotalSales', F.col('SalesA') + F.col('SalesB') + F.col('SalesC'))\
.withColumn('Perc_A', F.col('SalesA') / F.col('TotalSales'))\
.withColumn('Perc_B', F.col('SalesB') / F.col('TotalSales'))\
.withColumn('Perc_C', F.col('SalesC') / F.col('TotalSales'))
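Since the edit asks for the column names not to be hardcoded, here is a small sketch that builds the same columns in a loop; it assumes the Sales* prefix convention from the test data.
import pyspark.sql.functions as F
from functools import reduce
from operator import add
# pick up every Sales* column dynamically
sales_cols = [c for c in test.columns if c.startswith('Sales')]
out = test.withColumn('TotalSales', reduce(add, [F.col(c) for c in sales_cols]))
for c in sales_cols:
    # SalesA -> Perc_A, SalesB -> Perc_B, ...
    out = out.withColumn('Perc_' + c[len('Sales'):], F.col(c) / F.col('TotalSales'))
out.show()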
Try this Spark SQL solution:
test.createOrReplaceTempView("sales_table")
sales=[ x for x in test.columns if x.upper().startswith("SALES") ]
sales2="+".join(sales)
print(str(sales)) # ['SalesA', 'SalesB', 'SalesC']
per_sales=[ x +"/TotalSales as " + "Perc_" +x for x in sales ]
per_sales2=",".join(per_sales)
print(str(per_sales)) # ['SalesA/TotalSales as Perc_SalesA', 'SalesB/TotalSales as Perc_SalesB', 'SalesC/TotalSales as Perc_SalesC']
spark.sql(f"""
with t1 ( select *, {sales2} TotalSales from sales_table )
select *, {per_sales2} from t1
""").show(truncate=False)
+---------------+------+------+------+----------+-----------+-----------+-----------+
|transactiondate|SalesA|SalesB|SalesC|TotalSales|Perc_SalesA|Perc_SalesB|Perc_SalesC|
+---------------+------+------+------+----------+-----------+-----------+-----------+
|2018-06-03 |2 |4 |4 |10 |0.2 |0.4 |0.4 |
|2018-06-04 |4 |3 |3 |10 |0.4 |0.3 |0.3 |
|2018-06-03 |8 |1 |1 |10 |0.8 |0.1 |0.1 |
|2018-06-01 |3 |1 |1 |5 |0.6 |0.2 |0.2 |
|2018-06-05 |3 |2 |0 |5 |0.6 |0.4 |0.0 |
+---------------+------+------+------+----------+-----------+-----------+-----------+
You can also use the aggregate() higher-order function to sum the Sales* columns. But for this the columns must be of integer/double type, not long: the initial value 0 fixes the accumulator's type to int, so acc + x no longer matches it when the columns are bigint.
test2=test.withColumn("SalesA",expr("cast(salesa as int)"))\
.withColumn("SalesB",expr("cast(salesb as int)"))\
.withColumn("SalesC",expr("cast(salesc as int)"))
test2.createOrReplaceTempView("sales_table2")
sales3=",".join(sales) # just join the sales columns with comma
spark.sql(f"""
with t1 ( select *, aggregate(array({sales3}),0,(acc,x) -> acc+x) TotalSales from sales_table2 )
select *, {per_sales2} from t1
""").show(truncate=False)
+---------------+------+------+------+----------+-----------+-----------+-----------+
|transactiondate|SalesA|SalesB|SalesC|TotalSales|Perc_SalesA|Perc_SalesB|Perc_SalesC|
+---------------+------+------+------+----------+-----------+-----------+-----------+
|2018-06-03 |2 |4 |4 |10 |0.2 |0.4 |0.4 |
|2018-06-04 |4 |3 |3 |10 |0.4 |0.3 |0.3 |
|2018-06-03 |8 |1 |1 |10 |0.8 |0.1 |0.1 |
|2018-06-01 |3 |1 |1 |5 |0.6 |0.2 |0.2 |
|2018-06-05 |3 |2 |0 |5 |0.6 |0.4 |0.0 |
+---------------+------+------+------+----------+-----------+-----------+-----------+
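If you would rather not cast the Sales columns at all, a hedged variation (reusing sales3, per_sales2 and the sales_table view from above): make the zero value itself a bigint so the accumulator matches the long columns.
spark.sql(f"""
with t1 as ( select *, aggregate(array({sales3}), cast(0 as bigint), (acc, x) -> acc + x) TotalSales from sales_table )
select *, {per_sales2} from t1
""").show(truncate=False)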

Can we reorder spark dataframe's columns?

I am creating a dataframe as per a given schema; after that I want to create a new dataframe by reordering the columns of the existing one.
Is it possible to re-order the columns of a Spark dataframe?
object Demo extends Context {
  def main(args: Array[String]): Unit = {
    val emp = Seq((1,"Smith",-1,"2018","10","M",3000),
      (2,"Rose",1,"2010","20","M",4000),
      (3,"Williams",1,"2010","10","M",1000),
      (4,"Jones",2,"2005","10","F",2000),
      (5,"Brown",2,"2010","40","",-1),
      (6,"Brown",2,"2010","50","",-1)
    )
    val empColumns = Seq("emp_id","name","superior_emp_id","year_joined",
      "emp_dept_id","gender","salary")
    import sparkSession.sqlContext.implicits._
    val empDF = emp.toDF(empColumns: _*)
    empDF.show(false)
  }
}
Current DF:
+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1 |Smith |-1 |2018 |10 |M |3000 |
|2 |Rose |1 |2010 |20 |M |4000 |
|3 |Williams|1 |2010 |10 |M |1000 |
|4 |Jones |2 |2005 |10 |F |2000 |
|5 |Brown |2 |2010 |40 | |-1 |
|6 |Brown |2 |2010 |50 | |-1 |
+------+--------+---------------+-----------+-----------+------+------+
I want the output to be the following df, where the gender and salary columns are re-ordered:
New DF:
+------+--------+------+------+---------------+-----------+-----------+
|emp_id|name |gender|salary|superior_emp_id|year_joined|emp_dept_id|
+------+--------+------+------+---------------+-----------+-----------+
|1 |Smith |M |3000 |-1 |2018 |10 |
|2 |Rose |M |4000 |1 |2010 |20 |
|3 |Williams|M |1000 |1 |2010 |10 |
|4 |Jones |F |2000 |2 |2005 |10 |
|5 |Brown | |-1 |2 |2010 |40 |
|6 |Brown | |-1 |2 |2010 |50 |
+------+--------+------+------+---------------+-----------+-----------+
Just use select() to re-order the columns:
df = df.select('emp_id','name','gender','salary','superior_emp_id','year_joined','emp_dept_id')
The columns will appear in the order you list them in the select() arguments.
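If you don't want to type out every remaining column, a small PySpark sketch (assuming only gender and salary need to move forward):
front = ['emp_id', 'name', 'gender', 'salary']
rest = [c for c in df.columns if c not in front]
df = df.select(front + rest)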
Scala way of doing it
import org.apache.spark.sql.functions.col
//Order the column names as you want
val columns = Array("emp_id","name","gender","salary","superior_emp_id","year_joined","emp_dept_id")
  .map(col)
//Pass it to select
df.select(columns: _*)

Spark Window function has sliding window behavior when it is ordered

I have a dataset which looks like this:
+---+-------------------------------+--------+
|key|value |someData|
+---+-------------------------------+--------+
|1 |AAA |5 |
|1 |VVV |6 |
|1 |DDDD |8 |
|3 |rrerw |9 |
|4 |RRRRR |13 |
|6 |AAAAABB |15 |
|6 |C:\Windows\System32\svchost.exe|20 |
+---+-------------------------------+--------+
Now I apply the avg aggregate function twice, first over an ordered window and then over an unordered window, and the results are not the same. Example:
WindowSpec windowSpec = Window.orderBy(col("someData")).partitionBy(col("key"));
rawMapping.withColumn("avg", avg("someData").over(windowSpec)).show(false);
+---+-------------------------------+--------+-----------------+
|key|value |someData|avg |
+---+-------------------------------+--------+-----------------+
|1 |AAA |5 |5.0 |
|1 |VVV |6 |5.5 |
|1 |DDDD |8 |6.333333333333333|
|6 |AAAAABB |15 |15.0 |
|6 |C:\Windows\System32\svchost.exe|20 |17.5 |
|3 |rrerw |9 |9.0 |
|4 |RRRRR |13 |13.0 |
+---+-------------------------------+--------+-----------------+
WindowSpec windowSpec2 = Window.partitionBy(col("key"));
rawMapping.withColumn("avg", avg("someData").over(windowSpec2)).show(false);
+---+-------------------------------+--------+-----------------+
|key|value |someData|avg |
+---+-------------------------------+--------+-----------------+
|1 |AAA |5 |6.333333333333333|
|1 |VVV |6 |6.333333333333333|
|1 |DDDD |8 |6.333333333333333|
|6 |AAAAABB |15 |17.5 |
|6 |C:\Windows\System32\svchost.exe|20 |17.5 |
|3 |rrerw |9 |9.0 |
|4 |RRRRR |13 |13.0 |
+---+-------------------------------+--------+-----------------+
When the window is ordered, the aggregate function has a "sliding window" behavior. Why is this happening? And more importantly, is it a bug or a feature?
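Spark follows the SQL-standard default framing here: with an orderBy and no explicit frame, the frame is rangeBetween(unboundedPreceding, currentRow), i.e. a running aggregate; without orderBy the frame is the whole partition. A hedged PySpark sketch (rawMapping reused from the question, shown in Python for brevity) of keeping the ordering while averaging over the entire partition by declaring the frame explicitly:
from pyspark.sql import functions as F, Window as W
# ordered window, but with an explicit whole-partition frame
w_full = (W.partitionBy('key')
          .orderBy('someData')
          .rowsBetween(W.unboundedPreceding, W.unboundedFollowing))
rawMapping.withColumn('avg', F.avg('someData').over(w_full)).show(truncate=False)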

Pyspark : Cumulative Sum with reset condition

We have a dataframe like the one below:
+------+--------------------+
| Flag | value|
+------+--------------------+
|1 |5 |
|1 |4 |
|1 |3 |
|1 |5 |
|1 |6 |
|1 |4 |
|1 |7 |
|1 |5 |
|1 |2 |
|1 |3 |
|1 |2 |
|1 |6 |
|1 |9 |
+------+--------------------+
After a normal cumsum we get this:
+------+--------------------+----------+
| Flag | value|cumsum |
+------+--------------------+----------+
|1 |5 |5 |
|1 |4 |9 |
|1 |3 |12 |
|1 |5 |17 |
|1 |6 |23 |
|1 |4 |27 |
|1 |7 |34 |
|1 |5 |39 |
|1 |2 |41 |
|1 |3 |44 |
|1 |2 |46 |
|1 |6 |52 |
|1 |9 |61 |
+------+--------------------+----------+
Now what we want is for the cumsum to reset when a specific condition is met, for example when it crosses 20.
Below is the expected output:
+------+--------------------+----------+---------+
| Flag | value|cumsum |expected |
+------+--------------------+----------+---------+
|1 |5 |5 |5 |
|1 |4 |9 |9 |
|1 |3 |12 |12 |
|1 |5 |17 |17 |
|1 |6 |23 |23 |
|1 |4 |27 |4 | <-----reset
|1 |7 |34 |11 |
|1 |5 |39 |16 |
|1 |2 |41 |18 |
|1 |3 |44 |21 |
|1 |2 |46 |2 | <-----reset
|1 |6 |52 |8 |
|1 |9 |61 |17 |
+------+--------------------+----------+---------+
This is how we are calculating the cumulative sum.
win_counter = Window.partitionBy("flag")
df_partitioned = df_partitioned.withColumn('cumsum',F.sum(F.col('value')).over(win_counter))
There are two ways I've found to solve it without udf:
Dataframe
from pyspark.sql.window import Window
import pyspark.sql.functions as f
df = spark.createDataFrame([
    (1, 5), (1, 4), (1, 3), (1, 5), (1, 6), (1, 4),
    (1, 7), (1, 5), (1, 2), (1, 3), (1, 2), (1, 6), (1, 9)
], schema='Flag int, value int')
w = (Window
     .partitionBy('flag')
     .orderBy(f.monotonically_increasing_id())
     .rowsBetween(Window.unboundedPreceding, Window.currentRow))
# collect the running list of values, then fold it with a resettable accumulator
df = df.withColumn('values', f.collect_list('value').over(w))
expr = "AGGREGATE(values, 0, (acc, el) -> IF(acc < 20, acc + el, el))"
df = df.select('Flag', 'value', f.expr(expr).alias('cumsum'))
df.show(truncate=False)
RDD
df = spark.createDataFrame([
    (1, 5), (1, 4), (1, 3), (1, 5), (1, 6), (1, 4),
    (1, 7), (1, 5), (1, 2), (1, 3), (1, 2), (1, 6), (1, 9)
], schema='Flag int, value int')
def cumsum_by_flag(rows):
    cumsum, reset = 0, False
    for row in rows:
        if reset:
            cumsum = row.value
            reset = False
        else:
            cumsum += row.value
        reset = cumsum > 20
        yield row.value, cumsum
def unpack(value):
    flag = value[0]
    value, cumsum = value[1]
    return flag, value, cumsum
rdd = df.rdd.keyBy(lambda row: row.Flag)
rdd = (rdd
       .groupByKey()
       .flatMapValues(cumsum_by_flag)
       .map(unpack))
df = rdd.toDF('Flag int, value int, cumsum int')
df.show(truncate=False)
Output:
+----+-----+------+
|Flag|value|cumsum|
+----+-----+------+
|1 |5 |5 |
|1 |4 |9 |
|1 |3 |12 |
|1 |5 |17 |
|1 |6 |23 |
|1 |4 |4 |
|1 |7 |11 |
|1 |5 |16 |
|1 |2 |18 |
|1 |3 |21 |
|1 |2 |2 |
|1 |6 |8 |
|1 |9 |17 |
+----+-----+------+
It's probably best to do this with a pandas_udf here.
import math
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType
pdf = pd.DataFrame({'flag': [1]*13, 'id': range(13), 'value': [5, 4, 3, 5, 6, 4, 7, 5, 2, 3, 2, 6, 9]})
df = spark.createDataFrame(pdf)
# placeholder column so the output schema already contains 'cumsum'
df = df.withColumn('cumsum', F.lit(math.inf))
@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)
def _calc_cumsum(pdf):
    pdf.sort_values(by=['id'], inplace=True, ascending=True)
    cumsums = []
    prev = None
    reset = False
    for v in pdf['value'].values:
        if prev is None:
            cumsums.append(v)
            prev = v
        else:
            prev = prev + v if not reset else v
            cumsums.append(prev)
        reset = True if prev >= 20 else False
    pdf['cumsum'] = cumsums
    return pdf
df = df.groupby('flag').apply(_calc_cumsum)
df.show()
The results:
+----+---+-----+------+
|flag| id|value|cumsum|
+----+---+-----+------+
| 1| 0| 5| 5.0|
| 1| 1| 4| 9.0|
| 1| 2| 3| 12.0|
| 1| 3| 5| 17.0|
| 1| 4| 6| 23.0|
| 1| 5| 4| 4.0|
| 1| 6| 7| 11.0|
| 1| 7| 5| 16.0|
| 1| 8| 2| 18.0|
| 1| 9| 3| 21.0|
| 1| 10| 2| 2.0|
| 1| 11| 6| 8.0|
| 1| 12| 9| 17.0|
+----+---+-----+------+
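A side note, hedged: on Spark 3.x the GROUPED_MAP pandas_udf style is deprecated in favour of applyInPandas; the same function body can be reused if the decorator is dropped.
# assumes Spark 3.x, _calc_cumsum as a plain function (no decorator),
# and df still carrying the placeholder 'cumsum' column
result = df.groupby('flag').applyInPandas(_calc_cumsum, schema=df.schema)
result.show()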
