Iterating through rows to create custom formula structure in PySpark - apache-spark

I have a dataframe with variable names and numerator and denominator.
Each variable is a ratio, eg below:
And another dataset with actual data to compute the attributes:
Goal is to create these attributes with formulas in 1st and compute with 2nd.
Currently my approach is very naive:
df = df.withColumn("var1", col('a')/col('b'))./
.
.
.
Desired Output:
Since I have >500 variables, any suggestions for a smarter way to get around this are welcome!

This can be achieved with a cross join, an unpivot, and a pivot in PySpark.
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Formula definitions: one row per derived variable (name = numerator / denominator).
data = [
    ("var1", "a", "c"),
    ("var2", "b", "d"),
    ("var3", "b", "a"),
    ("var4", "d", "c"),
]
# NOTE: the third column is spelled 'denonminator'; the SQL query below uses
# the same spelling, so it is deliberately kept unchanged.
schema = StructType([
    StructField('name', StringType(), True),
    StructField('numerator', StringType(), True),
    StructField('denonminator', StringType(), True),
])

# Actual measurements: one row per ID, one integer column per base attribute.
data2 = [
    ("ID1", 6, 4, 3, 7),
    ("ID2", 1, 2, 3, 9),
]
schema2 = StructType([
    StructField('ID', StringType(), True),
    StructField('a', IntegerType(), True),
    StructField('b', IntegerType(), True),
    StructField('c', IntegerType(), True),
    StructField('d', IntegerType(), True),
])

df = spark.createDataFrame(data=data, schema=schema)
df2 = spark.createDataFrame(data=data2, schema=schema2)

# Register each frame once so it can be referenced from Spark SQL.
df.createOrReplaceTempView("table1")
df2.createOrReplaceTempView("table2")

# Cross join: pair every formula with every ID.
df3 = spark.sql("select * from table1 cross join table2")
df3.createOrReplaceTempView("table3")

# Unpivot: turn every attribute column of df2 (all columns after ID) into
# (col0 = attribute name, col1 = attribute value) rows via stack().
cols = df2.columns[1:]
stack_args = ', '.join("'{}', {}".format(c, c) for c in cols)
df4 = df2.selectExpr('ID', "stack({}, {})".format(len(cols), stack_args))
df4.createOrReplaceTempView("table4")

# Look up the numerator (alias B) and denominator (alias C) values for each
# (formula, ID) pair and compute the rounded ratio.
df5 = spark.sql("select name,B.ID,round(B.col1/C.col1,2) as value from table3 A left outer join table4 B on A.ID=B.ID and a.numerator=b.col0 left outer join table4 C on A.ID=C.ID and a.denonminator=C.col0 order by name,ID")

# Pivot: one row per ID, one column per derived variable. Each (ID, name)
# pair holds a single value, so max() just selects it.
df_final = df5.groupBy("ID").pivot("name").max("value")
The results of all intermediate and final dataframes
>>> df.show()
+----+---------+------------+
|name|numerator|denonminator|
+----+---------+------------+
|var1| a| c|
|var2| b| d|
|var3| b| a|
|var4| d| c|
+----+---------+------------+
>>> df2.show()
+---+---+---+---+---+
| ID| a| b| c| d|
+---+---+---+---+---+
|ID1| 6| 4| 3| 7|
|ID2| 1| 2| 3| 9|
+---+---+---+---+---+
>>> df3.show()
+----+---------+------------+---+---+---+---+---+
|name|numerator|denonminator| ID| a| b| c| d|
+----+---------+------------+---+---+---+---+---+
|var1| a| c|ID1| 6| 4| 3| 7|
|var2| b| d|ID1| 6| 4| 3| 7|
|var1| a| c|ID2| 1| 2| 3| 9|
|var2| b| d|ID2| 1| 2| 3| 9|
|var3| b| a|ID1| 6| 4| 3| 7|
|var4| d| c|ID1| 6| 4| 3| 7|
|var3| b| a|ID2| 1| 2| 3| 9|
|var4| d| c|ID2| 1| 2| 3| 9|
+----+---------+------------+---+---+---+---+---+
>>> df4.show()
+---+----+----+
| ID|col0|col1|
+---+----+----+
|ID1| a| 6|
|ID1| b| 4|
|ID1| c| 3|
|ID1| d| 7|
|ID2| a| 1|
|ID2| b| 2|
|ID2| c| 3|
|ID2| d| 9|
+---+----+----+
>>> df5.show()
+----+---+-----+
|name| ID|value|
+----+---+-----+
|var1|ID1| 2.0|
|var1|ID2| 0.33|
|var2|ID1| 0.57|
|var2|ID2| 0.22|
|var3|ID1| 0.67|
|var3|ID2| 2.0|
|var4|ID1| 2.33|
|var4|ID2| 3.0|
+----+---+-----+
>>> df_final.show() final
+---+----+----+----+----+
| ID|var1|var2|var3|var4|
+---+----+----+----+----+
|ID2|0.33|0.22| 2.0| 3.0|
|ID1| 2.0|0.57|0.67|2.33|
+---+----+----+----+----+

Related

Check if a column is consecutive with groupby in pyspark

I have a pyspark dataframe that looks like this:
import pandas as pd
foo = pd.DataFrame({'group': ['a','a','a','b','b','c','c','c'], 'value': [1,2,3,4,5,2,4,5]})
I would like to create a new binary column is_consecutive that indicates if the values in the value column are consecutive by group.
The output should look like this:
foo = pd.DataFrame({'group': ['a','a','a','b','b','c','c','c'], 'value': [1,2,3,4,5,2,4,5],
'is_consecutive': [1,1,1,1,1,0,0,0]})
How could I do that in pyspark?
You can use lag to compare values with the previous row and check if they are consecutive, then use min to determine whether all rows are consecutive in a given group.
from pyspark.sql import functions as F, Window
# Window used to look at the previous value inside each group.
ordered_by_value = Window.partitionBy('group').orderBy('value')
# Window covering the whole group, used for the final aggregation.
whole_group = Window.partitionBy('group')

# True when the value is exactly one more than the previous one; null on the
# first row of each group because lag() has nothing to return there.
step_is_one = F.col('value') - F.lag('value').over(ordered_by_value) == 1

df2 = (
    df
    # The first row of a group counts as consecutive (null -> True -> 1).
    .withColumn('consecutive', F.coalesce(step_is_one, F.lit(True)).cast('int'))
    # A group is consecutive only if every one of its rows is.
    .withColumn('all_consecutive', F.min('consecutive').over(whole_group))
)
df2.show()
+-----+-----+-----------+---------------+
|group|value|consecutive|all_consecutive|
+-----+-----+-----------+---------------+
| c| 2| 1| 0|
| c| 4| 0| 0|
| c| 5| 1| 0|
| b| 4| 1| 1|
| b| 5| 1| 1|
| a| 1| 1| 1|
| a| 2| 1| 1|
| a| 3| 1| 1|
+-----+-----+-----------+---------------+
You can use lead to subtract the current value from the next one, then take the max of that difference over the group's window; once done, add a condition that returns 0 if the max is > 1 and 1 otherwise.
# Ordered window: used only to pair each row with the next row of its group.
w = Window.partitionBy("group").orderBy(F.monotonically_increasing_id())
# Unordered window: the max must be taken over the WHOLE group. With an
# ordered window the default frame ends at the current row, so the max would
# be a running max and rows that precede the first gap would be flagged 1.
group_w = Window.partitionBy("group")
(foo.withColumn("Diff", F.lead("value").over(w) - F.col("value"))
    .withColumn("is_consecutive", F.when(F.max("Diff").over(group_w) > 1, 0).otherwise(1))
    .drop("Diff")).show()
+-----+-----+--------------+
|group|value|is_consecutive|
+-----+-----+--------------+
| a| 1| 1|
| a| 2| 1|
| a| 3| 1|
| b| 4| 1|
| b| 5| 1|
| c| 2| 0|
| c| 4| 0|
| c| 5| 0|
+-----+-----+--------------+

Pyspark adding a column of repeating values from a list

I have a pyspark dataframe and want to add a column that adds values from a list in a repeating fashion. If this were just python, I would probably use itertools' cycle function. I don't know how to do this in pyspark.
names = ['Julia', 'Tim', 'Zoe']
My dataframe looks like this:
+-----+------+
| id_A| idx_B|
+-----+------+
| a| 0|
| b| 0|
| b| 2|
| b| 2|
| b| 2|
| b| 2|
+-----+------+
I want it to look like this:
+-----+------+--------+
| id_A| idx_B| names |
+-----+------+--------+
| a| 0| Julia|
| b| 0| Tim|
| b| 2| Zoe|
| b| 2| Julia|
| b| 2| Tim|
| b| 2| Zoe|
+-----+------+--------+
Here's one way.
1 - add a unique incremental id for your dataframe:
from pyspark.sql import Row

# zipWithIndex pairs every row with its position; the position is appended
# to the row's own values and exposed as a new "id" column. The existing
# column names are reused instead of being hard-coded.
df = spark.createDataFrame(
    df.rdd.zipWithIndex().map(lambda pair: Row(*pair[0], pair[1]))
).toDF(*df.columns, "id")
df.show()
#+----+-----+---+
#|id_A|idx_B| id|
#+----+-----+---+
#| a| 0| 0|
#| b| 0| 1|
#| b| 2| 2|
#| b| 2| 3|
#| b| 2| 4|
#| b| 2| 5|
#+----+-----+---+
2 - create dataframe from the list of names:
# One row per name, keyed by its position in the list.
indexed_names = list(enumerate(names))
names_df = spark.createDataFrame(indexed_names, ["name_id", "names"])
3 - join using modulo 3 (length of names list) in condition:
from pyspark.sql import functions as F
# Match each row to the name whose position equals the row id modulo the
# number of names, which cycles through the list. Using len(names) instead
# of a literal keeps the join correct when the list changes size.
result = df.join(
    names_df,
    F.col("id") % len(names) == F.col("name_id")
).orderBy("id").drop("id", "name_id")
result.show()
#+----+-----+-----+
#|id_A|idx_B|names|
#+----+-----+-----+
#| a| 0|Julia|
#| b| 0| Tim|
#| b| 2| Zoe|
#| b| 2|Julia|
#| b| 2| Tim|
#| b| 2| Zoe|
#+----+-----+-----+

Drop function doesn't work properly after joining same columns of Dataframe

I am facing this same issue while joining two Data frame A, B.
For ex:
c = df_a.join(df_b, [df_a.col1 == df_b.col1], how="left").drop(df_b.col1)
When I try to drop the duplicate column as shown above, the query doesn't drop col1 of df_b. However, when I try to drop col1 of df_a instead, it does drop col1 of df_a.
Could anyone please explain this?
Note: I tried the same in my project which has more than 200 columns and shows the same problem. Sometimes this drop function works properly if we have few columns but not if we have more columns.
Drop function not working after left outer join in pyspark
Here is a function to drop duplicate columns after a merge:
def dropDupeDfCols(df):
    """Return ``df`` keeping only the first occurrence of each column name.

    The frame is temporarily renamed to positional names ("0", "1", ...) so
    that columns sharing a name can be addressed individually, the later
    occurrences are dropped, and the surviving columns get their original
    names back.

    Args:
        df: A DataFrame-like object exposing ``columns``, ``toDF(*names)``
            and ``drop(name)``.

    Returns:
        A frame with the duplicate-named columns removed, in the original
        column order.
    """
    columns = df.columns
    seen = set()
    newcols = []   # unique names, in first-seen order
    dupcols = []   # positions of repeated names
    for position, column in enumerate(columns):
        if column in seen:
            dupcols.append(position)
        else:
            seen.add(column)
            newcols.append(column)

    # Positional names are unique, so each duplicate can be dropped by itself.
    df = df.toDF(*[str(position) for position in range(len(columns))])
    for dupcol in dupcols:
        df = df.drop(str(dupcol))
    return df.toDF(*newcols)
There are some similar issues I faced recently. Let me show them below with your case.
I am creating two dataframes with the same data
scala> val df_a = Seq((1, 2, "as"), (2,3,"ds"), (3,4,"ew"), (4, 1, "re"), (3,1,"ht")).toDF("a", "b", "c")
df_a: org.apache.spark.sql.DataFrame = [a: int, b: int ... 1 more field]
scala> val df_b = Seq((1, 2, "as"), (2,3,"ds"), (3,4,"ew"), (4, 1, "re"), (3,1,"ht")).toDF("a", "b", "c")
df_b: org.apache.spark.sql.DataFrame = [a: int, b: int ... 1 more field]
Joining them
scala> val df = df_a.join(df_b, df_a("b") === df_b("a"), "leftouter")
df: org.apache.spark.sql.DataFrame = [a: int, b: int ... 4 more fields]
scala> df.show
+---+---+---+---+---+---+
| a| b| c| a| b| c|
+---+---+---+---+---+---+
| 1| 2| as| 2| 3| ds|
| 2| 3| ds| 3| 1| ht|
| 2| 3| ds| 3| 4| ew|
| 3| 4| ew| 4| 1| re|
| 4| 1| re| 1| 2| as|
| 3| 1| ht| 1| 2| as|
+---+---+---+---+---+---+
Let's drop a column that is not present in the above dataframe (for example, `df.drop("z").show`); the output is unchanged:
+---+---+---+---+---+---+
| a| b| c| a| b| c|
+---+---+---+---+---+---+
| 1| 2| as| 2| 3| ds|
| 2| 3| ds| 3| 1| ht|
| 2| 3| ds| 3| 4| ew|
| 3| 4| ew| 4| 1| re|
| 4| 1| re| 1| 2| as|
| 3| 1| ht| 1| 2| as|
+---+---+---+---+---+---+
Ideally we will expect spark to throw an error, but it executes successfully.
Now, if you drop a column from the above dataframe
scala> df.drop("a").show
+---+---+---+---+
| b| c| b| c|
+---+---+---+---+
| 2| as| 3| ds|
| 3| ds| 1| ht|
| 3| ds| 4| ew|
| 4| ew| 1| re|
| 1| re| 2| as|
| 1| ht| 2| as|
+---+---+---+---+
It drops all the columns with provided column name in the input dataframe.
If you want to drop specific columns, it should be done as below:
scala> df.drop(df_a("a")).show()
+---+---+---+---+---+
| b| c| a| b| c|
+---+---+---+---+---+
| 2| as| 2| 3| ds|
| 3| ds| 3| 1| ht|
| 3| ds| 3| 4| ew|
| 4| ew| 4| 1| re|
| 1| re| 1| 2| as|
| 1| ht| 1| 2| as|
+---+---+---+---+---+
I don't think Spark accepts the input as given by you (see below):
scala> df.drop(df_a.a).show()
<console>:30: error: value a is not a member of org.apache.spark.sql.DataFrame
df.drop(df_a.a).show()
^
scala> df.drop(df_a."a").show()
<console>:1: error: identifier expected but string literal found.
df.drop(df_a."a").show()
^
If you provide the input to drop, as below, it executes but will have no impact
scala> df.drop("df_a.a").show
+---+---+---+---+---+---+
| a| b| c| a| b| c|
+---+---+---+---+---+---+
| 1| 2| as| 2| 3| ds|
| 2| 3| ds| 3| 1| ht|
| 2| 3| ds| 3| 4| ew|
| 3| 4| ew| 4| 1| re|
| 4| 1| re| 1| 2| as|
| 3| 1| ht| 1| 2| as|
+---+---+---+---+---+---+
The reason being, spark interprets "df_a.a" as a nested column. As that column is not present ideally it should have thrown error, but as explained above, it just executes.
Hope this helps..!!!

Replacing all column values using Window operation?

Hi Data frame created like below.
# Sample (id, t) pairs; the pair (1, 3) appears twice.
rows = [(1, 3), (2, 3), (3, 2), (4, 2), (1, 3)]
df = sc.parallelize(rows).toDF(["id", 't'])
it shows like below.
+---+---+
| id| t|
+---+---+
| 1| 3|
| 2| 3|
| 3| 2|
| 4| 2|
| 1| 3|
+---+---+
My main aim is to replace each value in every column with the number of times that value is repeated in that column.
I have tried the following code, but it is not working as expected.
from pyspark.sql.functions import col
column_list = ["id",'t']
# The window is partitioned by BOTH columns at once, so every count below is
# taken over the rows that share the same (id, t) pair.
w = Window.partitionBy(column_list)
# One windowed count per column, aliased back to the original column name.
# NOTE(review): Window and count are not imported in this snippet —
# presumably imported elsewhere; confirm.
dfmax=df.select(*((count(col(c)).over(w)).alias(c) for c in df.columns))
dfmax.show()
+---+---+
| id| t|
+---+---+
| 2| 2|
| 2| 2|
| 1| 1|
| 1| 1|
| 1| 1|
+---+---+
my expected output will be
+---+---+
| id| t|
+---+---+
| 2| 3|
| 1| 3|
| 1| 1|
| 1| 1|
| 2| 3|
+---+---+
If I understand you correctly, what you're looking for is simply:
# For each column, count the rows that share that column's value; the window
# is partitioned by one column at a time.
per_column_counts = [
    count(c).over(Window.partitionBy(c)).alias(c)
    for c in df.columns
]
df.select(*per_column_counts).show()
#+---+---+
#| id| t|
#+---+---+
#| 2| 3|
#| 2| 3|
#| 1| 2|
#| 1| 3|
#| 1| 2|
#+---+---+
The difference between this and what you posted is that we only partition by one column at a time.
Remember that DataFrames are unordered. If you wanted to maintain your row order, you could add an ordering column using pyspark.sql.functions.monotonically_increasing_id():
from pyspark.sql.functions import monotonically_increasing_id
# Per-column occurrence counts, built from the original column list.
count_exprs = [count(c).over(Window.partitionBy(c)).alias(c) for c in df.columns]
# Tag rows with an increasing id, compute the counts, sort on the tag, and
# remove the tag before printing.
(
    df.withColumn("order", monotonically_increasing_id())
    .select(*count_exprs)
    .sort("order")
    .drop("order")
    .show()
)
#+---+---+
#| id| t|
#+---+---+
#| 2| 3|
#| 1| 3|
#| 1| 2|
#| 1| 2|
#| 2| 3|
#+---+---+

Difference in dense rank and row number in spark

I tried to understand the difference between dense rank and row number. In each new window partition, both start from 1. Doesn't the rank of a row always start from 1? Any help would be appreciated.
The difference is when there are "ties" in the ordering column. Check the example below:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
// Sample data: two rows tie on col2 = 10 inside the single partition col1 = "a".
val df = Seq(("a", 10), ("a", 10), ("a", 20)).toDF("col1", "col2")
// Rows are ranked within each col1 partition, ordered by col2.
val windowSpec = Window.partitionBy("col1").orderBy("col2")
// Add the three ranking columns side by side and print the result.
df
.withColumn("rank", rank().over(windowSpec))
.withColumn("dense_rank", dense_rank().over(windowSpec))
.withColumn("row_number", row_number().over(windowSpec)).show
+----+----+----+----------+----------+
|col1|col2|rank|dense_rank|row_number|
+----+----+----+----------+----------+
| a| 10| 1| 1| 1|
| a| 10| 1| 1| 2|
| a| 20| 3| 2| 3|
+----+----+----+----------+----------+
Note that the value "10" exists twice in col2 within the same window (col1 = "a"). That's when you see a difference between the three functions.
I'm showing @Daniel's answer in Python, and I'm adding a comparison with count('*'), which can be used if you want to get at most the top n rows per group.
from pyspark.sql.session import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions as F
spark = SparkSession.builder.getOrCreate()

# A single partition 'a' whose ordering column contains ties (40 x4, 50 x2).
order_values = [10, 20, 30, 40, 40, 40, 40, 50, 50, 60]
df = spark.createDataFrame(
    [['a', value] for value in order_values],
    ['part_col', 'order_col'],
)

window = Window.partitionBy("part_col").orderBy("order_col")

# Add the four window columns in a fixed order so they can be compared
# side by side.
window_columns = [
    ("rank", F.rank()),
    ("dense_rank", F.dense_rank()),
    ("row_number", F.row_number()),
    ("count", F.count('*')),
]
for column_name, expression in window_columns:
    df = df.withColumn(column_name, expression.over(window))
df.show()
+--------+---------+----+----------+----------+-----+
|part_col|order_col|rank|dense_rank|row_number|count|
+--------+---------+----+----------+----------+-----+
| a| 10| 1| 1| 1| 1|
| a| 20| 2| 2| 2| 2|
| a| 30| 3| 3| 3| 3|
| a| 40| 4| 4| 4| 7|
| a| 40| 4| 4| 5| 7|
| a| 40| 4| 4| 6| 7|
| a| 40| 4| 4| 7| 7|
| a| 50| 8| 5| 8| 9|
| a| 50| 8| 5| 9| 9|
| a| 60| 10| 6| 10| 10|
+--------+---------+----+----------+----------+-----+
For example if you want to take at most 4 without randomly picking one of the 4 "40" of the sorting column:
df.where("count <= 4").show()
+--------+---------+----+----------+----------+-----+
|part_col|order_col|rank|dense_rank|row_number|count|
+--------+---------+----+----------+----------+-----+
| a| 10| 1| 1| 1| 1|
| a| 20| 2| 2| 2| 2|
| a| 30| 3| 3| 3| 3|
+--------+---------+----+----------+----------+-----+
In summary, if you filter <= n those columns you will get:
rank at least n rows
dense_rank at least n different order_col values
row_number exactly n rows
count at most n rows

Resources