Let's suppose that we have a dataframe with the following schema
root
|-- AUTHOR_ID: integer (nullable = false)
|-- NAME: string (nullable = true)
|-- Books: array (nullable = false)
| |-- element: struct (containsNull = false)
| | |-- BOOK_ID: integer (nullable = false)
| | |-- Chapters: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- NAME: string (nullable = true)
| | | | |-- NUMBER_PAGES: integer (nullable = true)
As you can see, we have nested struct objects.
How can we compare two dataframes with the same schema and calculate or bring out the deltas (differences)?
Let's suppose that the following change has occurred:
the name of the first chapter of the book with id=1 was changed, so we can imagine the following comparison output:
{
"AUTHOR_ID": 1,
"Books": [
{
"BOOK_ID": 1,
"Chapters": [
{
"id": 1,
"NAME": {
"before": "Etranger",
"after": "L'étranger"
}
}
]
}
]
}
Note: we will show only the IDs and the changed values for the relevant items.
Here is some sample code to join by authorId and then compare.
from pyspark.sql.functions import collect_list, struct, col, udf
from operator import itemgetter
from pyspark.sql.types import ArrayType, StructType, StringType, StructField
# construct data
data = [('AA_1', 'S1', "10", "1", "Introduction to Quadratic Equation"),
('AA_1', 'S1', "10", "2", "Fundamentals"),
('AA_1', 'S1', "11", "1", "Preface"),
('AA_1', 'S1', "11", "2", "Wading in to the waters"),
('AA_2', 'S2', "100", "1", "Introduction"),
('AA_2', 'S2', "100", "2", "Fundamentals"),
('AA_2', 'S2', "110", "1", "Prologue"),
('AA_2', 'S2', "110", "2", "Epilogue"),
]
data2 = [('AA_1', 'S1', "10", "1", "Introduction to Linear Algebra"),
('AA_1', 'S1', "10", "2", "Fundamentals"),
('AA_1', 'S1', "11", "1", "Preface"),
('AA_1', 'S1', "11", "2", "Wading in to the waters"),
('AA_2', 'S2', "100", "1", "Introduction"),
('AA_2', 'S2', "100", "2", "Fundamentals2"),
('AA_2', 'S2', "110", "1", "Prologue"),
('AA_2', 'S2', "110", "2", "Epilogue"),
]
df = (spark.createDataFrame(data, ["authorId", "name", "bookId", "chapterId", "chapterName"])
      .groupBy(['authorId', 'name', 'bookId'])
      .agg(collect_list(struct("chapterId", "chapterName")).alias("chapters"))
      .groupBy(['authorId', 'name'])
      .agg(collect_list(struct('bookId', 'chapters')).alias('books')))
df2 = (spark.createDataFrame(data2, ["authorId", "name", "bookId", "chapterId", "chapterName"])
       .groupBy(['authorId', 'name', 'bookId'])
       .agg(collect_list(struct("chapterId", "chapterName")).alias("chapters"))
       .groupBy(['authorId', 'name'])
       .agg(collect_list(struct('bookId', 'chapters')).alias('books')))
df2 = df2.select(col('authorId').alias('authorId2'), col('name').alias('name2'), col('books').alias('books2'))
# join on authorId
df3 = df.join(df2, [df.authorId == df2.authorId2])
# UDF to compare; still needs additional checks on book/chapter lengths and null checks
@udf(ArrayType(StructType([
    StructField("bookId", StringType()),
    StructField("chapters", ArrayType(StructType([
        StructField("chapterId", StringType()),
        StructField("name", StructType([
            StructField("before", StringType()),
            StructField("after", StringType())
        ]))
    ])))
])))
def get_book_diff(b1, b2):
    if len(b1) != len(b2):
        return None
    b1.sort(key=itemgetter('bookId'))
    b2.sort(key=itemgetter('bookId'))
    list_data = []
    i = 0
    for book in b1:
        data = {}
        if book.bookId == b2[i].bookId:
            data['bookId'] = book.bookId
            book.chapters.sort(key=itemgetter('chapterId'))
            b2[i].chapters.sort(key=itemgetter('chapterId'))
            data['chapters'] = []
            j = 0
            for chap in book.chapters:
                if chap.chapterId == b2[i].chapters[j].chapterId:
                    if chap.chapterName != b2[i].chapters[j].chapterName:
                        data['chapters'].append({'chapterId': chap.chapterId, 'name': {"before": chap.chapterName, "after": b2[i].chapters[j].chapterName}})
                    j += 1
        i += 1
        list_data.append(data)
    return list_data
df3 = df3.withColumn('book_diff', get_book_diff('books', 'books2'))
#df3.select('authorId', 'book_diff').show(truncate=False)
display(df3.select('authorId', 'book_diff'))
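The UDF above assumes both sides have the same books and the same number of chapters in matching order. As its comment notes, it still needs null and length checks; here is a hedged sketch of a more defensive variant (the name get_book_diff_safe and the dict-based matching are my additions, not part of the original answer), which indexes both sides by bookId/chapterId instead of relying on sorted, equal-length lists:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

diff_schema = ArrayType(StructType([
    StructField("bookId", StringType()),
    StructField("chapters", ArrayType(StructType([
        StructField("chapterId", StringType()),
        StructField("name", StructType([
            StructField("before", StringType()),
            StructField("after", StringType())
        ]))
    ])))
]))

@udf(diff_schema)
def get_book_diff_safe(b1, b2):
    # guard against missing arrays on either side
    if not b1 or not b2:
        return None
    books2 = {b.bookId: b for b in b2}
    result = []
    for book in b1:
        other = books2.get(book.bookId)
        if other is None:
            continue  # book only exists on one side; report it as a deletion if needed
        chapters2 = {c.chapterId: c for c in (other.chapters or [])}
        changed = []
        for chap in (book.chapters or []):
            other_chap = chapters2.get(chap.chapterId)
            if other_chap is not None and chap.chapterName != other_chap.chapterName:
                changed.append({'chapterId': chap.chapterId,
                                'name': {'before': chap.chapterName,
                                         'after': other_chap.chapterName}})
        if changed:
            result.append({'bookId': book.bookId, 'chapters': changed})
    return result

df3.withColumn('book_diff', get_book_diff_safe('books', 'books2')).select('authorId', 'book_diff').show(truncate=False)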
I think the unfortunate requirement here is that we need to flatten the structs into columns to allow comparison.
import pyspark.sql.functions as F
columns = ["AUTHOR_ID","NAME","Books"] # lazy partial naming
#Original
data = [(1, "James,,Smith",[(1,[(1,"The beggining", 12, "It was a great day")])]), (2, "Stephen King", [(2,[(1,"The start", 12, "It was a great day")])])]
#Update
# Bookid 1 --> added a chapter, fixed a typo in the first chapter's name.
# Bookid 2 --> changed nothing in the chapter names (only the body text differs, which is not compared below).
data_after = [(1, "James,,Smith",[(1,[(1,"The begining", 12, "It was a great day"),(2,"The end", 1, "It was a great night")])]), (2, "Stephen King", [(2,[(1,"The start", 12, "It was an a great day")])])]
df = spark.createDataFrame(data=data,schema=columns)
df2 = spark.createDataFrame(data=data_after,schema=columns)
# flatten the structs into columns (could also have used withColumn)
df_flat = (df.select("*", F.posexplode(F.col("Books")).alias("pos", "Book"))
             .select("*", F.col("Book._1").alias("BookId"), F.posexplode(F.col("Book._2")).alias("pos", "Chapter"))
             .select("*", F.col("Chapter.*"), F.lit("Original").alias("source")))
df2_flat = (df2.select("*", F.posexplode(F.col("Books")).alias("pos", "Book"))
               .select("*", F.col("Book._1").alias("BookId"), F.posexplode(F.col("Book._2")).alias("pos", "Chapter"))
               .select("*", F.col("Chapter.*"), F.lit("Update").alias("source")))
#use a union to pull all data together
all = df_flat.union(df2_flat).withColumnRenamed("_1", "Chapter_id")\
.withColumnRenamed("_2", "text")
# Find the rows that don't have a match; these are the additions/updates/deletions
all.groupBy("AUTHOR_ID", "BookId", "Chapter_id", "text")\
   .agg(F.first("source"), F.count("text").alias("count"))\
   .where(F.col("count") != 2)\
   .show()
+---------+------+----------+-------------+--------------------+-----+
|AUTHOR_ID|BookId|Chapter_id| text|first(source, false)|count|
+---------+------+----------+-------------+--------------------+-----+
| 1| 1| 2| The end| Update| 1|
| 1| 1| 1| The begining| Update| 1|
| 1| 1| 1|The beggining| Original| 1|
+---------+------+----------+-------------+--------------------+-----+
From here you need to do a little more work: think one more groupBy down to the Author/BookId/Chapter_id level, count the Chapter_id, then when/otherwise logic on source, as sketched after this list.
If a chapter exists in both the Update and the Original, it's an edit (count of 2).
If it only exists in the Update, it's an addition (count of 1).
If it only exists in the Original, it's a deletion (count of 1).
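Here is a minimal sketch of that follow-up step, reusing the all dataframe from above; the column aliases (versions, change_type) are mine and not part of the original answer:
# classify each mismatched chapter as edit / addition / deletion
changes = (all.groupBy("AUTHOR_ID", "BookId", "Chapter_id", "text")
              .agg(F.first("source").alias("source"), F.count("text").alias("count"))
              .where(F.col("count") != 2)
              .groupBy("AUTHOR_ID", "BookId", "Chapter_id")
              .agg(F.count("Chapter_id").alias("versions"), F.first("source").alias("source"))
              .withColumn("change_type",
                          F.when(F.col("versions") == 2, "edit")
                           .when(F.col("source") == "Update", "addition")
                           .otherwise("deletion")))
changes.show()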
Building your struct back up from here is up to you, but I think this demonstrates the idea of what's required. Using the page number of the chapter might actually be a good way to detect change; it's certainly cheaper than comparing strings, though likely not as accurate.
Give this a try:
from gresearch.spark.diff import *
left.diff(right)
See https://github.com/G-Research/spark-extension/blob/master/DIFF.md
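Assuming the package is installed (for example via spark-submit --packages or pip, as described in the linked docs), a minimal hedged sketch against two dataframes keyed on authorId might look like the following; check DIFF.md for the exact options, since the details here are from memory rather than from this question:
# requires the G-Research spark-extension package to be available to PySpark
from gresearch.spark.diff import *

# left and right are the two dataframes to compare, joined on the id column(s);
# the result carries a "diff" column with values like N (no change), C (changed),
# I (inserted) and D (deleted), plus the left/right values of the remaining columns.
left.diff(right, "authorId").show(truncate=False)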
I want to modify/filter on a property inside a struct.
Let's say I have a dataframe with the following column:
#+------------------------------------------+
#| arrayCol |
#+------------------------------------------+
#| {"a" : "some_value", "b" : [1, 2, 3]} |
#+------------------------------------------+
Schema:
struct<a:string, b:array<int>>
I want to filter out some values in the 'b' property when the value inside the array == 1.
The desired result is the following:
#+------------------------------------------+
#| arrayCol |
#+------------------------------------------+
#| {"a" : "some_value", "b" : [2, 3]} |
#+------------------------------------------+
Is it possible to do this without extracting the property, filtering the values, and rebuilding another struct?
Update:
For Spark 3.1+, withField can be used to update the struct column without having to recreate the whole struct. In your case, you can update the field b using the filter function to filter the array values like this:
import pyspark.sql.functions as F
df1 = df.withColumn(
'arrayCol',
F.col('arrayCol').withField('b', F.filter(F.col("arrayCol.b"), lambda x: x != 1))
)
df1.show()
#+--------------------+
#| arrayCol|
#+--------------------+
#|{some_value, [2, 3]}|
#+--------------------+
For older versions, Spark doesn’t support adding/updating fields in nested structures. To update a struct column, you'll need to create a new struct using the existing fields and the updated ones:
import pyspark.sql.functions as F
df1 = df.withColumn(
"arrayCol",
F.struct(
F.col("arrayCol.a").alias("a"),
F.expr("filter(arrayCol.b, x -> x != 1)").alias("b")
)
)
One way would be to define a UDF:
Example:
import ast
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, MapType
def remove_value(col):
    col["b"] = str([x for x in ast.literal_eval(col["b"]) if x != 1])
    return col


if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [
            {
                "arrayCol": {
                    "a": "some_value",
                    "b": "[1, 2, 3]",
                },
            },
        ]
    )
    remove_value_udf = spark.udf.register(
        "remove_value_udf", remove_value, MapType(StringType(), StringType())
    )
    df = df.withColumn(
        "result",
        remove_value_udf(F.col("arrayCol")),
    )
Result:
root
|-- arrayCol: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- result: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
+---------------------------------+------------------------------+
|arrayCol |result |
+---------------------------------+------------------------------+
|{a -> some_value, b -> [1, 2, 3]}|{a -> some_value, b -> [2, 3]}|
+---------------------------------+------------------------------+
I have two DataFrames called DF1 and DF2; the content of each DataFrame is as follows:
df1:
line_item_usage_account_id line_item_unblended_cost name
100000000001 12.05 account1
200000000001 52 account2
300000000003 12.03 account3
df2:
accountname accountproviderid clustername app_pmo app_costcenter
account1 100000000001 cluster1 111111 11111111
account2 200000000001 cluster2 222222 22222222
I need to join on the fields df1.line_item_usage_account_id and df2.accountproviderid.
When both fields have the same ID, the value of the DF1 line_item_unblended_cost column must be added.
And when the line_item_usage_account_id value from DF1 is not in the accountproviderid column of DF2, the DF1 fields must still be appended, as follows:
accountname accountproviderid clustername app_pmo app_costcenter line_item_unblended_cost
account1 100000000001 cluster1 111111 11111111 12.05
account2 200000000001 cluster2 222222 22222222 52
account3 300000000003 NA NA NA 12.03
The account3 data was added at the end of the new DataFrame, filling the DF2 columns with NA.
Any help is appreciated; thanks in advance.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df1 = spark.createDataFrame([
[100000000001, 12.05, 'account1'],
[200000000001, 52.00, 'account2'],
[300000000003, 12.03, 'account3']],
schema=['line_item_usage_account_id', 'line_item_unblended_cost', 'name' ])
df1.show()
df1.printSchema()
df2 = spark.createDataFrame([
['account1', 100000000001, 'cluster1', 111111, 11111111],
['account2', 200000000001, 'cluster2', 222222, 22222222]],
schema=['accountname', 'accountproviderid', 'clustername', 'app_pmo', 'app_costcenter'])
df2.printSchema()
df2.show()
cols = ['name', 'line_item_usage_account_id', 'clustername', 'app_pmo', 'app_costcenter', 'line_item_unblended_cost']
resDF = (df1.join(df2, df1.line_item_usage_account_id == df2.accountproviderid, "leftouter")
            .select(*cols)
            .withColumnRenamed('name', 'accountname')
            .withColumnRenamed('line_item_usage_account_id', 'accountproviderid')
            .orderBy('accountname'))
resDF.printSchema()
# |-- accountname: string (nullable = true)
# |-- accountproviderid: long (nullable = true)
# |-- clustername: string (nullable = true)
# |-- app_pmo: long (nullable = true)
# |-- app_costcenter: long (nullable = true)
# |-- line_item_unblended_cost: double (nullable = true)
resDF.show()
# +-----------+-----------------+-----------+-------+--------------+------------------------+
# |accountname|accountproviderid|clustername|app_pmo|app_costcenter|line_item_unblended_cost|
# +-----------+-----------------+-----------+-------+--------------+------------------------+
# | account1| 100000000001| cluster1| 111111| 11111111| 12.05|
# | account2| 200000000001| cluster2| 222222| 22222222| 52.0|
# | account3| 300000000003| null| null| null| 12.03|
# +-----------+-----------------+-----------+-------+--------------+------------------------+
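The leftouter join leaves nulls where DF2 has no match. If the literal "NA" from the desired output is needed instead of nulls, one hedged option (my own follow-up, not part of the original answer) is to cast the numeric DF2 columns to string and then fill:
from pyspark.sql import functions as F

# cast the long columns to string so a string fill can apply to them,
# then replace the nulls produced by the left outer join with "NA"
resNA = (resDF
         .withColumn("app_pmo", F.col("app_pmo").cast("string"))
         .withColumn("app_costcenter", F.col("app_costcenter").cast("string"))
         .fillna("NA", subset=["clustername", "app_pmo", "app_costcenter"]))
resNA.show()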
I have a dataframe with two columns(one string and one array of string):
root
|-- user: string (nullable = true)
|-- users: array (nullable = true)
| |-- element: string (containsNull = true)
How can I filter the dataframe so that the resulting dataframe only contains rows where user is in users?
Quick and simple:
import org.apache.spark.sql.functions.expr
df.where(expr("array_contains(users, user)"))
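Since most of the surrounding examples on this page are PySpark, a hedged Python equivalent of the same column-to-column check would be:
import pyspark.sql.functions as F

# keep only the rows whose users array contains the value of the user column
df.where(F.expr("array_contains(users, user)")).show()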
Sure, it's possible and not so hard. To achieve this you may use a UDF:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
val df = sc.parallelize(Array(
("1", Array("1", "2", "3")),
("2", Array("1", "2", "2", "3")),
("3", Array("1", "2"))
)).toDF("user", "users")
val inArray = udf((id: String, array: scala.collection.mutable.WrappedArray[String]) => array.contains(id), BooleanType)
df.where(inArray($"user", $"users")).show()
The output is:
+----+------------+
|user| users|
+----+------------+
| 1| [1, 2, 3]|
| 2|[1, 2, 2, 3]|
+----+------------+
According to the accepted answer in pyspark collect_set or collect_list with groupby, when you do a collect_list on a certain column, the null values in this column are removed. I have checked and this is true.
But in my case, I need to keep the null columns -- how can I achieve this?
I did not find any info on this kind of variant of the collect_list function.
Background context to explain why I want nulls:
I have a dataframe df as below:
cId | eId | amount | city
1 | 2 | 20.0 | Paris
1 | 2 | 30.0 | Seoul
1 | 3 | 10.0 | Phoenix
1 | 3 | 5.0 | null
I want to write this to an Elasticsearch index with the following mapping:
"mappings": {
"doc": {
"properties": {
"eId": { "type": "keyword" },
"cId": { "type": "keyword" },
"transactions": {
"type": "nested",
"properties": {
"amount": { "type": "keyword" },
"city": { "type": "keyword" }
}
}
}
}
}
In order to conform to the nested mapping above, I transformed my df so that for each combination of eId and cId, I have an array of transactions like this:
df_nested = df.groupBy('eId','cId').agg(collect_list(struct('amount','city')).alias("transactions"))
df_nested.printSchema()
root
|-- cId: integer (nullable = true)
|-- eId: integer (nullable = true)
|-- transactions: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- amount: float (nullable = true)
| | |-- city: string (nullable = true)
Saving df_nested as a JSON file, these are the JSON records that I get:
{"cId":1,"eId":2,"transactions":[{"amount":20.0,"city":"Paris"},{"amount":30.0,"city":"Seoul"}]}
{"cId":1,"eId":3,"transactions":[{"amount":10.0,"city":"Phoenix"},{"amount":5.0}]}
As you can see, when cId=1 and eId=3, one of my array elements where amount=5.0 does not have the city attribute, because this was a null in my original data (df). The nulls are being removed when I use the collect_list function.
However, when I try writing df_nested to Elasticsearch with the above index, it errors out because of a schema mismatch. This is basically the reason why I want to retain my nulls after applying the collect_list function.
from pyspark.sql.functions import create_map, collect_list, lit, col, to_json, from_json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, SparkSession, types, Row
from pyspark.sql import functions as f
import os
app_name = "CollList"
conf = SparkConf().setAppName(app_name)
spark = SparkSession.builder.appName(app_name).config(conf=conf).enableHiveSupport().getOrCreate()
df = spark.createDataFrame([[1, 2, 20.0, "Paris"], [1, 2, 30.0, "Seoul"],
[1, 3, 10.0, "Phoenix"], [1, 3, 5.0, None]],
["cId", "eId", "amount", "city"])
print("Actual data")
df.show(10,False)
```
Actual data
+---+---+------+-------+
|cId|eId|amount|city |
+---+---+------+-------+
|1 |2 |20.0 |Paris |
|1 |2 |30.0 |Seoul |
|1 |3 |10.0 |Phoenix|
|1 |3 |5.0 |null |
+---+---+------+-------+
```
#collect_list that skips null columns
df1 = df.groupBy(f.col('city'))\
    .agg(f.collect_list(f.to_json(f.struct([f.col(x).alias(x) for x in (c for c in df.columns if c != 'cId' and c != 'eId')]))).alias('newcol'))
print("Collect List Data - Missing Null Columns in the list")
df1.show(10, False)
```
Collect List Data - Missing Null Columns in the list
+-------+----------------------------------+
|city   |newcol                            |
+-------+----------------------------------+
|Phoenix|[{"amount":10.0,"city":"Phoenix"}]|
|null   |[{"amount":5.0}]                  |
|Paris  |[{"amount":20.0,"city":"Paris"}]  |
|Seoul  |[{"amount":30.0,"city":"Seoul"}]  |
+-------+----------------------------------+
```
my_list = []
for x in (c for c in df.columns if c != 'cId' and c != 'eId'):
    my_list.append(lit(x))
    my_list.append(col(x))
grp_by = ["eId", "cId"]
df_nested = df.withColumn("transactions", create_map(my_list))\
    .groupBy(grp_by)\
    .agg(collect_list(f.to_json("transactions")).alias("transactions"))
print("collect list after create_map")
df_nested.show(10,False)
```
collect list after create_map
+---+---+--------------------------------------------------------------------+
|eId|cId|transactions |
+---+---+--------------------------------------------------------------------+
|2 |1 |[{"amount":"20.0","city":"Paris"}, {"amount":"30.0","city":"Seoul"}]|
|3 |1 |[{"amount":"10.0","city":"Phoenix"}, {"amount":"5.0","city":null}] |
+---+---+--------------------------------------------------------------------+
```