Compare and check out differences between two dataframes using PySpark - apache-spark

Let's suppose that we have a dataframe with the following schema
root
|-- AUTHOR_ID: integer (nullable = false)
|-- NAME: string (nullable = true)
|-- Books: array (nullable = false)
| |-- element: struct (containsNull = false)
| | |-- BOOK_ID: integer (nullable = false)
| | |-- Chapters: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- NAME: string (nullable = true)
| | | | |-- NUMBER_PAGES: integer (nullable = true)
As you can see, we have nested struct objects.
How can we compare two dataframes that share this schema and calculate or bring out the deltas (differences)?
Let's suppose that the following change occurred:
the name of the first chapter of the book with id=1 was changed, so we can imagine the following comparison output:
{
  "AUTHOR_ID": 1,
  "Books": [
    {
      "BOOK_ID": 1,
      "Chapters": [
        {
          "id": 1,
          "NAME": {
            "before": "Etranger",
            "after": "L'étranger"
          }
        }
      ]
    }
  ]
}
Note: we will show only the Ids and the changed values for the relevant items

Here is some sample code to join on authorId and then compare.
from pyspark.sql.functions import collect_list, struct, col, udf
from operator import itemgetter
from pyspark.sql.types import ArrayType, StructType, StringType, StructField
# construct data
data = [('AA_1', 'S1', "10", "1", "Introduction to Quadratic Equation"),
('AA_1', 'S1', "10", "2", "Fundamentals"),
('AA_1', 'S1', "11", "1", "Preface"),
('AA_1', 'S1', "11", "2", "Wading in to the waters"),
('AA_2', 'S2', "100", "1", "Introduction"),
('AA_2', 'S2', "100", "2", "Fundamentals"),
('AA_2', 'S2', "110", "1", "Prologue"),
('AA_2', 'S2', "110", "2", "Epilogue"),
]
data2 = [('AA_1', 'S1', "10", "1", "Introduction to Linear Algebra"),
('AA_1', 'S1', "10", "2", "Fundamentals"),
('AA_1', 'S1', "11", "1", "Preface"),
('AA_1', 'S1', "11", "2", "Wading in to the waters"),
('AA_2', 'S2', "100", "1", "Introduction"),
('AA_2', 'S2', "100", "2", "Fundamentals2"),
('AA_2', 'S2', "110", "1", "Prologue"),
('AA_2', 'S2', "110", "2", "Epilogue"),
]
df = (
    spark.createDataFrame(data, ["authorId", "name", "bookId", "chapterId", "chapterName"])
    .groupBy(['authorId', 'name', 'bookId'])
    .agg(collect_list(struct("chapterId", "chapterName")).alias("chapters"))
    .groupBy(['authorId', 'name'])
    .agg(collect_list(struct('bookId', 'chapters')).alias('books'))
)
df2 = (
    spark.createDataFrame(data2, ["authorId", "name", "bookId", "chapterId", "chapterName"])
    .groupBy(['authorId', 'name', 'bookId'])
    .agg(collect_list(struct("chapterId", "chapterName")).alias("chapters"))
    .groupBy(['authorId', 'name'])
    .agg(collect_list(struct('bookId', 'chapters')).alias('books'))
)
df2 = df2.select(col('authorId').alias('authorId2'),col('name').alias('name2'), col('books').alias('books2') )
# join on authorId
df3 = df.join(df2, [df.authorId == df2.authorId2])
# UDF to compare, needs additional checks on books and chapters lengths and Null checks
@udf(ArrayType(StructType([StructField("bookId", StringType()), StructField("chapters", ArrayType(StructType([StructField("chapterId", StringType()), StructField("name", StructType([StructField("before", StringType()), StructField("after", StringType())]))])))])))
def get_book_diff(b1, b2):
    if len(b1) != len(b2):
        return None
    # sort both book lists so they can be walked in lockstep
    b1.sort(key=itemgetter('bookId'))
    b2.sort(key=itemgetter('bookId'))
    list_data = []
    i = 0
    for book in b1:
        data = {}
        if book.bookId == b2[i].bookId:
            data['bookId'] = book.bookId
            book.chapters.sort(key=itemgetter('chapterId'))
            b2[i].chapters.sort(key=itemgetter('chapterId'))
            data['chapters'] = []
            j = 0
            for chap in book.chapters:
                if chap.chapterId == b2[i].chapters[j].chapterId:
                    if chap.chapterName != b2[i].chapters[j].chapterName:
                        data['chapters'].append({'chapterId': chap.chapterId, 'name': {"before": chap.chapterName, "after": b2[i].chapters[j].chapterName}})
                j += 1
        i += 1
        list_data.append(data)
    return list_data
df3 = df3.withColumn('book_diff', get_book_diff('books', 'books2'))
#df3.select('authorId', 'book_diff').show(truncate=False)
display(df3.select('authorId', 'book_diff'))

I think the unfortunate requirement here is that we need to flatten the struct into columns to allow comparison.
import pyspark.sql.functions as F
columns = ["AUTHOR_ID","NAME","Books"] # lazy partial naming
#Original
data = [(1, "James,,Smith",[(1,[(1,"The beggining", 12, "It was a great day")])]), (2, "Stephen King", [(2,[(1,"The start", 12, "It was a great day")])])]
#Update
# Bookid 1 --> added a chapter, fixed a typo in the first chapter.
# Bookid 2 --> Changed nothing
data_after = [(1, "James,,Smith",[(1,[(1,"The begining", 12, "It was a great day"),(2,"The end", 1, "It was a great night")])]), (2, "Stephen King", [(2,[(1,"The start", 12, "It was an a great day")])])]
df = spark.createDataFrame(data=data,schema=columns)
df2 = spark.createDataFrame(data=data_after,schema=columns)
# flatten the struct into columns; could have used withColumn
df_flat = (
    df.select("*", F.posexplode(F.col("Books")).alias("pos", "Book"))
    .select("*", F.col("Book._1").alias("BookId"), F.posexplode(F.col("Book._2")).alias("pos", "Chapter"))
    .select("*", F.col("Chapter.*"), F.lit("Original").alias("source"))
)
df2_flat = (
    df2.select("*", F.posexplode(F.col("Books")).alias("pos", "Book"))
    .select("*", F.col("Book._1").alias("BookId"), F.posexplode(F.col("Book._2")).alias("pos", "Chapter"))
    .select("*", F.col("Chapter.*"), F.lit("Update").alias("source"))
)
#use a union to pull all data together
all = df_flat.union(df2_flat).withColumnRenamed("_1", "Chapter_id")\
.withColumnRenamed("_2", "text")
# Find things that don't have a match; these are the additions/updates/deletions
all.groupBy("AUTHOR_ID","BookId","Chapter_id","text").agg(F.first("source"),F.count("text").alias("count")).where(F.col("count") != 2).show()
+---------+------+----------+-------------+--------------------+-----+
|AUTHOR_ID|BookId|Chapter_id| text|first(source, false)|count|
+---------+------+----------+-------------+--------------------+-----+
| 1| 1| 2| The end| Update| 1|
| 1| 1| 1| The begining| Update| 1|
| 1| 1| 1|The beggining| Original| 1|
+---------+------+----------+-------------+--------------------+-----+
From here you need to do a little more work: think one more groupBy down to Author/BookId/Chapter_id, count the chapter ids, then when/otherwise logic on source, as sketched below.
If a chapter exists in both the Update and the Original, it's an edit (count of 2).
If it only exists in the Update, it's an addition (count of 1).
If it only exists in the Original, it's a deletion (count of 1).
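A minimal sketch of that follow-up step (the mismatches/changes names and the change_type labels are just illustrative; it assumes the all dataframe built above):
# rows that differ between Original and Update (same groupBy/where as above)
mismatches = (
    all.groupBy("AUTHOR_ID", "BookId", "Chapter_id", "text")
    .agg(F.first("source").alias("source"), F.count("text").alias("count"))
    .where(F.col("count") != 2)
)
# one more groupBy down to author/book/chapter: count 2 = edit,
# count 1 = addition (Update only) or deletion (Original only)
changes = (
    mismatches.groupBy("AUTHOR_ID", "BookId", "Chapter_id")
    .agg(F.count("text").alias("n"), F.first("source").alias("source"))
    .withColumn(
        "change_type",
        F.when(F.col("n") == 2, F.lit("edit"))
        .when(F.col("source") == "Update", F.lit("addition"))
        .otherwise(F.lit("deletion")),
    )
)
changes.show()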
Building your struct back up from here is up to you, but I think this demonstrates the idea of what's required. Using the page number of the chapter might actually be a good way to detect change; it's certainly cheaper than comparing strings, though likely not as accurate.

Give this a try:
from gresearch.spark.diff import *
left.diff(right)
See https://github.com/G-Research/spark-extension/blob/master/DIFF.md
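For the schema in the question this could look like the following (a minimal sketch, assuming left and right are the two author dataframes with identical schemas; per DIFF.md each row gets a diff flag such as N, C, I or D, and a nested column like Books is compared as a whole value rather than broken down per chapter):
from gresearch.spark.diff import *

# AUTHOR_ID acts as the id column; NAME and Books become value columns
left.diff(right, "AUTHOR_ID").show(truncate=False)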

Related

Spark Dataframe manipulation

Input Dataframe:
+------+---------+
|caseid|indicator|
+------+---------+
|1     |STP      |
|1     |non-STP  |
|2     |STP      |
|3     |STP      |
|3     |non-STP  |
+------+---------+
Output Dataframe:
+------+---------+
|caseid|indicator|
+------+---------+
|1     |non-STP  |
|2     |STP      |
|3     |non-STP  |
+------+---------+
Hello all, I would be really grateful if someone could help me with the dataframes above. In the output dataframe I want to keep only the non-STP row for any caseid that has one, and for caseids that only have STP, keep that row as it is.
Thanks in advance.
You could try a groupBy and then check whether the collected values contain non-STP.
Example:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
spark = SparkSession.builder.getOrCreate()
data = [
{"caseid": "1", "indicator": "STP"},
{"caseid": "1", "indicator": "non-STP"},
{"caseid": "2", "indicator": "STP"},
{"caseid": "3", "indicator": "STP"},
{"caseid": "3", "indicator": "non-STP"},
]
df = spark.createDataFrame(data)
df = (
    df.groupBy("caseid")
    .agg(F.concat_ws(",", F.collect_list(F.col("indicator"))).alias("indicator"))
    .orderBy("caseid")
)
df = df.withColumn(
    "indicator",
    F.when(F.col("indicator").contains("non-STP"), F.lit("non-STP")).otherwise(
        F.lit("STP")
    ),
)
Result:
root
|-- caseid: string (nullable = true)
|-- indicator: string (nullable = false)
+------+---------+
|caseid|indicator|
+------+---------+
|1 |non-STP |
|2 |STP |
|3 |non-STP |
+------+---------+
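A variant that avoids the concat_ws/contains string round trip is to aggregate a flag per caseid instead (a sketch, assuming the df as created from data above, before the transformations; has_non_stp is just an illustrative name):
df_out = (
    df.groupBy("caseid")
    # 1 if any row for this caseid is non-STP, else 0
    .agg(F.max(F.when(F.col("indicator") == "non-STP", 1).otherwise(0)).alias("has_non_stp"))
    .select(
        "caseid",
        F.when(F.col("has_non_stp") == 1, F.lit("non-STP")).otherwise(F.lit("STP")).alias("indicator"),
    )
    .orderBy("caseid")
)
df_out.show(truncate=False)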

Modify nested property inside Struct column with PySpark

I want to modify/filter on a property inside a struct.
Let's say I have a dataframe with the following column :
#+------------------------------------------+
#| arrayCol |
#+------------------------------------------+
#| {"a" : "some_value", "b" : [1, 2, 3]} |
#+------------------------------------------+
Schema:
struct<a:string, b:array<int>>
I want to filter out some values in the 'b' property when the value inside the array == 1.
The desired result is the following:
#+------------------------------------------+
#| arrayCol |
#+------------------------------------------+
#| {"a" : "some_value", "b" : [2, 3]} |
#+------------------------------------------+
Is it possible to do this without extracting the property, filtering the values, and rebuilding another struct?
Update:
For Spark 3.1+, withField can be used to update the struct column without having to recreate the whole struct. In your case, you can update the field b using the filter function to filter the array values, like this:
import pyspark.sql.functions as F
df1 = df.withColumn(
    'arrayCol',
    F.col('arrayCol').withField('b', F.filter(F.col("arrayCol.b"), lambda x: x != 1))
)
df1.show()
#+--------------------+
#| arrayCol|
#+--------------------+
#|{some_value, [2, 3]}|
#+--------------------+
For older versions, Spark doesn’t support adding/updating fields in nested structures. To update a struct column, you'll need to create a new struct using the existing fields and the updated ones:
import pyspark.sql.functions as F
df1 = df.withColumn(
    "arrayCol",
    F.struct(
        F.col("arrayCol.a").alias("a"),
        F.expr("filter(arrayCol.b, x -> x != 1)").alias("b")
    )
)
One way would be to define a UDF:
Example:
import ast
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, MapType
def remove_value(col):
    # 'col' arrives as a dict (MapType); parse the stringified list in "b",
    # drop the 1s and write it back as a string
    col["b"] = str([x for x in ast.literal_eval(col["b"]) if x != 1])
    return col

if __name__ == "__main__":
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [
            {
                "arrayCol": {
                    "a": "some_value",
                    "b": "[1, 2, 3]",
                },
            },
        ]
    )
    remove_value_udf = spark.udf.register(
        "remove_value_udf", remove_value, MapType(StringType(), StringType())
    )
    df = df.withColumn(
        "result",
        remove_value_udf(F.col("arrayCol")),
    )
Result:
root
|-- arrayCol: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
|-- result: map (nullable = true)
| |-- key: string
| |-- value: string (valueContainsNull = true)
+---------------------------------+------------------------------+
|arrayCol |result |
+---------------------------------+------------------------------+
|{a -> some_value, b -> [1, 2, 3]}|{a -> some_value, b -> [2, 3]}|
+---------------------------------+------------------------------+

How to perform calculation in spark dataframe that select from its own dataframe using pyspark

I have a pyspark schema which look like this :
root
|-- id: string (nullable = true)
|-- long: float (nullable = true)
|-- lat: float (nullable = true)
|-- geohash: string (nullable = true)
|-- neighbors: array (nullable = true)
| |-- element: string (containsNull = true)
The data look like this :
+---+---------+----------+---------+--------------------+
| id| lat| long|geohash_8| neighbors|
+---+---------+----------+---------+--------------------+
| 0|-6.361755| 106.79653| qqggy1yu|[qqggy1ys, qqggy1...|
| 1|-6.358584|106.793945| qqggy4ky|[qqggy4kw, qqggy4...|
| 2|-6.362967|106.798775| qqggy38m|[qqggy38j, qqggy3...|
| 3|-6.358316| 106.79832| qqggy680|[qqggy4xb, qqggy6...|
| 4| -6.36016| 106.7981| qqggy60j|[qqggy4pv, qqggy6...|
| 5|-6.357476| 106.79842| qqggy68j|[qqggy4xv, qqggy6...|
| 6|-6.360814| 106.79435| qqggy4j3|[qqggy4j1, qqggy4...|
| 7|-6.358231|106.794365| qqggy4t2|[qqggy4t0, qqggy4...|
| 8|-6.357654| 106.79736| qqggy4x7|[qqggy4x5, qqggy4...|
| 9|-6.358781|106.794624| qqggy4mm|[qqggy4mj, qqggy4...|
| 10|-6.357654| 106.79443| qqggy4t7|[qqggy4t5, qqggy4...|
| 11|-6.357079| 106.79443| qqggy4tr|[qqggy4tp, qqggy4...|
| 12|-6.359929| 106.79698| qqggy4pn|[qqggy4ny, qqggy4...|
| 13|-6.358111| 106.79633| qqggy4w9|[qqggy4w3, qqggy4...|
| 14|-6.359685| 106.79607| qqggy4q8|[qqggy4q2, qqggy4...|
| 15|-6.357945|106.794945| qqggy4td|[qqggy4t6, qqggy4...|
| 16|-6.360725|106.795456| qqggy4n4|[qqggy4jf, qqggy4...|
| 17|-6.363701| 106.79653| qqggy1wb|[qqggy1w8, qqggy1...|
| 18| -6.36329|106.794586| qqggy1t7|[qqggy1t5, qqggy1...|
| 19|-6.363304| 106.79429| qqggy1t5|[qqggy1sg, qqggy1...|
+---+---------+----------+---------+--------------------+
I want to calculate, for each id, the distance between its lat/long and the lat/long of each of its neighbors, so that every id ends up with a list of distances in metres to all of its neighbors.
I tried an iterative approach, looping over every row, selecting a dataframe and then computing the haversine distance, but the performance is awful. I am stuck on how to do this in a functional way in Spark. Can anyone help with some suggestions or references?
Updated to address desire for combinations
If you want to do all the combinations, the steps are basically: associate each neighbor ID with its lat/long, group them together into a single row for each combination set, then compute the distance for all the combinations. Here is example code:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Row
import itertools
schema = StructType([
StructField("id", StringType()),
StructField("lat", FloatType()),
StructField("long", FloatType()),
StructField("geohash_8", StringType()),
StructField("neighbors", ArrayType(StringType()))
])
data = [
("0", 10.0, 11.0, "A", ["B", "C", "D"]),
("1", 12.0, 13.0, "B", ["D"]),
("2", 14.0, 15.0, "C", []),
("3", 16.0, 17.0, "D", [])
]
input_df = spark.createDataFrame(sc.parallelize(data), schema)
# Explode to get a row for each comparison pair
df = input_df.withColumn('neighbor', explode('neighbors')).drop('neighbors')
# Join to get the lat/lon of the neighbor
neighbor_map = input_df.selectExpr('geohash_8 as nid', 'lat as nlat', 'long as nlong')
df = df.join(neighbor_map , col('neighbor') == col('nid'), 'inner').drop('nid')
# Add in rows for the root (geohash_8) records before grouping
root_rows = input_df.selectExpr("id", "lat", "long", "geohash_8", "geohash_8 as neighbor", "lat as nlat", "long as nlong")
df = df.unionAll(root_rows)
# Group by to roll the rows back up but now associating the lat/lon w/ the neighbors
df = df.selectExpr("id", "lat", "long", "geohash_8", "struct(neighbor, nlat, nlong) as neighbors")
df = df.groupBy("id", "lat", "long", "geohash_8").agg(collect_set("neighbors").alias("neighbors"))
# You now have all the data you need in one field, so you can write a python udf to do the combinations
def compute_distance(left_lat, left_lon, right_lat, right_lon):
    # placeholder that always returns 10.0; swap in a real distance function
    return 10.0

def combinations(neighbors):
    result = []
    for left, right in itertools.combinations(neighbors, 2):
        dist = compute_distance(left['nlat'], left['nlong'], right['nlat'], right['nlong'])
        result.append(Row(left=left['neighbor'], right=right['neighbor'], dist=dist))
    return result
udf_schema = ArrayType(StructType([
StructField("left", StringType()),
StructField("right", StringType()),
StructField("dist", FloatType())
]))
combinations_udf = udf(combinations, udf_schema)
# Finally, apply the UDF
df = df.withColumn('neighbors', combinations_udf(col('neighbors')))
df.printSchema()
df.show()
Which produces this:
root
|-- id: string (nullable = true)
|-- lat: float (nullable = true)
|-- long: float (nullable = true)
|-- geohash_8: string (nullable = true)
|-- neighbors: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- neighbor: string (nullable = true)
| | |-- nlat: float (nullable = true)
| | |-- nlong: float (nullable = true)
+---+----+----+---------+------------------------------------------------------------------------------------+
|id |lat |long|geohash_8|neighbors |
+---+----+----+---------+------------------------------------------------------------------------------------+
|0 |10.0|11.0|A |[[D, C, 10.0], [D, A, 10.0], [D, B, 10.0], [C, A, 10.0], [C, B, 10.0], [A, B, 10.0]]|
|2 |14.0|15.0|C |[] |
|1 |12.0|13.0|B |[[D, B, 10.0]] |
|3 |16.0|17.0|D |[] |
+---+----+----+---------+------------------------------------------------------------------------------------+
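The compute_distance placeholder above just returns a constant; a haversine implementation can be dropped in for real distances (a minimal sketch, returning metres and assuming a mean Earth radius of 6371 km):
import math

def compute_distance(left_lat, left_lon, right_lat, right_lon):
    # haversine distance in metres between two points given in decimal degrees
    r = 6371000.0  # mean Earth radius in metres (assumption)
    phi1, phi2 = math.radians(left_lat), math.radians(right_lat)
    dphi = math.radians(right_lat - left_lat)
    dlam = math.radians(right_lon - left_lon)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
    return 2 * r * math.asin(math.sqrt(a))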

How to use Spark SQL SPLIT function to pass input to Spark SQL IN parameter [duplicate]

I have a dataframe with two columns(one string and one array of string):
root
|-- user: string (nullable = true)
|-- users: array (nullable = true)
| |-- element: string (containsNull = true)
How can I filter the dataframe so that the result dataframe only contains rows that user is in users?
Quick and simple:
import org.apache.spark.sql.functions.expr
df.where(expr("array_contains(users, user)"))
Sure, it's possible and not so hard. To achieve this you may use a UDF.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
val df = sc.parallelize(Array(
("1", Array("1", "2", "3")),
("2", Array("1", "2", "2", "3")),
("3", Array("1", "2"))
)).toDF("user", "users")
val inArray = udf((id: String, array: scala.collection.mutable.WrappedArray[String]) => array.contains(id), BooleanType)
df.where(inArray($"user", $"users")).show()
The output is:
+----+------------+
|user| users|
+----+------------+
| 1| [1, 2, 3]|
| 2|[1, 2, 2, 3]|
+----+------------+
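Since the rest of this page uses PySpark, the same filter can be written there as well (a sketch, assuming a PySpark dataframe with the same user and users columns):
import pyspark.sql.functions as F

# keep rows whose 'users' array contains the value of the 'user' column
df.where(F.expr("array_contains(users, user)")).show()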

PySpark - Retain null values when using collect_list

According to the accepted answer in pyspark collect_set or collect_list with groupby, when you do a collect_list on a certain column, the null values in this column are removed. I have checked and this is true.
But in my case, I need to keep the null columns -- how can I achieve this?
I did not find any info on this kind of variant of the collect_list function.
Background context to explain why I want nulls:
I have a dataframe df as below:
cId | eId | amount | city
1 | 2 | 20.0 | Paris
1 | 2 | 30.0 | Seoul
1 | 3 | 10.0 | Phoenix
1 | 3 | 5.0 | null
I want to write this to an Elasticsearch index with the following mapping:
"mappings": {
"doc": {
"properties": {
"eId": { "type": "keyword" },
"cId": { "type": "keyword" },
"transactions": {
"type": "nested",
"properties": {
"amount": { "type": "keyword" },
"city": { "type": "keyword" }
}
}
}
}
}
In order to conform to the nested mapping above, I transformed my df so that for each combination of eId and cId, I have an array of transactions like this:
df_nested = df.groupBy('eId','cId').agg(collect_list(struct('amount','city')).alias("transactions"))
df_nested.printSchema()
root
|-- cId: integer (nullable = true)
|-- eId: integer (nullable = true)
|-- transactions: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- amount: float (nullable = true)
| | |-- city: string (nullable = true)
Saving df_nested as a json file, there are the json records that I get:
{"cId":1,"eId":2,"transactions":[{"amount":20.0,"city":"Paris"},{"amount":30.0,"city":"Seoul"}]}
{"cId":1,"eId":3,"transactions":[{"amount":10.0,"city":"Phoenix"},{"amount":30.0}]}
As you can see - when cId=1 and eId=3, one of my array elements where amount=30.0 does not have the city attribute because this was a null in my original data (df). The nulls are being removed when I use the collect_list function.
However, when I try writing df_nested to elasticsearch with the above index, it errors because there is a schema mismatch. This is basically the reason as to why I want to retain my nulls after applying the collect_list function.
from pyspark.sql.functions import create_map, collect_list, lit, col, to_json, from_json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, SparkSession, types, Row
from pyspark.sql import functions as f
import os
app_name = "CollList"
conf = SparkConf().setAppName(app_name)
spark = SparkSession.builder.appName(app_name).config(conf=conf).enableHiveSupport().getOrCreate()
df = spark.createDataFrame([[1, 2, 20.0, "Paris"], [1, 2, 30.0, "Seoul"],
[1, 3, 10.0, "Phoenix"], [1, 3, 5.0, None]],
["cId", "eId", "amount", "city"])
print("Actual data")
df.show(10,False)
```
Actual data
+---+---+------+-------+
|cId|eId|amount|city |
+---+---+------+-------+
|1 |2 |20.0 |Paris |
|1 |2 |30.0 |Seoul |
|1 |3 |10.0 |Phoenix|
|1 |3 |5.0 |null |
+---+---+------+-------+
```
#collect_list that skips null columns
df1 = df.groupBy(f.col('city'))\
.agg(f.collect_list(f.to_json(f.struct([f.col(x).alias(x) for x in (c for c in df.columns if c != 'cId' and c != 'eId' )])))).alias('newcol')
print("Collect List Data - Missing Null Columns in the list")
df1.show(10, False)
```
Collect List Data - Missing Null Columns in the list
+-------+-------------------------------------------------------------------------------------------------------------------+
|city |collect_list(structstojson(named_struct(NamePlaceholder(), amount AS `amount`, NamePlaceholder(), city AS `city`)))|
+-------+-------------------------------------------------------------------------------------------------------------------+
|Phoenix|[{"amount":10.0,"city":"Phoenix"}] |
|null |[{"amount":5.0}] |
|Paris |[{"amount":20.0,"city":"Paris"}] |
|Seoul |[{"amount":30.0,"city":"Seoul"}] |
+-------+-------------------------------------------------------------------------------------------------------------------+
```
my_list = []
for x in (c for c in df.columns if c != 'cId' and c != 'eId'):
    my_list.append(lit(x))
    my_list.append(col(x))
grp_by = ["eId", "cId"]
df_nested = df.withColumn("transactions", create_map(my_list))\
    .groupBy(grp_by)\
    .agg(collect_list(f.to_json("transactions")).alias("transactions"))
print("collect list after create_map")
df_nested.show(10,False)
```
collect list after create_map
+---+---+--------------------------------------------------------------------+
|eId|cId|transactions |
+---+---+--------------------------------------------------------------------+
|2 |1 |[{"amount":"20.0","city":"Paris"}, {"amount":"30.0","city":"Seoul"}]|
|3 |1 |[{"amount":"10.0","city":"Phoenix"}, {"amount":"5.0","city":null}] |
+---+---+--------------------------------------------------------------------+
```
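If the only reason for the workaround is that the JSON writer drops null fields, then on Spark 3.0+ the writer's ignoreNullFields option may be enough on its own (a sketch, assuming the df_nested from the original groupBy/collect_list and a placeholder output path):
# keep null struct fields such as "city": null in the written JSON
df_nested.write.option("ignoreNullFields", "false").json("/tmp/df_nested_json")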
