What is the most efficient way to do two different joins on the same two dataframes in Pyspark - apache-spark

I am trying to compare two dataframes to look for new records and updated records, which in turn will be used to create a third dataframe. I am using Pyspark 2.4.3
As I come from a SQL background (ASE), my initial thought would be to do a left join to find new records and a != on a hash of all the columns to find updates:
SELECT a.*
FROM Todays_Data a
Left Join Yesterdays_PK_And_Hash b on a.pk = b.pk
WHERE (b.pk IS NULL) --finds new records
OR (b.hashOfColumns != HASHBYTES('md5',<converted and concatenated columns>)) --updated records
I have been playing around with Pyspark and have come up with a script that achieves the results I am after:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import md5, concat_ws, col, lit
# Start a local Spark context / session for the example.
sc = SparkContext("local", "test App")
sqlContext = SQLContext(sc)
sp = SparkSession \
.builder \
.appName("test App") \
.getOrCreate()
# `df`: the name columns plus a previously stored hashkey for each record.
df = sp.createDataFrame(
[("Fred", "Smith", "16ba5519cdb13f99e087473e4faf3825"), # hashkey here is created based on YOB of 1973. To test for an update
("Fred", "Davis", "253ab75676cdbd73b874c97a62d27608"),
("Barry", "Clarke", "cc3baaa05a1146f2f8cf0a743c9ab8c4")],
["First_name", "Last_name", "hashkey"]
)
# `df_a`: the full records that are compared against `df`.
df_a = sp.createDataFrame(
[("Fred", "Smith", "Adelaide", "Doctor", 1971),
("Fred", "Davis", "Melbourne", "Baker", 1970),
("Barry", "Clarke", "Sydney", "Scientist", 1975),
("Jane", "Hall", "Sydney", "Dentist", 1980)],
["First_name", "Last_name", "City", "Occupation", "YOB"]
)
# Add an MD5 hash over all columns of df_a, concatenated with an empty separator.
df_a = df_a.withColumn("hashkey", md5(concat_ws("", *df_a.columns)))
# Inserts: rows of df_a for which no row of df has the same First_name and
# Last_name (left anti join), tagged with _action = "Insert".
df_ins = df_a.alias('a').join(df.alias('b'), (col('a.First_name') == col('b.First_name')) &
(col('a.Last_name') == col('b.Last_name')), 'left_anti') \
.select(lit("Insert").alias("_action"), 'a.*') \
.dropDuplicates()
# Updates: rows of df_a whose names match a row of df but whose hashkey
# differs (inner join), tagged with _action = "Update".
df_up = df_a.alias('a').join(df.alias('b'), (col('a.First_name') == col('b.First_name')) &
(col('a.Last_name') == col('b.Last_name')) &
(col('a.hashkey') != col('b.hashkey')), 'inner') \
.select(lit("Update").alias("_action"), 'a.*') \
.dropDuplicates()
# Combine both sets, order by YOB, and drop the helper hashkey column before display.
df_delta = df_ins.union(df_up).sort("YOB")
df_delta = df_delta.drop("hashkey")
df_delta.show(truncate=False)
What this produces is my final delta as such:
+-------+----------+---------+--------+----------+----+
|_action|First_name|Last_name|City |Occupation|YOB |
+-------+----------+---------+--------+----------+----+
|Update |Fred |Smith |Adelaide|Doctor |1971|
|Insert |Jane |Hall |Sydney |Dentist |1980|
+-------+----------+---------+--------+----------+----+
While I am getting the results I am after, I am unsure how efficient the above code is.
Ultimately in the end, I would like to run similar patterns against datasets into the 100's of million records.
Is there any way to make this more efficient?
Thanks

Have you explored a broadcast join? Your join statements could be problematic if you have 100M+ records. If dataset B is the smaller one, this is the small modification I would try:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import md5, concat_ws, col, lit, broadcast
# Local Spark entry points for the example.
sc = SparkContext("local", "test App")
sqlContext = SQLContext(sc)
sp = SparkSession.builder.appName("test App").getOrCreate()

# Stored snapshot: name columns plus the hash recorded for each record.
# The hashkey for Fred Smith was created from a YOB of 1973, so that row
# exercises the update path.
df = sp.createDataFrame(
    [
        ("Fred", "Smith", "16ba5519cdb13f99e087473e4faf3825"),
        ("Fred", "Davis", "253ab75676cdbd73b874c97a62d27608"),
        ("Barry", "Clarke", "cc3baaa05a1146f2f8cf0a743c9ab8c4"),
    ],
    ["First_name", "Last_name", "hashkey"],
)

# Full records to compare against the snapshot.
df_a = sp.createDataFrame(
    [
        ("Fred", "Smith", "Adelaide", "Doctor", 1971),
        ("Fred", "Davis", "Melbourne", "Baker", 1970),
        ("Barry", "Clarke", "Sydney", "Scientist", 1975),
        ("Jane", "Hall", "Sydney", "Dentist", 1980),
    ],
    ["First_name", "Last_name", "City", "Occupation", "YOB"],
)
df_a = df_a.withColumn("hashkey", md5(concat_ws("", *df_a.columns)))

# Both joins use the same two inputs and the same name-equality condition;
# the snapshot side is marked for broadcast.
current = df_a.alias('a')
stored = broadcast(df.alias('b'))
same_person = (
    (col('a.First_name') == col('b.First_name'))
    & (col('a.Last_name') == col('b.Last_name'))
)
hash_changed = col('a.hashkey') != col('b.hashkey')

# Inserts: current rows without a snapshot row of the same name.
df_ins = (
    current.join(stored, same_person, 'left_anti')
    .select(lit("Insert").alias("_action"), 'a.*')
    .dropDuplicates()
)

# Updates: current rows whose name matches a snapshot row with a different hash.
df_up = (
    current.join(stored, same_person & hash_changed, 'inner')
    .select(lit("Update").alias("_action"), 'a.*')
    .dropDuplicates()
)

df_delta = df_ins.union(df_up).sort("YOB")
Maybe rewriting the code cleanly would be easier to follow too.
#Ash, from a readability standpoint, you could do a couple of things:
Use variables
Use functions.
Follow the PEP 8 style guide as much as possible (e.g. no more than 79 characters per line).
# Shared key condition: the same person appears in both DataFrames.
joinExpr = (
    (col('a.First_name') == col('b.First_name'))
    & (col('a.Last_name') == col('b.Last_name'))
)
# Updates need rows that match on both sides, so this is an inner join;
# 'left_anti' would keep only the rows of `a` that have no match in `b`.
joinType = 'inner'
df_up = (
    df_a.alias('a')
    .join(
        broadcast(df.alias('b')),
        # Same person, but the stored hash differs from the current one.
        joinExpr & (col('a.hashkey') != col('b.hashkey')),
        joinType,
    )
    .select(lit("Update").alias("_action"), 'a.*')
    .dropDuplicates()
)
This is still long, but you get the idea.

Related

Why does Spark SQL require double percent symbols for like conditions?

When I am using a like condition in Spark SQL, it seems that it requires the use of 2 percent symbols %%.
However, I could not find any documentation on this in the Spark SQL docs. I am curious as to why my set-up might be causing this requirement.
https://spark.apache.org/docs/3.3.0/sql-ref-syntax-qry-select-like.html
Example data
product_table
id
product_type
region
location
measurement
43635665
ORANGE - Blood Orange
EU
FRA
30.5
78960788
APPLE GrannySmith
NA
USA
16.0
12312343
APPLE [Organic Washington]
NA
CAN
7.1
67867634
ORANGE, NavelOrange
NA
MEX
88.4
import pyspark
from pyspark.sql import functions as F
APP_NAME = "Product: Fruit Template"
SPARK_CONF = [
    ("spark.dynamicAllocation.maxExecutors", "5"),
    ("spark.executor.memory", "10g"),
    ("spark.executor.cores", "4"),
    ("spark.executor.memoryOverhead", "2000"),
]

# Build the configuration, then the context and the session on top of it.
spark_conf = pyspark.SparkConf().setAppName(APP_NAME).setAll(SPARK_CONF)
sc = pyspark.SparkContext(conf=spark_conf)
spark = pyspark.sql.SparkSession(sc)


def sql(query):
    """Run `query` through the module-level session and return the resulting DataFrame."""
    result = spark.sql(query)
    return result


df = sql("""
SELECT *
FROM product_table
""")
this returns data
df.filter(F.col("product_type").like("ORANGE%%")).show()
whereas this returns an empty dataframe
df.filter(F.col("product_type").like("ORANGE%")).show()
Maybe worth noting, the same issue happens when the LIKE condition is used in the SQL string
this returns data
df_new = sql("""
SELECT *
FROM product_table
WHERE product_type like 'ORANGE%%'
""")
df_new.show()
whereas this returns an empty dataframe
df_new = sql("""
SELECT *
FROM product_table
WHERE product_type like 'ORANGE%'
""")
df_new.show()
I am using PySpark version 2.3.2.
# Imports this snippet relies on, so that it runs on its own.
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

conf = (
    SparkConf()
    .set("spark.executor.instances", "24")
    .set("spark.executor.cores", "5")
    .set("spark.executor.memory", "33g")
    .set("spark.driver.memory", "55g")
    .set("spark.driver.maxResultSize", "10g")
    .set("spark.sql.catalogImplementation", "hive")
    .set("mapreduce.fileoutputcommitter.algorithm.version", "2")
)
spark = (
    SparkSession.builder.appName("default")
    .enableHiveSupport()
    .config(conf=conf)
    .getOrCreate()
)

# Same product rows as in the question, registered as a temp view so they
# can be queried by name.
df = spark.createDataFrame(
    [
        ('43635665', 'ORANGE - Blood Orange'),
        ('78960788', 'APPLE GrannySmith'),
        ('12312343', 'APPLE [Organic Washington]'),
        ('67867634', 'ORANGE, NavelOrange'),
    ],
    ['id', 'product_type'],
)
df.createOrReplaceTempView("product_table")


def sql(query):
    """Print the SQL text, then run it and return the resulting DataFrame.

    Printing the text shows exactly what string reaches spark.sql().
    """
    print(query)
    return spark.sql(query)


df2 = sql("""
SELECT *
FROM product_table
""")
# A single % is used as the LIKE wildcard here.
df2.filter(F.col("product_type").like("ORANGE%")).show(truncate=False)

Connect to MSSQL from PySpark

I am trying to connect to MS SQL DB from PySpark using spark.read.jdbc.
import os
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext;
from pyspark.sql.session import SparkSession
# NOTE(review): 'xx' is a placeholder; the asker does not know which value belongs here.
sc = SparkContext('xx')
spark = SparkSession(sc)
# Attempted JDBC read of a filtered subquery.
# NOTE(review): sa and xxxx below are bare names standing in for the login and
# the driver, so this call is not runnable as written.
spark.read.jdbc('DESKTOP-XXXX\SQLEXPRESS',
"""(select COL1, COL2 from tbl1 WHERE COL1 = 2) """,
properties={'user': sa, 'password': 12345, 'driver': xxxx})
I do not know which parameters I should pass for sc = SparkContext('xx') and 'driver': xxxx.
Replace serveraddress with your address of database:
sc = SparkContext()
spark = SparkSession(sc)
# Read the query result over JDBC. The driver class comes from Microsoft's
# JDBC driver jar, which has to be on the Spark classpath.
jdbc_df = (
    spark.read
    .format('jdbc')
    .option('url', 'jdbc:sqlserver://serveraddress:1433')
    .option('driver', 'com.microsoft.sqlserver.jdbc.SQLServerDriver')
    .option('user', 'sa')
    .option('password', '12345')
    # The subquery is placed in a FROM clause, where SQL Server requires
    # a derived table to carry an alias.
    .option('dbtable', '(select COL1, COL2 from tbl1 WHERE COL1 = 2) AS t')
    # Without load() the chain only configures a reader and returns no DataFrame.
    .load()
)

Explode array values into multiple columns using PySpark

I am new to pyspark and I want to explode array values in such a way that each value gets assigned to a new column. I tried using explode but I couldn't get the desired output. Below is my output
this is the code
from pyspark.sql import *
from pyspark.sql.functions import explode
if __name__ == "__main__":
    # Local Spark session (master "local[3]").
    spark = SparkSession.builder.master("local[3]").appName("DataOps").getOrCreate()

    # Load the multi-line JSON document in PERMISSIVE mode.
    reader = spark.read.option("multiLine", True).option("mode", "PERMISSIVE")
    dataFrameJSON = reader.json("data.json")
    dataFrameJSON.printSchema()

    # One row per element of values.line.
    sub_DF = dataFrameJSON.select(explode("values.line").alias("new_values"))
    sub_DF.printSchema()

    # Promote the fields of each exploded element to top-level columns.
    sub_DF2 = sub_DF.select("new_values.*")
    sub_DF2.printSchema()
    sub_DF.show(truncate=False)

    # Keep id and property, and expand the period struct into its fields.
    new_DF = sub_DF2.select("id", "period.*", "property")
    new_DF.show(truncate=False)
    new_DF.printSchema()
this is data:
{
"values" : {
"line" : [
{
"id" : 1,
"period" : {
"start_ts" : "2020-01-01T00:00:00",
"end_ts" : "2020-01-01T00:15:00"
},
"property" : [
{
"name" : "PID",
"val" : "P120E12345678"
},
{
"name" : "EngID",
"val" : "PANELID00000000"
},
{
"name" : "TownIstat",
"val" : "12058091"
},
{
"name" : "ActiveEng",
"val" : "5678.1"
}
]
}
]
}
}
Could you include the data instead of screenshots ?
Meanwhile, assuming that df is the dataframe being used, what we need to do is create a new dataframe, extracting the vals from the previous property array into new columns and finally dropping the property column:
from pyspark.sql.functions import col
# Copy the `val` field of each property entry into its own column, taking the
# entries by position, then drop the source array column.
output_df = df
for position, column_name in enumerate(["PID", "EngID", "TownIstat", "ActiveEng"]):
    output_df = output_df.withColumn(column_name, col("property")[position].val)
output_df = output_df.drop("property")
In case the element is of type ArrayType, use the following:
from pyspark.sql.functions import col
# Same extraction for array-typed entries: take index 1 of each property
# entry, by position, then drop the source array column.
output_df = df
for position, column_name in enumerate(["PID", "EngID", "TownIstat", "ActiveEng"]):
    output_df = output_df.withColumn(column_name, col("property")[position][1])
output_df = output_df.drop("property")
Explode will explode the arrays into new Rows, not columns, see this : pyspark explode
This is a general solution and works even when the JSONs are messy (different ordering of elements or if some of the elements are missing)
You got to flatten first, regexp_replace to split the 'property' column and finally pivot. This also avoids hard coding of the new column names.
Constructing your dataframe:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import *
# Each property entry is a struct of two strings: name and val.
property_entry = StructType([
    StructField("name", StringType()),
    StructField("val", StringType()),
])
schema = StructType([
    StructField("id", IntegerType()),
    StructField("start_ts", StringType()),
    StructField("end_ts", StringType()),
    StructField("property", ArrayType(property_entry)),
])

# Two sample rows, each carrying four (name, val) property entries.
data = [
    [1, "2010", "2020", [["PID", "P123"], ["Eng", "PA111"], ["Town", "999"], ["Act", "123.1"]]],
    [2, "2011", "2012", [["PID", "P456"], ["Eng", "PA222"], ["Town", "777"], ["Act", "234.1"]]],
]
df = spark.createDataFrame(data, schema=schema)
df.show(truncate=False)
+---+--------+------+------------------------------------------------------+
|id |start_ts|end_ts|property |
+---+--------+------+------------------------------------------------------+
|1 |2010 |2020 |[[PID, P123], [Eng, PA111], [Town, 999], [Act, 123.1]]|
|2 |2011 |2012 |[[PID, P456], [Eng, PA222], [Town, 777], [Act, 234.1]]|
+---+--------+------+------------------------------------------------------+
Flattening and pivoting:
# Flatten: one row per (id, property entry). The entry is cast to a string
# so that it can be split into name and value below.
df_flatten = (
    df.rdd
    .flatMap(lambda row: [(row[0], row[1], row[2], entry) for entry in row[3]])
    .toDF(['id', 'start_ts', 'end_ts', 'property'])
    .select('id', 'start_ts', 'end_ts', col("property").cast("string"))
)

# Split: strip the wrapper characters from the struct text and split it on
# ", " into name (col1) and value (col2). Older Spark versions render a struct
# as "[name, val]" while Spark 3.1+ renders "{name, val}", so both bracket
# styles are removed. The raw string keeps the regex escapes intact.
df_split = (
    df_flatten
    .select(
        'id', 'start_ts', 'end_ts',
        regexp_replace(df_flatten.property, r"[\[\]{}]", "").alias("replacced_col"),
    )
    .withColumn("arr", split(col("replacced_col"), ", "))
    .select(col("arr")[0].alias("col1"), col("arr")[1].alias("col2"), 'id', 'start_ts', 'end_ts')
)

# Pivot: one column per property name, then join the remaining columns of
# the source rows back on id and drop the original array.
final_df = (
    df_split.groupby(df_split.id)
    .pivot("col1")
    .agg(first("col2"))
    .join(df, 'id')
    .drop("property")
)
Output:
final_df.show()
+---+-----+-----+----+----+--------+------+
| id| Act| Eng| PID|Town|start_ts|end_ts|
+---+-----+-----+----+----+--------+------+
| 1|123.1|PA111|P123| 999| 2010| 2020|
| 2|234.1|PA222|P456| 777| 2011| 2012|
+---+-----+-----+----+----+--------+------+

PySpark: data doesn't always conform to schema - logic to alter data

I'm new with PySpark and am working on a script, reading from .csv files.
I've explicitly defined the schema in the below & the script works perfectly...most of the time.
The issue is, on occasion, a value enters the files which does not conform to the schema - e.g. '-' might appear in an integer field & hence, we get a type error - the error is thrown when df1.show() is reached in the script.
I'm trying to think of a way to effectively say - if the value does not match the defined datatype, then replace with ''
Does anyone know if this may be possible? Any advice would be great!
from pyspark.sql import SparkSession
import pyspark.sql.functions as sqlfunc
from pyspark.sql.types import *
import argparse, sys
from pyspark.sql import *
from pyspark.sql.functions import *
from datetime import datetime
#create a context that supports hive
def create_session(appname):
    """Return a Hive-enabled SparkSession named `appname` with master 'yarn'.

    The Hive metastore URI is set on the builder and the session is obtained
    through getOrCreate().
    """
    builder = SparkSession.builder.appName(appname).master('yarn')
    builder = builder.config("hive.metastore.uris", "thrift://serverip:9083")
    return builder.enableHiveSupport().getOrCreate()
### START MAIN ###
if __name__ == '__main__':
    import time  # local import: only this entry point needs it

    spark_session = create_session('testing_files')

    # Epoch seconds for the current local time. time.mktime() replaces
    # long(strftime('%s')): `long` does not exist on Python 3, and the '%s'
    # directive is a platform extension that is not available everywhere.
    dt_now = datetime.now()
    today_unixtime = int(time.mktime(dt_now.timetuple()))
    today_date = datetime.fromtimestamp(today_unixtime).strftime('%Y%m%d')
    # Two days (in seconds) before now, formatted as YYYYMMDD.
    twoday_unixtime = today_unixtime - 24*60*60*2
    twoday = datetime.fromtimestamp(twoday_unixtime).strftime('%Y%m%d')
    # Four hours before now (despite the name), reduced to the hour of day.
    hourago = today_unixtime - 60*60*4
    hrdate = datetime.fromtimestamp(hourago).strftime('%H')

    # First input: tab-delimited, no header, field3 read as an integer.
    schema = [
        StructField('field1', StringType(), True),
        StructField('field2', StringType(), True),
        StructField('field3', IntegerType(), True),
    ]
    final_structure = StructType(schema)
    df1 = (
        spark_session.read
        .option("header", "false")
        .option("delimiter", "\t")
        .csv('hdfs://hdfspath/dt=%s/*/*/*' % today_date, final_structure)
    )

    # Second input: tab-delimited, no header, all fields read as strings,
    # taken from the partition dated two days ago.
    usercatschema = [
        StructField('field1', StringType(), True),
        StructField('field2', StringType(), True),
        StructField('field3', StringType(), True),
    ]
    usercat_structure = StructType(usercatschema)
    df2 = (
        spark_session.read
        .option("header", "false")
        .option("delimiter", "\t")
        .csv('hdfs://hdfspath/v0/dt=%s/*' % twoday, usercat_structure)
    )

    df1.show()
    df2.show()

    # Expose both inputs to SQL under the names used by the query.
    df1.createOrReplaceTempView("dpi")
    df2.createOrReplaceTempView("usercat")
    finaldf = spark_session.sql('''
SQL QUERY
''')

    # Append the result as tab-separated CSV with a header, in 10 partitions.
    (finaldf.coalesce(10)
     .write.format("com.databricks.spark.csv")
     .option("header", "true")
     .option('sep', '\t')
     .mode('append')
     .save('hdfs://hdfs path'))
Read it as String type and then convert to int.
df.withColumn("field3",df.field3.cast("int"))

Using when statement in pyspark - not working when I add to various parts of the script

I'm new to PySpark but have managed to get the below working.
I have 2 more requirements though, both of which I would achieve through a case statement in SQL.
I've tried the below:
# Map the minute of the hour onto a quarter label, as a column expression
# aliased "Quarter". Plain ASCII quotes are required for the string literals.
# NOTE(review): minute 0 fails the first test (> 0) and therefore falls
# through to "Unknown".
(sqlfunc
 .when((df5.time_minute > 0) & (df5.time_minute < 16), "Q1")
 .when((df5.time_minute > 15) & (df5.time_minute < 31), "Q2")
 .when((df5.time_minute > 30) & (df5.time_minute < 46), "Q3")
 .when((df5.time_minute > 45) & (df5.time_minute < 61), "Q4")
 .otherwise("Unknown")
 .alias("Quarter"))
I've tried adding this as a withColumn() condition and also in the select. but either way, it doesn't create a new column with the result in it.
Would anyone be able to advise me how I would go about adding a case statement to the script, so that the output provides a new column? As I say, I have tried:
- withColumn('ColumnName', when statement....)
- during the select('field1', 'field2', when statement)
Any help would be great
from pyspark.sql import SparkSession
import pyspark.sql.functions as sqlfunc
from pyspark.sql.types import *
import argparse, sys
from pyspark.sql import *
import pyspark.sql.functions as sqlfunc
from datetime import datetime
#create a context that supports hive
def create_session(appname):
    """Return a SparkSession named `appname` with master 'yarn' and Hive support.

    The Hive metastore URI is set on the builder before the session is
    obtained through getOrCreate().
    """
    return (
        SparkSession.builder
        .appName(appname)
        .master('yarn')
        .config("hive.metastore.uris", "thrift:IP:9083")
        .enableHiveSupport()
        .getOrCreate()
    )
### START MAIN ###
# Script entry point: build the session, derive the date strings used in the
# input paths, load two tab-delimited inputs, join them and aggregate.
if __name__ == '__main__':
spark_session = create_session('testing_files')
# Date strings derived from the current time via epoch seconds:
# today, two days ago, and the hour of day four hours ago.
dt_now = datetime.now()
today_unixtime = long(dt_now.strftime('%s'))
today_date = datetime.fromtimestamp(today_unixtime).strftime('%Y%m%d')
twoday_unixtime = long(dt_now.strftime('%s')) - 24*60*60*2
twoday = datetime.fromtimestamp(twoday_unixtime).strftime('%Y%m%d')
hourago = long(dt_now.strftime('%s')) - 60*60*4
hrdate = datetime.fromtimestamp(hourago).strftime('%H')
# Schema of the first input: three string fields and one long field.
schema = [\
StructField('field1', StringType(), True),\
StructField('field2',StringType(), True), \
StructField('field3', StringType(), True), \
StructField('field4',LongType(), True) \
]
final_structure = StructType(schema)
# First input: headerless, tab-delimited files under today's dt= partition.
df1 = spark_session.read\
.option("header","false")\
.option("delimiter", "\t")\
.csv('hdfs://directory/dt=%s/*/*/*' %today_date, final_structure)
# Schema of the second input: three string fields.
usercatschema = [\
StructField('field1', StringType(), True),\
StructField('field2',StringType(), True), \
StructField('field3',StringType(), True) \
]
usercat_structure = StructType(usercatschema)
# Second input: headerless, tab-delimited files under the dt= partition of two days ago.
df2 = spark_session.read\
.option("header","false")\
.option("delimiter", "\t")\
.csv('hdfs://directory/dt=%s/*' %twoday, usercat_structure)
df3 = df2.select('field1','field2', 'field3')
# Left join on field1 and on a substring of df1.field2 (presumably its first
# 14 characters - confirm) against df3.field2.
df4= df1.join(df3,(df1.field1==df3.field1)&(sqlfunc.substring(df1.field2, 0, 14)==df3.field2),"left")
# NOTE(review): the `......` / `....` fragments below are elisions left by the
# author, so this statement is not runnable as written.
df5 = df4\
.coalesce(1000)\
.select('df1.field1','df2.field1', ......)\
.groupBy('field1','field2'....)\
.agg(
sqlfunc.sum(df4.field1).alias('upload'),\
sqlfunc.sum(df4.field2).alias('download'),\
sqlfunc.countDistinct(df4.field3).alias('distinct_field3'),\
sqlfunc.count(df4.field4).alias('field4')\
)\
.select('field1......)
df5.show()
Here is the working script:
# Bucket each row into a quarter of the hour (1-4, or 5 when no range
# matches), then total field1 and field2 per (date, time_hour, quarter).
df5 = (
    df4
    .coalesce(1000)
    .withColumn(
        'quarter',
        sqlfunc.when((df4.time_minute > -1) & (df4.time_minute < 16), 1)
        .when((df4.time_minute > 15) & (df4.time_minute < 31), 2)
        .when((df4.time_minute > 30) & (df4.time_minute < 46), 3)
        .when((df4.time_minute > 45) & (df4.time_minute < 61), 4)
        .otherwise(5),
    )
    # 'quarter' must survive this projection; otherwise the groupBy below
    # has no such column to group on.
    .select('field1', 'field2', 'date', 'time_hour', 'time_minute', 'quarter')
    .groupBy('date', 'time_hour', 'quarter')
    .agg(
        sqlfunc.sum(df4.field1).alias('sumfield1'),
        sqlfunc.sum(df4.field2).alias('sumfield2'),
    )
    .select('date', 'time_hour', 'quarter', 'sumfield1', 'sumfield2')
)
df5.show()

Resources