I want to replicate SQL OUTER APPLY functionality in PySpark - apache-spark

I want to replicate what the SQL "OUTER APPLY" operator does in PySpark.
Here are my example data frames
## Department table
data = [
(1,'Engineering'),
(2,'Administration'),
(3,'Sales'),
(4,'Marketing'),
(5,'Finance')
]
schema = StructType([
StructField('DepartmentID', IntegerType(), True),
StructField('Name', StringType(), True)
])
Department = spark.createDataFrame(data=data, schema =schema)
Department.show()
+------------+--------------+
|DepartmentID| Name|
+------------+--------------+
| 1| Engineering|
| 2|Administration|
| 3| Sales|
| 4| Marketing|
| 5| Finance|
+------------+--------------+
## Employee table
data = [
(1,'Orlando', 'Gee', 1),
(2,'Keith', 'Harris', 2),
(3,'Donna', 'Carreras', 3),
(4,'Janet', 'Gates', 3),
]
schema = StructType([
StructField('EmployeeID', IntegerType(), True),
StructField('FirstName', StringType(), True),
StructField('LastName', StringType(), True),
StructField('DepartmentID', IntegerType(), True),
])
Employee = spark.createDataFrame(data=data, schema =schema)
Employee.show()
+----------+---------+--------+------------+
|EmployeeID|FirstName|LastName|DepartmentID|
+----------+---------+--------+------------+
| 1| Orlando| Gee| 1|
| 2| Keith| Harris| 2|
| 3| Donna|Carreras| 3|
| 4| Janet| Gates| 3|
+----------+---------+--------+------------+
I tried creating a temp table and use a spark SQL command to query as we normally do on temporary tables... but I keep getting
`[PARSE_SYNTAX_ERROR] Syntax error at or near 'OUTER'(line 3, pos 2)
== SQL ==
SELECT * FROM Department D
OUTER APPLY
--^^^
(
SELECT * FROM Employee E
WHERE E.DepartmentID = D.DepartmentID
) A
`
error. Any help is appreciated.
Employee.createOrReplaceTempView("Employee")
Department.createOrReplaceTempView("Department")
sql_query = """
SELECT * FROM Department D
OUTER APPLY
(
SELECT * FROM Employee E
WHERE E.DepartmentID = D.DepartmentID
) A
"""
result_df = sqlContext.sql(sql_query)

OUTER APPLY is not an option in the Spark SQL Syntax
However, for a correlated subquery like the one in your example (a plain equality filter against the outer table), OUTER APPLY produces the same results as a LEFT OUTER JOIN.
LEFT OUTER JOIN is an option in the Spark SQL Syntax.
Using LEFT OUTER JOIN for your example would look like this in the Spark SQL Syntax,
sql_query = """
SELECT * FROM Department D
LEFT OUTER JOIN Employee E ON E.DepartmentID = D.DepartmentID
"""
Using LEFT OUTER JOIN for your example would look like this in the PySpark Syntax,
Department.join(Employee, Employee.DepartmentID == Department.DepartmentID, "left_outer") \
.show(truncate=False)

Related

Pyspark: add one row dynamically into the final dataframe

I've a final dataframe with this format:
Product_ID: string
Product_COD: string
Product_NAM: string
Product_VER: integer
ProductLine_NAM: string
Language_COD: string
ProductType_NAM: string
Load_DAT: integer
LoadEnd_DAT:integer
edmChange_DTT: timestamp
and I want to add a new row to that dataframe where the ID (Product_ID) is -1, the string columns contain 'Unknown', and the columns of every other data type are set to null.
I created this code:
id_column = "Product_ID"
df_lessOne = spark.createDataFrame(["-1"], "string").toDF(id_column) #create a new id_column row with -1
appended_df = finalDf.unionByName(df_lessOne, allowMissingColumns=True) #add the rest columns of dataframe with nulls
appended_df_filter = appended_df.filter(""+ id_column + " = '-1'")
columns = [item[0] for item in appended_df_filter.dtypes if item[1].startswith('string')] #select only string columns
# replace string columns with "Unknown"
for c_na in columns:
appended_df_filter = (appended_df_filter
.filter(""+ id_column + " = '-1'")
.withColumn(c_na, lit('Unknown'))
)
appended_df = appended_df.filter(""+ id_column + " <> '-1'")
dfs = [appended_df, appended_df_filter]
#add final -1 row to the final dataframe
finalDf = reduce(DataFrame.unionAll, dfs)
display(finalDf)
but unfortunately, it's not working well.
I'm trying to build this dynamically because I want to reuse it on other dataframes afterwards; I would only need to change id_column.
Can anyone please help me achieve this?
Thank you!
from pyspark.sql.types import *
from datetime import datetime
import pyspark.sql.functions as F
data2 = [
("xp3980","2103","Product_1",1,"PdLine_23","XX1","PNT_1",2,36636,datetime.strptime('2020-08-20 10:00:00', '%Y-%m-%d %H:%M:%S')),
("gi9387","2411","Product_2",1,"PdLine_44","YT89","PNT_6",2,35847,datetime.strptime('2021-07-21 7:00:00', '%Y-%m-%d %H:%M:%S'))
]
schema = StructType([ \
StructField("Product_ID",StringType(),True), \
StructField("Product_COD",StringType(),True), \
StructField("Product_NAM",StringType(),True), \
StructField("Product_VER", IntegerType(),True), \
StructField("ProductLine_NAM", StringType(), True), \
StructField("Language_COD", StringType(), True), \
StructField("ProductType_NAM", StringType(), True), \
StructField("Load_DAT", IntegerType(), True), \
StructField("LoadEnd_DAT", IntegerType(), True), \
StructField("edmChange_DTT", TimestampType(), True) \
])
my_df = spark.createDataFrame(data=data2,schema=schema)
# Build the single placeholder row. Product_ID is created as the string '-1'
# so it already matches my_df's StringType id column instead of relying on
# the union to widen an integer to a string.
df_res = spark.createDataFrame([("-1",)], ["Product_ID"])
for field in my_df.schema:
    if field.name == "Product_ID":
        continue
    # Check the type object itself: comparing str(dataType) to 'StringType'
    # depends on the type's string form, which is not stable across PySpark
    # versions.
    if isinstance(field.dataType, StringType):
        df_res = df_res.withColumn(field.name, F.lit("Unknown"))
    else:
        # Give the null the source column's type so both sides of the union
        # have the same schema.
        df_res = df_res.withColumn(field.name, F.lit(None).cast(field.dataType))
# Match columns by name so the result does not depend on the id column
# being the first column of my_df.
my_df.unionByName(df_res).show()
+----------+-----------+-----------+-----------+---------------+------------+---------------+--------+-----------+-------------------+
# |Product_ID|Product_COD|Product_NAM|Product_VER|ProductLine_NAM|Language_COD|ProductType_NAM|Load_DAT|LoadEnd_DAT| edmChange_DTT|
# +----------+-----------+-----------+-----------+---------------+------------+---------------+--------+-----------+-------------------+
# | xp3980| 2103| Product_1| 1| PdLine_23| XX1| PNT_1| 2| 36636|2020-08-20 10:00:00|
# | gi9387| 2411| Product_2| 1| PdLine_44| YT89| PNT_6| 2| 35847|2021-07-21 07:00:00|
# | -1| Unknown| Unknown| null| Unknown| Unknown| Unknown| null| null| null|
# +----------+-----------+-----------+-----------+---------------+------------+---------------+--------+-----------+-------------------+

How to translate SQL UPDATE query which uses inner join into PySpark?

I have two MS Access SQL queries which I want to convert into PySpark. The queries look like this (we have two tables Employee and Department):
UPDATE EMPLOYEE INNER JOIN [DEPARTMENT] ON
EMPLOYEE.STATEPROVINCE = [DEPARTMENT].[STATE_LEVEL]
SET EMPLOYEE.STATEPROVINCE = [DEPARTMENT]![STATE_ABBREVIATION];
UPDATE EMPLOYEE INNER JOIN [DEPARTMENT] ON
EMPLOYEE.STATEPROVINCE = [DEPARTMENT].[STATE_LEVEL]
SET EMPLOYEE.MARKET = [DEPARTMENT]![MARKET];
Test dataframes:
from pyspark.sql import functions as F
df_emp = spark.createDataFrame([(1, 'a'), (2, 'bb')], ['EMPLOYEE', 'STATEPROVINCE'])
df_emp.show()
# +--------+-------------+
# |EMPLOYEE|STATEPROVINCE|
# +--------+-------------+
# | 1| a|
# | 2| bb|
# +--------+-------------+
df_dept = spark.createDataFrame([('bb', 'b')], ['STATE_LEVEL', 'STATE_ABBREVIATION'])
df_dept.show()
# +-----------+------------------+
# |STATE_LEVEL|STATE_ABBREVIATION|
# +-----------+------------------+
# | bb| b|
# +-----------+------------------+
Running your SQL query in Microsoft Access does the following:
In PySpark, you can get it like this:
df = (df_emp.alias('a')
.join(df_dept.alias('b'), df_emp.STATEPROVINCE == df_dept.STATE_LEVEL, 'left')
.select(
*[c for c in df_emp.columns if c != 'STATEPROVINCE'],
F.coalesce('b.STATE_ABBREVIATION', 'a.STATEPROVINCE').alias('STATEPROVINCE')
)
)
df.show()
# +--------+-------------+
# |EMPLOYEE|STATEPROVINCE|
# +--------+-------------+
# | 1| a|
# | 2| b|
# +--------+-------------+
First you do a left join. Then, select.
The select has 2 parts.
First, you select everything from df_emp except for "STATEPROVINCE".
Then, for the new "STATEPROVINCE", you select "STATE_ABBREVIATION" from df_dept, but in case it's null (i.e. not existent in df_dept), you take "STATEPROVINCE" from df_emp.
For your second query, you only need to change values in the select statement:
df = (df_emp.alias('a')
.join(df_dept.alias('b'), df_emp.STATEPROVINCE == df_dept.STATE_LEVEL, 'left')
.select(
*[c for c in df_emp.columns if c != 'MARKET'],
F.coalesce('b.MARKET', 'a.MARKET').alias('MARKET')
)
)

Merging rows to map type based on max value in a column

I am doing a small POC to ingest the user events(CSV file) from a website. Below is the sample input:
Input Schema:
Output should be in the format as below
The logic required is to group by the id column and merge the
name and value columns to a Map type where the name column represents the key
and the value column represent the value in the Map type. The value to be picked for each key in the Map is the one with the highest value in the timestamp column.
I was able to achieve part of this: grouping by id and extracting the maximum of the timestamp column. I am having difficulty selecting, for each id, the value that corresponds to the max timestamp of each name, and merging the names and values into a map.
Below is my code
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
val schema = StructType(List(
StructField("id", LongType, nullable = true),
StructField("name", StringType, nullable = true),
StructField("value", StringType, nullable = true),
StructField("timestamp", LongType, nullable = true)))
val myDF = spark.read.schema(schema).option("header", "true").option("delimiter", ",").csv("wasbs:///HdiSamples/HdiSamples/SensorSampleData/hvac/tru.csv")
val df = myDF.toDF("id","name","value","timestamp")
//df.groupBy("id","name","value").agg(max("timestamp")).show()
val windowSpecAgg = Window.partitionBy("id")
df.withColumn("max", max(col("timestamp")).over(windowSpecAgg)).where(col("timestamp") === col("max")).drop("max").show()
Use a window function partitioned by "id" and "name" to keep only the latest row of each partition,
then use the map_from_arrays and to_json functions to build the desired JSON.
Example:
df.show()
//sample data
//+---+----+-------+---------+
//| id|name| value|timestamp|
//+---+----+-------+---------+
//| 1| A| Exited| 3201|
//| 1| A|Running| 5648|
//| 1| C| Exited| 3547|
//| 2| C|Success| 3612|
//+---+----+-------+---------+
val windowSpecAgg = Window.partitionBy("id","name").orderBy(desc("timestamp"))
df.withColumn("max", row_number().over(windowSpecAgg)).filter(col("max")===1).
drop("max").
groupBy("id").
agg(to_json(map_from_arrays(collect_list(col("name")),collect_list(col("value")))).as("settings")).
show(10,false)
//+---+----------------------------+
//|id |settings |
//+---+----------------------------+
//|1 |{"A":"Running","C":"Exited"}|
//|2 |{"C":"Success"} |
//+---+----------------------------+
You can use the ranking function row_number() to get the latest record per partition.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._
val df = Seq((1, "A", "Exited", 1546333201),
(3, "B", "Failed", 1546334201),
(2, "C", "Success", 1546333612),
(3, "B", "Hold", 1546333444),
(1, "A", "Running", 1546335648),
(1, "C", "Exited", 1546333547)).toDF("id", "name", "value", "timestamp")
df.withColumn("rn",
row_number().over(Window.partitionBy("id", "name").orderBy('timestamp.desc_nulls_last)))
.where('rn === 1)
.drop("rn")
.groupBy("id")
.agg(collect_list(map('name, 'value)).as("settings"))
.show(false)
/*
+---+-------------------------------+
|id |settings |
+---+-------------------------------+
|1 |[[A -> Running], [C -> Exited]]|
|3 |[[B -> Failed]] |
|2 |[[C -> Success]] |
+---+-------------------------------+ */

How do you remove an ambiguous column in pyspark?

There are many similar questions that ask something different, namely how to avoid duplicate columns in a join; that is not what I am asking here.
Given that I already have a DataFrame with ambiguous columns, how do I remove a specific column?
For example, given:
df = spark.createDataFrame(
spark.sparkContext.parallelize([
[1, 0.0, "ext-0.0"],
[1, 1.0, "ext-1.0"],
[2, 1.0, "ext-2.0"],
[3, 2.0, "ext-3.0"],
[4, 3.0, "ext-4.0"],
]),
StructType([
StructField("id", IntegerType(), True),
StructField("shared", DoubleType(), True),
StructField("shared", StringType(), True),
])
)
I wish to retain only the numeric columns.
However, attempting to do something like df.select("id", "shared").show() results in:
raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: "Reference 'shared' is ambiguous, could be: shared, shared.;"
Many related solutions to this problem are simply 'avoid ever getting into this situation', e.g. by using ['joinkey'] instead of a.joinkey = b.joinkey on the join. I reiterate that this is not the situation here; this relates to a dataframe that has already been converted into this form.
The metadata from the DF disambiguates these columns:
$ df.dtypes
[('id', 'int'), ('shared', 'double'), ('shared', 'string')]
$ df.schema
StructType(List(StructField(id,IntegerType,true),StructField(shared,DoubleType,true),StructField(shared,StringType,true)))
So the data is retained internally... I just can't see how to use it.
How do I pick one column over the other?
I expected to be able to use, eg. col('shared#11') or similar... but there is nothing like that I can see?
Is this simply not possible in spark?
To answer this question, I would ask, please post either a) a working code snippet that solves the problem above, or b) link to something official from the spark developers that this simply isn't supported?
The easiest solution to this problem is to rename the columns using df.toDF(...<new-col-names>...), but if you don't want to change the column names, you can group the duplicated columns by their type into a struct<type1, type2>, as shown below.
Please note that the solution below is written in Scala, but logically similar code can be implemented in Python. This solution also works for all duplicate columns in the dataframe.
1. Load the test data
val df = Seq((1, 2.0, "shared")).toDF("id", "shared", "shared")
df.show(false)
df.printSchema()
/**
* +---+------+------+
* |id |shared|shared|
* +---+------+------+
* |1 |2.0 |shared|
* +---+------+------+
*
* root
* |-- id: integer (nullable = false)
* |-- shared: double (nullable = false)
* |-- shared: string (nullable = true)
*/
2. get all the duplicated column names
// 1. get all the duplicated column names
val findDupCols = (cols: Array[String]) => cols.map((_ , 1)).groupBy(_._1).filter(_._2.length > 1).keys.toSeq
val dupCols = findDupCols(df.columns)
println(dupCols.mkString(", "))
// shared
3. rename duplicate cols like shared => shared:double, shared:string, without touching the other column names
val renamedDF = df
// 2. rename duplicate cols like shared => shared:double, shared:string
.toDF(df.schema
.map{case StructField(name, dt, _, _) =>
if(dupCols.contains(name)) s"$name:${dt.simpleString}" else name}: _*)
3. create struct of all cols
// 3. create struct of all cols
val structCols = df.schema.map(f => f.name -> f ).groupBy(_._1)
.map{case(name, seq) =>
if (seq.length > 1)
struct(
seq.map { case (_, StructField(fName, dt, _, _)) =>
expr(s"`$fName:${dt.simpleString}` as ${dt.simpleString}")
}: _*
).as(name)
else col(name)
}.toSeq
val structDF = renamedDF.select(structCols: _*)
structDF.show(false)
structDF.printSchema()
/**
* +-------------+---+
* |shared |id |
* +-------------+---+
* |[2.0, shared]|1 |
* +-------------+---+
*
* root
* |-- shared: struct (nullable = false)
* | |-- double: double (nullable = false)
* | |-- string: string (nullable = true)
* |-- id: integer (nullable = false)
*/
4. get column by their type using <column_name>.<datatype>
// Use the dataframe without losing any columns
structDF.selectExpr("id", "shared.double as shared").show(false)
/**
* +---+------+
* |id |shared|
* +---+------+
* |1 |2.0 |
* +---+------+
*/
Hope this is useful to someone!
It seems this is possible by replacing the schema using .rdd.toDF() on the dataframe.
However, I'll still accept any answer that is less convoluted and annoying than the one below:
import random
import string
from pyspark.sql.types import DoubleType, LongType
def makeId():
    """Return a random suffix of six lowercase ASCII letters."""
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(6))


def makeUnique(column):
    """Return ``column.name`` with a ``---<random id>`` suffix appended.

    ``column`` is any object with a ``name`` attribute.
    """
    return "%s---%s" % (column.name, makeId())


def makeNormal(column):
    """Return ``column.name`` with the suffix added by ``makeUnique`` removed.

    Only the last ``---`` is treated as the separator, so an original
    column name that itself contains ``---`` is restored intact. The
    generated suffix is letters only, so it never contains the separator.
    """
    return column.name.rsplit("---", 1)[0]
unique_schema = list(map(makeUnique, df.schema))
df_unique = df.rdd.toDF(schema=unique_schema)
df_unique.show()
numeric_cols = filter(lambda c: c.dataType.__class__ in [LongType, DoubleType], df_unique.schema)
numeric_col_names = list(map(lambda c: c.name, numeric_cols))
df_filtered = df_unique.select(*numeric_col_names)
df_filtered.show()
normal_schema = list(map(makeNormal, df_filtered.schema))
df_fixed = df_filtered.rdd.toDF(schema=normal_schema)
df_fixed.show()
Gives:
+-----------+---------------+---------------+
|id---chjruu|shared---aqboua|shared---ehjxor|
+-----------+---------------+---------------+
| 1| 0.0| ext-0.0|
| 1| 1.0| ext-1.0|
| 2| 1.0| ext-2.0|
| 3| 2.0| ext-3.0|
| 4| 3.0| ext-4.0|
+-----------+---------------+---------------+
+-----------+---------------+
|id---chjruu|shared---aqboua|
+-----------+---------------+
| 1| 0.0|
| 1| 1.0|
| 2| 1.0|
| 3| 2.0|
| 4| 3.0|
+-----------+---------------+
+---+------+
| id|shared|
+---+------+
| 1| 0.0|
| 1| 1.0|
| 2| 1.0|
| 3| 2.0|
| 4| 3.0|
+---+------+
Workaround: Simply rename the columns (in order) and then do whatever you wanted to do!
renamed_df = df.toDF("id", "shared_double", "shared_string")

Set schema in pyspark dataframe read.csv with null elements

I have a data set (example) that when imported with
df = spark.read.csv(filename, header=True, inferSchema=True)
df.show()
the column containing 'NA' is assigned StringType(), whereas I would like it to be IntegerType() (or ByteType()).
I then tried to set
schema = StructType([
StructField("col_01", IntegerType()),
StructField("col_02", DateType()),
StructField("col_03", IntegerType())
])
df = spark.read.csv(filename, header=True, schema=schema)
df.show()
In the output, every row whose col_03 value is 'NA' comes back with all of its columns null.
However col_01 and col_02 return appropriate data if they are called with
df.select(['col_01','col_02']).show()
I can work around this by casting the data type of col_03 after loading:
# Workaround: let Spark infer the schema, then cast the column afterwards.
df = spark.read.csv(filename, header=True, inferSchema=True)
# The column is named col_03 in the schema and sample file; 'col_3' does not exist.
df = df.withColumn('col_03', df['col_03'].cast(IntegerType()))
df.show()
but I think this is not ideal, and it would be much better if I could assign the data type of each column directly by setting the schema.
Could anyone tell me what I am doing incorrectly? Or is casting the data types after importing the only solution? Any comment on the performance of the two approaches (if assigning the schema can be made to work) is also welcome.
Thank you,
You can set a new null value in spark's csv loader using nullValue:
for a csv file looking like this:
col_01,col_02,col_03
111,2007-11-18,3
112,2002-12-03,4
113,2007-02-14,5
114,2003-04-16,NA
115,2011-08-24,2
116,2003-05-03,3
117,2001-06-11,4
118,2004-05-06,NA
119,2012-03-25,5
120,2006-10-13,4
and forcing schema:
from pyspark.sql.types import StructType, IntegerType, DateType
schema = StructType([
StructField("col_01", IntegerType()),
StructField("col_02", DateType()),
StructField("col_03", IntegerType())
])
You'll get:
df = spark.read.csv(filename, header=True, nullValue='NA', schema=schema)
df.show()
df.printSchema()
+------+----------+------+
|col_01| col_02|col_03|
+------+----------+------+
| 111|2007-11-18| 3|
| 112|2002-12-03| 4|
| 113|2007-02-14| 5|
| 114|2003-04-16| null|
| 115|2011-08-24| 2|
| 116|2003-05-03| 3|
| 117|2001-06-11| 4|
| 118|2004-05-06| null|
| 119|2012-03-25| 5|
| 120|2006-10-13| 4|
+------+----------+------+
root
|-- col_01: integer (nullable = true)
|-- col_02: date (nullable = true)
|-- col_03: integer (nullable = true)
Try this (note that it reads every column as a string; you can cast the columns to the types you need afterwards):
import csv
from pyspark.sql.types import IntegerType
# Parse the CSV with the standard-library reader; each row becomes a dict
# keyed by the header names. newline='' leaves line-ending handling to the
# csv module, as its documentation recommends.
with open('filename', 'r', newline='') as doc:
    data = list(csv.DictReader(doc))
df = sc.parallelize(data).toDF()
# Cast the column to integer after loading.
df = df.withColumn("col_03", df["col_03"].cast(IntegerType()))

Resources