Convert dataframe into array of nested json object in pyspark - apache-spark

I have created a dataframe as follows:
+---+------+----+
|age|number|name|
+---+------+----+
| 16|    12|   A|
| 16|    13|   B|
| 17|    16|   E|
| 17|    17|   F|
+---+------+----+
How can I convert it into the following JSON:
{
  'age': 16,
  'values': [{'number': '12', 'name': 'A'}, {'number': '13', 'name': 'B'}]
}, {
  'age': 17,
  'values': [{'number': '16', 'name': 'E'}, {'number': '17', 'name': 'F'}]
}

Assuming df is your dataframe:
from pyspark.sql import functions as F

new_df = df.select(
    "age",
    F.struct(
        F.col("number"),
        F.col("name"),
    ).alias("values")
).groupBy(
    "age"
).agg(
    F.collect_list("values").alias("values")
)
new_df.toJSON()
# or
new_df.write.json(...)
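For a quick look at the result locally, each element returned by toJSON() is one JSON string per age group. A minimal sketch, assuming the small dataframe above fits in driver memory (group order and the order inside collect_list are not guaranteed):
# Collect the JSON lines to the driver for inspection (small data only).
for line in new_df.toJSON().collect():
    print(line)
# Expected shape of the output:
# {"age":16,"values":[{"number":12,"name":"A"},{"number":13,"name":"B"}]}
# {"age":17,"values":[{"number":16,"name":"E"},{"number":17,"name":"F"}]}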

You can convert the DF to an RDD and apply your transformations:
import json

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

NewSchema = StructType([
    StructField("age", IntegerType()),
    StructField("values", StringType())
])

res_df = df.rdd.map(lambda row: (row[0], [{'number': row[1], 'name': row[2]}]))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda row: (row[0], json.dumps(row[1])))\
    .toDF(NewSchema)
res_df.show(20, False)
Output of res_df.show():
+---+-----------------------------------------------------------+
|age|values                                                     |
+---+-----------------------------------------------------------+
|16 |[{"number": 12, "name": "A"}, {"number": 13, "name": "B"}] |
|17 |[{"number": 17, "name": "F"}, {"number": 16, "name": "E"}] |
+---+-----------------------------------------------------------+
Saving the DF as a JSON file:
res_df.coalesce(1).write.format('json').save('output.json')
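Note that this RDD route leaves values as a plain JSON string rather than a real array column. If a nested array is needed downstream, one option (a sketch, assuming a Spark version where from_json is available) is to parse it back:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, IntegerType, StringType, StructField, StructType

# Parse the JSON string column back into an array<struct> column so it can be
# written out as genuinely nested JSON.
values_schema = ArrayType(StructType([
    StructField("number", IntegerType()),
    StructField("name", StringType())
]))
parsed_df = res_df.withColumn("values", F.from_json(F.col("values"), values_schema))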

Related

Sum of array of dictionaries depending on value condition pyspark (spark structured streaming)

I have the following schema
tick_by_tick_schema = StructType([
    StructField('localSymbol', StringType()),
    StructField('time', StringType()),
    StructField('open', StringType()),
    StructField('previous_price', StringType()),
    StructField('tickByTicks', ArrayType(StructType([
        StructField('price', StringType()),
        StructField('size', StringType()),
        StructField('specialConditions', StringType()),
    ])))
])
and I have the following dataframe (in spark structured streaming):
+-----------+--------------------------------+--------------+----------------------------------------------------+
|localSymbol|time |previous_price|tickByTicks |
+-----------+--------------------------------+--------------+----------------------------------------------------+
|BABA |2021-06-10 19:25:38.154245+00:00|213.76 |[{213.75, 100, }] |
|BABA |2021-06-10 19:25:38.155229+00:00|213.76 |[{213.75, 100, }, {213.78, 100, }, {213.78, 200, }] |
|BABA |2021-06-10 19:25:39.662033+00:00|213.73 |[{213.72, 100, }] |
|BABA |2021-06-10 19:25:39.662655+00:00|213.72 |[{213.72, 100, }, {213.73, 100, }] |
+-----------+--------------------------------+--------------+----------------------------------------------------+
I would like to create two columns based on the following logic:
Column_low: WHEN tickByTicks.price < previous_price THEN sum(tickByTicks.size)
Column_high: WHEN tickByTicks.price > previous_price THEN sum(tickByTicks.size)
The result would be:
+-----------+--------------------------------+--------------+----------------------------------------------------+----------+-----------+
|localSymbol|time |previous_price|tickByTicks |Column_low|Column_high|
+-----------+--------------------------------+--------------+----------------------------------------------------+----------+-----------+
|BABA |2021-06-10 19:25:38.154245+00:00|213.76 |[{213.75, 100, }] |100 |0 |
|BABA |2021-06-10 19:25:38.155229+00:00|213.76 |[{213.75, 100, }, {213.78, 100, }, {213.78, 200, }] |100 |300 |
|BABA |2021-06-10 19:25:39.662033+00:00|213.73 |[{213.72, 100, }] |100 |0 |
|BABA |2021-06-10 19:25:39.662655+00:00|213.72 |[{213.72, 100, }, {213.73, 100, }] |0 |100 |
+-----------+--------------------------------+--------------+----------------------------------------------------+----------+-----------+
I have tried something similar, but I have not achieved the expected result:
tick_by_tick_data_processed = kafka_df_structured_with_tick_by_tick_data_values.select(
    f.col('localSymbol'),
    f.col('time'),
    f.col('previous_price'),
    f.col('tickByTicks'),
    f.expr("aggregate(filter(tickByTicks.size, x -> x > previous_price), 0D, (x, acc) -> acc + x)")
).show(30, False)
I can't test my solution, but I think this may work:
tick_by_tick_data_processed = kafka_df_structured_with_tick_by_tick_data_values.select(
    f.col('localSymbol'),
    f.col('time'),
    f.col('previous_price'),
    f.col('tickByTicks'),
    f.expr("aggregate(tickByTicks, 0D, (acc, tick) -> IF(tick.price < previous_price, acc + tick.size, acc))").alias("Column_low"),
    f.expr("aggregate(tickByTicks, 0D, (acc, tick) -> IF(tick.price > previous_price, acc + tick.size, acc))").alias("Column_high")
)
This works using explode and the sum function:
from pyspark.sql.window import Window
import pyspark.sql.functions as f
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import explode
data = [
("BABA", "2021-06-10 19:25:38.154245+00:00" ,"213.76" ,[("213.75", "100")] ),
("BABA", "2021-06-10 19:25:38.155229+00:00" ,"213.76" ,[("213.75", "100"),("213.78", "100"),("213.78", "200")] ),
("BABA", "2021-06-10 19:25:39.662033+00:00" ,"213.73" ,[("213.72", "100")] ),
("BABA", "2021-06-10 19:25:39.662655+00:00" ,"213.72" ,[("213.72", "100"),("213.73", "100")] ),
]
tick_by_tick_schema = StructType([
    StructField('localSymbol', StringType()),
    StructField('time', StringType()),
    StructField('previous_price', StringType()),
    StructField('tickByTicks', ArrayType(StructType([
        StructField('price', StringType()),
        StructField('size', StringType())
    ])))
])
df = spark.createDataFrame(data=data, schema=tick_by_tick_schema)
df = df.withColumn("idx", monotonically_increasing_id())
df=df.withColumn("col3", explode(df.tickByTicks))
df.createOrReplaceTempView("calc")
spark.sql("select localSymbol, time, previous_price, idx, tickByTicks, sum(case when col3.price < previous_price then col3.size else 0 end) as Column_low, sum(case when col3.price > previous_price then col3.size else 0 end) as Column_high from calc group by localSymbol, time, previous_price, idx, tickByTicks").drop("idx").show(truncate=0)
Results
+-----------+--------------------------------+--------------+---------------------------------------------+----------+-----------+
|localSymbol|time                            |previous_price|tickByTicks                                  |Column_low|Column_high|
+-----------+--------------------------------+--------------+---------------------------------------------+----------+-----------+
|BABA       |2021-06-10 19:25:39.662033+00:00|213.73        |[[213.72, 100]]                              |100.0     |0.0        |
|BABA       |2021-06-10 19:25:38.154245+00:00|213.76        |[[213.75, 100]]                              |100.0     |0.0        |
|BABA       |2021-06-10 19:25:39.662655+00:00|213.72        |[[213.72, 100], [213.73, 100]]               |0.0       |100.0      |
|BABA       |2021-06-10 19:25:38.155229+00:00|213.76        |[[213.75, 100], [213.78, 100], [213.78, 200]]|100.0     |300.0      |
+-----------+--------------------------------+--------------+---------------------------------------------+----------+-----------+
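The same query can also be written with the DataFrame API instead of a temp view. A rough sketch, reusing the exploded col3 column added above (sum/when mirror the CASE expressions):
result = df.groupBy("localSymbol", "time", "previous_price", "idx", "tickByTicks").agg(
    # Sizes of ticks priced below the previous price.
    f.sum(f.when(f.col("col3.price") < f.col("previous_price"), f.col("col3.size")).otherwise(0)).alias("Column_low"),
    # Sizes of ticks priced above the previous price.
    f.sum(f.when(f.col("col3.price") > f.col("previous_price"), f.col("col3.size")).otherwise(0)).alias("Column_high"),
).drop("idx")
result.show(truncate=False)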

Spark Aggregating multiple columns (possible to array) from join output

I have the below datasets:
Table1
Table2
Now I would like to get the below dataset. I've tried a left outer join on Table1.id == Table2.departmentid, but I am not getting the desired output.
Later, I need to use this table to get several counts and convert the data into XML. I will be doing this conversion using map.
Any help would be appreciated.
Joining alone is not enough to get the desired output. You are probably missing something: the last element of each nested array might be departmentid. Assuming the last element of the nested array is departmentid, I've generated the output in the following way:
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.collect_list

case class department(id: Integer, deptname: String)
case class employee(employeid: Integer, empname: String, departmentid: Integer)

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val department_df = Seq(department(1, "physics"),
  department(2, "computer")).toDF()
val emplyoee_df = Seq(employee(1, "A", 1),
  employee(2, "B", 1),
  employee(3, "C", 2),
  employee(4, "D", 2)).toDF()

val result = department_df.join(emplyoee_df, department_df("id") === emplyoee_df("departmentid"), "left").
  selectExpr("id", "deptname", "employeid", "empname").
  rdd.map {
    case Row(id: Integer, deptname: String, employeid: Integer, empname: String) =>
      (id, deptname, Array(employeid.toString, empname, id.toString))
  }.toDF("id", "deptname", "arrayemp").
  groupBy("id", "deptname").
  agg(collect_list("arrayemp").as("emplist")).
  orderBy("id", "deptname")
The output looks like this:
result.show(false)
+---+--------+----------------------+
|id |deptname|emplist |
+---+--------+----------------------+
|1 |physics |[[2, B, 1], [1, A, 1]]|
|2 |computer|[[4, D, 2], [3, C, 2]]|
+---+--------+----------------------+
Explanation: if I break down the last dataframe transformation into multiple steps, it will probably make clear how the output is generated.
left outer join between department_df and employee_df
val df1 = department_df.join(emplyoee_df, department_df("id") === emplyoee_df("departmentid"), "left").
selectExpr("id", "deptname", "employeid", "empname")
df1.show()
+---+--------+---------+-------+
| id|deptname|employeid|empname|
+---+--------+---------+-------+
| 1| physics| 2| B|
| 1| physics| 1| A|
| 2|computer| 4| D|
| 2|computer| 3| C|
+---+--------+---------+-------+
Creating an array from some of the columns' values of the df1 dataframe:
val df2 = df1.rdd.map {
case Row(id:Integer, deptname:String, employeid:Integer, empname:String) => (id, deptname, Array(employeid.toString, empname, id.toString))
}.toDF("id", "deptname", "arrayemp")
df2.show()
+---+--------+---------+
| id|deptname| arrayemp|
+---+--------+---------+
| 1| physics|[2, B, 1]|
| 1| physics|[1, A, 1]|
| 2|computer|[4, D, 2]|
| 2|computer|[3, C, 2]|
+---+--------+---------+
Creating a new list by aggregating multiple arrays from the df2 dataframe:
val result = df2.groupBy("id", "deptname").
agg(collect_list("arrayemp").as("emplist")).
orderBy("id", "deptname")
result.show(false)
+---+--------+----------------------+
|id |deptname|emplist |
+---+--------+----------------------+
|1 |physics |[[2, B, 1], [1, A, 1]]|
|2 |computer|[[4, D, 2], [3, C, 2]]|
+---+--------+----------------------+
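Since most of this page is PySpark, here is a rough PySpark sketch of the same idea that avoids the RDD round-trip (department_df and employee_df are assumed to be dataframes with the columns shown above):
from pyspark.sql import functions as F

# Build the per-employee array with F.array instead of mapping over the RDD,
# then collect the arrays per department.
result = (
    department_df
    .join(employee_df, department_df["id"] == employee_df["departmentid"], "left")
    .select(
        "id",
        "deptname",
        F.array(
            F.col("employeid").cast("string"),
            F.col("empname"),
            F.col("id").cast("string"),
        ).alias("arrayemp"),
    )
    .groupBy("id", "deptname")
    .agg(F.collect_list("arrayemp").alias("emplist"))
    .orderBy("id", "deptname")
)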
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
val df = spark.sparkContext.parallelize(Seq(
(1,"Physics"),
(2,"Computer"),
(3,"Maths")
)).toDF("ID","Dept")
val schema = List(
StructField("EMPID", IntegerType, true),
StructField("EMPNAME", StringType, true),
StructField("DeptID", IntegerType, true)
)
val data = Seq(
Row(1,"A",1),
Row(2,"B",1),
Row(3,"C",2),
Row(4,"D",2) ,
Row(5,"E",null)
)
val df_emp = spark.createDataFrame(
spark.sparkContext.parallelize(data),
StructType(schema)
)
val newdf = df_emp.withColumn("CONC",array($"EMPID",$"EMPNAME",$"DeptID")).groupBy($"DeptID").agg(expr("collect_list(CONC) as emplist"))
df.join(newdf,df.col("ID") === df_emp.col("DeptID")).select($"ID",$"Dept",$"emplist").show()
+---+--------+--------------------+
| ID|    Dept|             emplist|
+---+--------+--------------------+
|  1| Physics|[[1, A, 1], [2, B...|
|  2|Computer|[[3, C, 2], [4, D...|
+---+--------+--------------------+

pyspark. zip arrays in a dataframe

I have the following PySpark DataFrame:
+------+----------------+
| id| data |
+------+----------------+
| 1| [10, 11, 12]|
| 2| [20, 21, 22]|
| 3| [30, 31, 32]|
+------+----------------+
At the end, I want to have the following DataFrame
+--------+----------------------------------+
| id | data |
+--------+----------------------------------+
| [1,2,3]|[[10,20,30],[11,21,31],[12,22,32]]|
+--------+----------------------------------+
In order to do this, first I extract the data arrays as follows:
tmp_array = df_test.select("data").rdd.flatMap(lambda x: x).collect()
a0 = tmp_array[0]
a1 = tmp_array[1]
a2 = tmp_array[2]
samples = zip(a0, a1, a2)
samples1 = sc.parallelize(samples)
This way, samples1 is an RDD with the content
[[10,20,30],[11,21,31],[12,22,32]]
Question 1: Is that a good way to do it?
Question 2: How to include that RDD back into the dataframe?
Here is a way to get your desired output without serializing to rdd or using a udf. You will need two constants:
The number of rows in your DataFrame (df.count())
The length of data (given)
Use pyspark.sql.functions.collect_list() and pyspark.sql.functions.array() in a double list comprehension to pick out the elements of "data" in the order you want using pyspark.sql.Column.getItem():
import pyspark.sql.functions as f

dataLength = 3
numRows = df.count()

df.select(
    f.collect_list("id").alias("id"),
    f.array(
        [
            f.array(
                [f.collect_list("data").getItem(j).getItem(i)
                 for j in range(numRows)]
            )
            for i in range(dataLength)
        ]
    ).alias("data")
)\
.show(truncate=False)
#+---------+------------------------------------------------------------------------------+
#|id |data |
#+---------+------------------------------------------------------------------------------+
#|[1, 2, 3]|[WrappedArray(10, 20, 30), WrappedArray(11, 21, 31), WrappedArray(12, 22, 32)]|
#+---------+------------------------------------------------------------------------------+
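A shorter variant of the same idea (still a sketch, with the same caveat that collect_list does not guarantee ordering) collects the i-th element of data across all rows directly:
# collect_list(data[i]) gathers the i-th element of every row's array,
# which is exactly one row of the transposed result.
df.select(
    f.collect_list("id").alias("id"),
    f.array(*[f.collect_list(f.col("data")[i]) for i in range(dataLength)]).alias("data")
).show(truncate=False)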
You can simply use a udf for the zip function, but before that you will have to use the collect_list function:
from pyspark.sql import functions as f
from pyspark.sql import types as t

def zipUdf(array):
    # Wrap in list() for Python 3, where zip returns a lazy iterator.
    return [list(x) for x in zip(*array)]

zipping = f.udf(zipUdf, t.ArrayType(t.ArrayType(t.IntegerType())))

df.select(
    f.collect_list(df.id).alias('id'),
    zipping(f.collect_list(df.data)).alias('data')
).show(truncate=False)
which would give you
+---------+------------------------------------------------------------------------------+
|id |data |
+---------+------------------------------------------------------------------------------+
|[1, 2, 3]|[WrappedArray(10, 20, 30), WrappedArray(11, 21, 31), WrappedArray(12, 22, 32)]|
+---------+------------------------------------------------------------------------------+

Pyspark - Retain null values when using collect_list

According to the accepted answer in pyspark collect_set or collect_list with groupby, when you do a collect_list on a certain column, the null values in this column are removed. I have checked and this is true.
But in my case, I need to keep the null columns -- How can I achieve this?
I did not find any info on this kind of a variant of collect_list function.
Background context to explain why I want nulls:
I have a dataframe df as below:
cId | eId | amount | city
1 | 2 | 20.0 | Paris
1 | 2 | 30.0 | Seoul
1 | 3 | 10.0 | Phoenix
1 | 3 | 5.0 | null
I want to write this to an Elasticsearch index with the following mapping:
"mappings": {
"doc": {
"properties": {
"eId": { "type": "keyword" },
"cId": { "type": "keyword" },
"transactions": {
"type": "nested",
"properties": {
"amount": { "type": "keyword" },
"city": { "type": "keyword" }
}
}
}
}
}
In order to conform to the nested mapping above, I transformed my df so that for each combination of eId and cId, I have an array of transactions like this:
df_nested = df.groupBy('eId','cId').agg(collect_list(struct('amount','city')).alias("transactions"))
df_nested.printSchema()
root
|-- cId: integer (nullable = true)
|-- eId: integer (nullable = true)
|-- transactions: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- amount: float (nullable = true)
| | |-- city: string (nullable = true)
Saving df_nested as a JSON file, these are the JSON records that I get:
{"cId":1,"eId":2,"transactions":[{"amount":20.0,"city":"Paris"},{"amount":30.0,"city":"Seoul"}]}
{"cId":1,"eId":3,"transactions":[{"amount":10.0,"city":"Phoenix"},{"amount":30.0}]}
As you can see - when cId=1 and eId=3, one of my array elements where amount=30.0 does not have the city attribute because this was a null in my original data (df). The nulls are being removed when I use the collect_list function.
However, when I try writing df_nested to elasticsearch with the above index, it errors because there is a schema mismatch. This is basically the reason as to why I want to retain my nulls after applying the collect_list function.
from pyspark.sql.functions import create_map, collect_list, lit, col, to_json, from_json
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, SparkSession, types, Row
from pyspark.sql import functions as f
import os
app_name = "CollList"
conf = SparkConf().setAppName(app_name)
spark = SparkSession.builder.appName(app_name).config(conf=conf).enableHiveSupport().getOrCreate()
df = spark.createDataFrame([[1, 2, 20.0, "Paris"], [1, 2, 30.0, "Seoul"],
[1, 3, 10.0, "Phoenix"], [1, 3, 5.0, None]],
["cId", "eId", "amount", "city"])
print("Actual data")
df.show(10,False)
```
Actual data
+---+---+------+-------+
|cId|eId|amount|city |
+---+---+------+-------+
|1 |2 |20.0 |Paris |
|1 |2 |30.0 |Seoul |
|1 |3 |10.0 |Phoenix|
|1 |3 |5.0 |null |
+---+---+------+-------+
```
#collect_list that skips null columns
df1 = df.groupBy(f.col('city'))\
.agg(f.collect_list(f.to_json(f.struct([f.col(x).alias(x) for x in (c for c in df.columns if c != 'cId' and c != 'eId' )])))).alias('newcol')
print("Collect List Data - Missing Null Columns in the list")
df1.show(10, False)
```
Collect List Data - Missing Null Columns in the list
+-------+-------------------------------------------------------------------------------------------------------------------+
|city |collect_list(structstojson(named_struct(NamePlaceholder(), amount AS `amount`, NamePlaceholder(), city AS `city`)))|
+-------+-------------------------------------------------------------------------------------------------------------------+
|Phoenix|[{"amount":10.0,"city":"Phoenix"}] |
|null |[{"amount":5.0}] |
|Paris |[{"amount":20.0,"city":"Paris"}] |
|Seoul |[{"amount":30.0,"city":"Seoul"}] |
+-------+-------------------------------------------------------------------------------------------------------------------+
```
my_list = []
for x in (c for c in df.columns if c != 'cId' and c != 'eId'):
    my_list.append(lit(x))
    my_list.append(col(x))

grp_by = ["eId", "cId"]
df_nested = df.withColumn("transactions", create_map(my_list))\
    .groupBy(grp_by)\
    .agg(collect_list(f.to_json("transactions")).alias("transactions"))
print("collect list after create_map")
df_nested.show(10,False)
```
collect list after create_map
+---+---+--------------------------------------------------------------------+
|eId|cId|transactions |
+---+---+--------------------------------------------------------------------+
|2 |1 |[{"amount":"20.0","city":"Paris"}, {"amount":"30.0","city":"Seoul"}]|
|3 |1 |[{"amount":"10.0","city":"Phoenix"}, {"amount":"5.0","city":null}] |
+---+---+--------------------------------------------------------------------+
```
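For completeness: with the struct approach the structs themselves do survive collect_list; it is the JSON serialization that drops null fields. If you are on Spark 3.0+, the JSON writer has an ignoreNullFields option that may let you keep the original array-of-structs schema instead of the map workaround. A sketch, not tested against the Elasticsearch mapping above (the output path is just an example):
# Keep the array<struct> column and ask the JSON writer to emit null fields
# explicitly instead of dropping them (Spark 3.0+).
df_nested = df.groupBy('eId', 'cId').agg(
    f.collect_list(f.struct('amount', 'city')).alias('transactions')
)
df_nested.write.option("ignoreNullFields", "false").json("/tmp/df_nested_json")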

Spark: To create map type column based value from column using Dataframes

Sample input :
Item_Id Item_Name Buyer's_Id Buyers_Name
0001 Keyboard 10000 ABC
0002 Monitor 10010 XYZ
0001 Keyboard 10005 DXC
Sample intermediate Output:
0001,Keyboard,{"Buyer's_Id":"10000","Buyers_Name":"ABC"}
0002,Monitor,{"Buyer's_Id":"10010","Buyers_Name":"XYZ"}
0001,Keyboard,{"Buyer's_Id":"10005","Buyers_Name":"DXC"}
Final output:
0001,Keyboard,[{"Buyer's_Id":"10000","Buyers_Name":"Abc"},{"Buyer's_Id":"10005","Buyers_Name":"DXC"}]
0002,Monitor,[{"Buyer's_Id":"10010","Buyers_Name":"XYZ"}]
What you want to achieve can be done using either map (processing each row) or mapPartitions (processing each partition).
scala> input_df.show
+-------+---------+----------+-----------+
|Item_Id|Item_Name|Buyer's_Id|Buyers_Name|
+-------+---------+----------+-----------+
| 1| Keyboard| 10000| ABC|
| 2| Monitor| 10010| XYZ|
| 1| Keyboard| 10005| DXC|
+-------+---------+----------+-----------+
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.collect_set
Since your intermediate dataframe has a different schema, we need to define that new schema:
scala> val schema = StructType(Seq(
| StructField("item_number", IntegerType),
| StructField("item_name", StringType),
| StructField("json_string", StringType)
| ))
scala> val encoder = RowEncoder(schema)
scala> val intermediate_df = input_df.map{row =>
| val itm_nbr = row.getAs[Integer]("Item_Id")
| val itm_nme = row.getAs[String]("Item_Name")
| val byer_id = row.getAs[Integer]("Buyer's_Id")
| val byer_nme = row.getAs[String]("Buyers_Name")
| val req_string = s"""{"Buyer's_id" : $byer_id,"Buyers_Name" : $byer_nme}"""
| Row(itm_nbr,itm_nme,req_string)
| }(encoder)
intermediate_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [item_number: int, item_name: string ... 1 more field]
scala> intermediate_df.show(false)
+-----------+---------+-------------------------------------------+
|item_number|item_name|json_string |
+-----------+---------+-------------------------------------------+
|1 |Keyboard |{"Buyer's_id" : 10000,"Buyers_Name" : ABC}|
|2 |Monitor |{"Buyer's_id" : 10010,"Buyers_Name" : XYZ}|
|1 |Keyboard |{"Buyer's_id" : 10005,"Buyers_Name" : DXC}|
+-----------+---------+-------------------------------------------+
scala> val result_df = intermediate_df.groupBy('item_number,'item_name).agg(collect_set('json_string).as("json_list")).orderBy('item_number)
result_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [item_number: int, item_name: string ... 1 more field]
scala> result_df.show(false)
+-----------+---------+------------------------------------------------------------------------------------------+
|item_number|item_name|json_list |
+-----------+---------+------------------------------------------------------------------------------------------+
|1 |Keyboard |[{"Buyer's_id" : 10000,"Buyers_Name" : ABC}, {"Buyer's_id" : 10005,"Buyers_Name" : DXC}]|
|2 |Monitor |[{"Buyer's_id" : 10010,"Buyers_Name" : XYZ}] |
+-----------+---------+------------------------------------------------------------------------------------------+
Hope this was helpful!
dF.select(
$"Item_Id",
$"Item_Name",
map(
lit("Buyer's_Id"),$"Buyer's_Id",
lit("Buyers_Name"),$"Buyers_Name"
).as("newCol")
).groupBy("Item_Id","Item_Name")
.agg(
collect_set($"newCol").as("mapCol")
).orderBy("Item_Id")
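For the PySpark readers of this page, a rough equivalent sketch (assuming a dataframe dF with the same column names; to_json on a struct produces the JSON strings shown in the desired output):
from pyspark.sql import functions as F

# Serialize the buyer fields of each row to a JSON string, then collect the
# strings per item to get the array-of-objects shape from the question.
result = (
    dF.select(
        "Item_Id",
        "Item_Name",
        F.to_json(F.struct(F.col("Buyer's_Id"), F.col("Buyers_Name"))).alias("buyer_json"),
    )
    .groupBy("Item_Id", "Item_Name")
    .agg(F.collect_list("buyer_json").alias("buyers"))
    .orderBy("Item_Id")
)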
