PySpark: convert the column into a list - apache-spark

In my PySpark dataframe, there is a column that is being inferred as a string, but it is actually a list. I need to convert it into a list type.
Sample value of the column:
[{"#ID":"07D40CB5-273E-4814-B82C-B5DCA7145D20","#ProductName":"Event Registration","#ProductCode":null,"#Type":"Admission Item","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#Status":"Active"},{"CustomFieldDetail":{"#FieldName":"Description","#FieldType":"Open Ended Text - Comment Box","#FieldValue":null,"#FieldId":"C6D46AD1-9B3F-45FF-9331-27EA47811E37"},"#ID":"8EA83E8B-7573-4550-905D-D4320496AD89","#ProductName":"Test","#ProductCode":null,"#Type":"Session","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#StartTime":"2018-01-29T18:00:00","#EndTime":"2018-01-29T19:00:00","#Status":"Active"}]
Please let me know how to achieve this.

This is a dynamic solution: it first converts the JSON-string column to an RDD, then reads the RDD into a dataframe, and finally uses the inferred schema to parse the JSON strings in the original dataframe.
import pyspark.sql.functions as F
import pyspark.sql.types as T
This is not part of the solution, just the creation of the sample dataframe:
str1 = '''[{"#ID":"str1_1","a":11},{"#ID":"str1_2","d":[41,42,43]}]'''
str2 = '''[{"#ID":"str2_1","a":101},{"#ID":"str2_2","c":301},{"#ID":"str2_3","b":201,"nested":{"a":1001,"b":2001}}]'''
df = spark.createDataFrame([(1, str1,),(2, str2,)],['id', 'json_str'])
This is the actual solution:
df_json_str = spark.read.json(df.rdd.map(lambda x:x.json_str))
json_str_schema = T.ArrayType(df_json_str.schema)
df = df.withColumn('json_str', F.from_json('json_str',json_str_schema))
df_json_str.printSchema()
root
|-- #ID: string (nullable = true)
|-- a: long (nullable = true)
|-- b: long (nullable = true)
|-- c: long (nullable = true)
|-- d: array (nullable = true)
| |-- element: long (containsNull = true)
|-- nested: struct (nullable = true)
| |-- a: long (nullable = true)
| |-- b: long (nullable = true)
df.printSchema()
root
|-- id: long (nullable = true)
|-- json_str: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- #ID: string (nullable = true)
| | |-- a: long (nullable = true)
| | |-- b: long (nullable = true)
| | |-- c: long (nullable = true)
| | |-- d: array (nullable = true)
| | | |-- element: long (containsNull = true)
| | |-- nested: struct (nullable = true)
| | | |-- a: long (nullable = true)
| | | |-- b: long (nullable = true)
df.show(truncate=False)
+---+-----------------------------------------------------------------------------------------------------------------------------+
|id |json_str |
+---+-----------------------------------------------------------------------------------------------------------------------------+
|1 |[{str1_1, 11, null, null, null, null}, {str1_2, null, null, null, [41, 42, 43], null}] |
|2 |[{str2_1, 101, null, null, null, null}, {str2_2, null, null, 301, null, null}, {str2_3, null, 201, null, null, {1001, 2001}}]|
+---+-----------------------------------------------------------------------------------------------------------------------------+
A quick demonstration that the result is correct and that the dataframe explodes properly:
df_explode = df.selectExpr('id', 'inline(json_str)')
df_explode.printSchema()
root
|-- id: long (nullable = true)
|-- #ID: string (nullable = true)
|-- a: long (nullable = true)
|-- b: long (nullable = true)
|-- c: long (nullable = true)
|-- d: array (nullable = true)
| |-- element: long (containsNull = true)
|-- nested: struct (nullable = true)
| |-- a: long (nullable = true)
| |-- b: long (nullable = true)
df_explode.show(truncate=False)
+---+------+----+----+----+------------+------------+
|id |#ID |a |b |c |d |nested |
+---+------+----+----+----+------------+------------+
|1 |str1_1|11 |null|null|null |null |
|1 |str1_2|null|null|null|[41, 42, 43]|null |
|2 |str2_1|101 |null|null|null |null |
|2 |str2_2|null|null|301 |null |null |
|2 |str2_3|null|201 |null|null |{1001, 2001}|
+---+------+----+----+----+------------+------------+
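Side note: from Spark 3.4 the inline generator is also exposed in the Python API as F.inline, so if your version is recent enough (an assumption on my part) the selectExpr call above can be written with the DataFrame API:
import pyspark.sql.functions as F

# Requires Spark 3.4+, where inline() exists in pyspark.sql.functions
df_explode = df.select('id', F.inline('json_str'))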
Demonstration on the OP's sample:
import pyspark.sql.functions as F
import pyspark.sql.types as T
This is not part of the solution, just the creation of the sample dataframe:
str1 = '''[{"#ID":"07D40CB5-273E-4814-B82C-B5DCA7145D20","#ProductName":"Event Registration","#ProductCode":null,"#Type":"Admission Item","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#Status":"Active"},{"CustomFieldDetail":{"#FieldName":"Description","#FieldType":"Open Ended Text - Comment Box","#FieldValue":null,"#FieldId":"C6D46AD1-9B3F-45FF-9331-27EA47811E37"},"#ID":"8EA83E8B-7573-4550-905D-D4320496AD89","#ProductName":"Test","#ProductCode":null,"#Type":"Session","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#StartTime":"2018-01-29T18:00:00","#EndTime":"2018-01-29T19:00:00","#Status":"Active"}]'''
df = spark.createDataFrame([(1, str1,)],['id', 'json_str'])
df_json_str = spark.read.json(df.rdd.map(lambda x:x.json_str))
df_json_str.printSchema()
root
|-- #EndTime: string (nullable = true)
|-- #ID: string (nullable = true)
|-- #ProductCode: string (nullable = true)
|-- #ProductDescription: string (nullable = true)
|-- #ProductName: string (nullable = true)
|-- #SessionCategoryId: string (nullable = true)
|-- #SessionCategoryName: string (nullable = true)
|-- #StartTime: string (nullable = true)
|-- #Status: string (nullable = true)
|-- #Type: string (nullable = true)
|-- CustomFieldDetail: struct (nullable = true)
| |-- #FieldId: string (nullable = true)
| |-- #FieldName: string (nullable = true)
| |-- #FieldType: string (nullable = true)
| |-- #FieldValue: string (nullable = true)
json_str_schema = T.ArrayType(df_json_str.schema)
df = df.withColumn('json_str', F.from_json('json_str',json_str_schema))
df.printSchema()
root
|-- id: long (nullable = true)
|-- json_str: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- #EndTime: string (nullable = true)
| | |-- #ID: string (nullable = true)
| | |-- #ProductCode: string (nullable = true)
| | |-- #ProductDescription: string (nullable = true)
| | |-- #ProductName: string (nullable = true)
| | |-- #SessionCategoryId: string (nullable = true)
| | |-- #SessionCategoryName: string (nullable = true)
| | |-- #StartTime: string (nullable = true)
| | |-- #Status: string (nullable = true)
| | |-- #Type: string (nullable = true)
| | |-- CustomFieldDetail: struct (nullable = true)
| | | |-- #FieldId: string (nullable = true)
| | | |-- #FieldName: string (nullable = true)
| | | |-- #FieldType: string (nullable = true)
| | | |-- #FieldValue: string (nullable = true)
df.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |json_str |
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1 |[{null, 07D40CB5-273E-4814-B82C-B5DCA7145D20, null, null, Event Registration, 00000000-0000-0000-0000-000000000000, null, null, Active, Admission Item, null}, {2018-01-29T19:00:00, 8EA83E8B-7573-4550-905D-D4320496AD89, null, null, Test, 00000000-0000-0000-0000-000000000000, null, 2018-01-29T18:00:00, Active, Session, {C6D46AD1-9B3F-45FF-9331-27EA47811E37, Description, Open Ended Text - Comment Box, null}}]|
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
df.selectExpr('id', 'inline(json_str)').show(truncate=False)
+---+-------------------+------------------------------------+------------+-------------------+------------------+------------------------------------+--------------------+-------------------+-------+--------------+----------------------------------------------------------------------------------------+
|id |#EndTime |#ID |#ProductCode|#ProductDescription|#ProductName |#SessionCategoryId |#SessionCategoryName|#StartTime |#Status|#Type |CustomFieldDetail |
+---+-------------------+------------------------------------+------------+-------------------+------------------+------------------------------------+--------------------+-------------------+-------+--------------+----------------------------------------------------------------------------------------+
|1 |null |07D40CB5-273E-4814-B82C-B5DCA7145D20|null |null |Event Registration|00000000-0000-0000-0000-000000000000|null |null |Active |Admission Item|null |
|1 |2018-01-29T19:00:00|8EA83E8B-7573-4550-905D-D4320496AD89|null |null |Test |00000000-0000-0000-0000-000000000000|null |2018-01-29T18:00:00|Active |Session |{C6D46AD1-9B3F-45FF-9331-27EA47811E37, Description, Open Ended Text - Comment Box, null}|
+---+-------------------+------------------------------------+------------+-------------------+------------------+------------------------------------+--------------------+-------------------+-------+--------------+----------------------------------------------------------------------------------------+
This solution is relevant if you know the schema in advance.
It is of course more efficient, performance-wise, than inferring the schema.
import pyspark.sql.functions as F
myval = '''[{"#ID":"07D40CB5-273E-4814-B82C-B5DCA7145D20","#ProductName":"Event Registration","#ProductCode":null,"#Type":"Admission Item","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#Status":"Active"},{"CustomFieldDetail":{"#FieldName":"Description","#FieldType":"Open Ended Text - Comment Box","#FieldValue":null,"#FieldId":"C6D46AD1-9B3F-45FF-9331-27EA47811E37"},"#ID":"8EA83E8B-7573-4550-905D-D4320496AD89","#ProductName":"Test","#ProductCode":null,"#Type":"Session","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#StartTime":"2018-01-29T18:00:00","#EndTime":"2018-01-29T19:00:00","#Status":"Active"}]'''
df = spark.createDataFrame([(1, myval,)], 'id int, myjson string')
myschema = F.schema_of_json(myval)
# -------------- or -----------------
# myschema = spark.sql(f"select schema_of_json('{myval}')").first()[0]
# print (myschema)
# ARRAY<STRUCT<`#EndTime`: STRING, `#ID`: STRING, `#ProductCode`: STRING, `#ProductDescription`: STRING, `#ProductName`: STRING, `#SessionCategoryId`: STRING, `#SessionCategoryName`: STRING, `#StartTime`: STRING, `#Status`: STRING, `#Type`: STRING, `CustomFieldDetail`: STRUCT<`#FieldId`: STRING, `#FieldName`: STRING, `#FieldType`: STRING, `#FieldValue`: STRING>>>
# -----------------------------------
df = df.withColumn('myjson', F.from_json('myjson',myschema))
df.show()
+---+--------------------+
| id| myjson|
+---+--------------------+
| 1|[{null, 07D40CB5-...|
+---+--------------------+
df.selectExpr('id', 'inline(myjson)').show()
+---+-------------------+--------------------+------------+-------------------+------------------+--------------------+--------------------+-------------------+-------+--------------+--------------------+
| id| #EndTime| #ID|#ProductCode|#ProductDescription| #ProductName| #SessionCategoryId|#SessionCategoryName| #StartTime|#Status| #Type| CustomFieldDetail|
+---+-------------------+--------------------+------------+-------------------+------------------+--------------------+--------------------+-------------------+-------+--------------+--------------------+
| 1| null|07D40CB5-273E-481...| null| null|Event Registration|00000000-0000-000...| null| null| Active|Admission Item| null|
| 1|2018-01-29T19:00:00|8EA83E8B-7573-455...| null| null| Test|00000000-0000-000...| null|2018-01-29T18:00:00| Active| Session|{C6D46AD1-9B3F-45...|
+---+-------------------+--------------------+------------+-------------------+------------------+--------------------+--------------------+-------------------+-------+--------------+--------------------+
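Since the inferred schema is printed above, you can also skip schema_of_json at runtime and paste the DDL string straight into from_json; a minimal sketch using the DDL emitted above:
import pyspark.sql.functions as F

# DDL string copied verbatim from the schema_of_json output above
ddl = ("ARRAY<STRUCT<`#EndTime`: STRING, `#ID`: STRING, `#ProductCode`: STRING, "
       "`#ProductDescription`: STRING, `#ProductName`: STRING, `#SessionCategoryId`: STRING, "
       "`#SessionCategoryName`: STRING, `#StartTime`: STRING, `#Status`: STRING, `#Type`: STRING, "
       "`CustomFieldDetail`: STRUCT<`#FieldId`: STRING, `#FieldName`: STRING, "
       "`#FieldType`: STRING, `#FieldValue`: STRING>>>")
df = df.withColumn('myjson', F.from_json('myjson', ddl))
This avoids any schema inference at runtime, which is the performance point made above.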

Related

Load only struct from map's value from an avro file into a Spark Dataframe

Using PySpark, I need to load the "Properties" object (the map's value) from an Avro file into its own Spark dataframe, such that "Properties" becomes a dataframe whose elements and values become columns and rows. I am struggling to find clear examples that accomplish this.
Schema of the file:
root
|-- SequenceNumber: long (nullable = true)
|-- Offset: string (nullable = true)
|-- EnqueuedTimeUtc: string (nullable = true)
|-- SystemProperties: map (nullable = true)
| |-- key: string
| |-- value: struct (valueContainsNull = true)
| | |-- member0: long (nullable = true)
| | |-- member1: double (nullable = true)
| | |-- member2: string (nullable = true)
| | |-- member3: binary (nullable = true)
|-- Properties: map (nullable = true)
| |-- key: string
| |-- value: struct (valueContainsNull = true)
| | |-- member0: long (nullable = true)
| | |-- member1: double (nullable = true)
| | |-- member2: string (nullable = true)
| | |-- member3: binary (nullable = true)
|-- Body: binary (nullable = true)
The resulting "Properties" dataframe loaded from the above avro file needs to be like this:
+-------+-------+-------+-------+
|member0|member1|member2|member3|
+-------+-------+-------+-------+
|value  |value  |value  |value  |
+-------+-------+-------+-------+
map_values is your friend.
Collection function: Returns an unordered array containing the values of the map.
New in version 2.3.0.
df_properties = df.select((F.map_values(F.col('Properties'))[0]).alias('vals')).select('vals.*')
Full example:
import pyspark.sql.functions as F

df = spark.createDataFrame(
    [('a', 20, 4.5, 'r', b'8')],
    ['key', 'member0', 'member1', 'member2', 'member3'])
df = df.select(F.create_map('key', F.struct('member0', 'member1', 'member2', 'member3')).alias('Properties'))
df.printSchema()
# root
# |-- Properties: map (nullable = false)
# | |-- key: string
# | |-- value: struct (valueContainsNull = false)
# | | |-- member0: long (nullable = true)
# | | |-- member1: double (nullable = true)
# | | |-- member2: string (nullable = true)
# | | |-- member3: binary (nullable = true)
df_properties = df.select((F.map_values(F.col('Properties'))[0]).alias('vals')).select('vals.*')
df_properties.show()
# +-------+-------+-------+-------+
# |member0|member1|member2|member3|
# +-------+-------+-------+-------+
# | 20| 4.5| r| [38]|
# +-------+-------+-------+-------+
df_properties.printSchema()
# root
# |-- member0: long (nullable = true)
# |-- member1: double (nullable = true)
# |-- member2: string (nullable = true)
# |-- member3: binary (nullable = true)
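If your Properties map can contain more than one entry, a variant of the same idea: explode the map values instead of indexing [0], giving one row per map entry (assuming that is the layout you want):
import pyspark.sql.functions as F

# One output row per map entry instead of only the first value
df_all_props = (df
    .select(F.explode(F.map_values(F.col('Properties'))).alias('vals'))
    .select('vals.*'))
df_all_props.show()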

Pyspark: Selecting a value after exploding an array

I am new to PySpark and am trying to parse telecom.value where telecom.system = "fax|phone", but I am getting the error below. I understand that filter() returns a struct and that I am selecting a column from it. How do I select the column value after calling filter()?
File "", line 3, in raise_from
pyspark.sql.utils.AnalysisException: Resolved attribute(s) telecom#27,telecom#33 missing from name#3,telecom#5,address#7 in operator !Project [name#3.family AS Practitioner_LastName#23, name#3.suffix AS Practitioner_NameSuffix#24, name#3.given[0] AS Practitioner_FirstName#25, telecom#27.value AS telecom.value#42, telecom#33.value AS telecom.value#43, address#7.city AS PractitionerCity#38, address#7.line[0] AS PractitionerAddress_1#39, address#7.postalCode AS PractitionerZip#40, address#7.state AS PractitionerState#41]. Attribute(s) with the same name appear in the operation: telecom,telecom. Please check if the right attribute(s) are used.
root
|-- resource: struct (nullable = true)
| |-- address: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- city: string (nullable = true)
| | | |-- country: string (nullable = true)
| | | |-- line: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- postalCode: string (nullable = true)
| | | |-- state: string (nullable = true)
| | | |-- use: string (nullable = true)
| |-- id: string (nullable = true)
| |-- identifier: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- type: struct (nullable = true)
| | | | |-- coding: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- code: string (nullable = true)
| | | | | | |-- system: string (nullable = true)
| | | |-- use: string (nullable = true)
| | | |-- value: string (nullable = true)
| |-- name: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- family: string (nullable = true)
| | | |-- given: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- suffix: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- use: string (nullable = true)
| |-- resourceType: string (nullable = true)
| |-- telecom: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- system: string (nullable = true)
| | | |-- use: string (nullable = true)
| | | |-- value: string (nullable = true)
| |-- text: struct (nullable = true)
| | |-- div: string (nullable = true)
| | |-- status: string (nullable = true)
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
appName = "PySpark Example - JSON file to Spark Data Frame"
master = "local"
spark = SparkSession.builder.appName(appName).master(master).getOrCreate()
json_file_path = 'C:\\Users\\M\\Documents\\Practitioner.json'
source_df = spark.read.json(json_file_path, multiLine=True)
source_df.printSchema()
output = source_df.select(source_df["resource.name"][0].alias("name"),
                          source_df["resource.telecom"].alias("telecom"),
                          source_df["resource.address"][0].alias("address"))
output.printSchema()
practitioner = output.select(
    output.name.family.alias("Practitioner_LastName"),
    output.name.suffix.alias("Practitioner_NameSuffix"),
    output.name.given[0].alias("Practitioner_FirstName"),
    output.withColumn("telecom", f.explode(f.col("telecom"))).filter(f.col("telecom.system") == "phone").telecom.value,
    output.withColumn("telecom", f.explode(f.col("telecom"))).filter(f.col("telecom.system") == "fax").telecom.value,
    output.address.city.alias("PractitionerCity"),
    output.address.line[0].alias("PractitionerAddress_1"),
    output.address.postalCode.alias("PractitionerZip"),
    output.address.state.alias("PractitionerState")
)
practitioner.printSchema()
practitioner.show()
My json is:
{"resource":{"resourceType":"Practitioner","id":"scm-ambqa1821624401190","text":{"status":"generated","div":""},"identifier":[{"use":"official","type":{"coding":[{"system":"http:\/\/hl7.org\/fhir\/v2\/0203","code":"NPI"}]},"value":"1548206097"},{"use":"official","type":{"coding":[{"system":"http:\/\/hl7.org\/fhir\/v2\/0203","code":"DEA"}]},"value":"HB1548206"}],"name":[{"use":"official","family":"BERNSTEIN","given":["HELENE","B"],"suffix":["MD"]}],"telecom":[{"system":"phone","value":"6106547854","use":"work"},{"system":"email","value":"sachin.belhekar#allscripts.com","use":"work"},{"system":"fax","value":"7106547895","use":"work"}],"address":[{"use":"work","line":["West Street 1","West Street 2"],"city":"Michigan","state":"MI","postalCode":"49036","country":"USA"}]}}
The data structure is a bit complex, so I will use a UDF to parse it:
import pyspark.sql.functions as f
import pyspark.sql.types as t
@f.udf(t.StringType())
def phone_parser(row):
    for item in row:
        if item['system'] == 'phone':
            return item['value']

@f.udf(t.StringType())
def fax_parser(row):
    for item in row:
        if item['system'] == 'fax':
            return item['value']

output.select(phone_parser('telecom'), fax_parser('telecom'))
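As an alternative to UDFs (not what the answer above uses): on Spark 2.4+ the filter higher-order function can do the same natively; a minimal sketch:
import pyspark.sql.functions as f

# filter() keeps the matching array elements; [0].value picks the first match
no_udf = output.select(
    f.expr("filter(telecom, x -> x.system = 'phone')[0].value").alias('phone'),
    f.expr("filter(telecom, x -> x.system = 'fax')[0].value").alias('fax'))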

Making one dataframe out of two dataframes as separate subcolumns in pyspark

I want to put two dataframes into one, so that each becomes a sub-column; this is not a join of dataframes. I have two dataframes, stat1_df and stat2_df, which look something like this:
root
|-- max_scenes: integer (nullable = true)
|-- median_scenes: double (nullable = false)
|-- avg_scenes: double (nullable = true)
+----------+-------------+------------------+
|max_scenes|median_scenes|avg_scenes |
+----------+-------------+------------------+
|97 |7.0 |10.806451612903226|
|97 |7.0 |10.806451612903226|
|97 |7.0 |10.806451612903226|
|97 |7.0 |10.806451612903226|
+----------+-------------+------------------+
root
|-- max: double (nullable = true)
|-- type: string (nullable = true)
+-----+-----------+
|max |type |
+-----+-----------+
|10.0 |small |
|25.0 |medium |
|50.0 |large |
|250.0|extra_large|
+-----+-----------+
and I want result_df to be like:
root
|-- some_statistics1: struct (nullable = true)
| |-- max_scenes: integer (nullable = true)
| |-- median_scenes: double (nullable = false)
| |-- avg_scenes: double (nullable = true)
|-- some_statistics2: struct (nullable = true)
| |-- max: double (nullable = true)
| |-- type: string (nullable = true)
Is there any way to combine the two as shown? stat1_df and stat2_df are simple dataframes, without arrays or nested columns. The final dataframe is written to MongoDB. If there are any additional questions, I am here to answer.
Check the code below.
Add an id column to both DataFrames, move all columns into a struct, and then join the two DataFrames on that id:
scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala> val dfa = Seq(("10","8.9","7.9")).toDF("max_scenes","median_scenes","avg_scenes")
dfa: org.apache.spark.sql.DataFrame = [max_scenes: string, median_scenes: string ... 1 more field]
scala> dfa.show(false)
+----------+-------------+----------+
|max_scenes|median_scenes|avg_scenes|
+----------+-------------+----------+
|10 |8.9 |7.9 |
+----------+-------------+----------+
scala> dfa.printSchema
root
|-- max_scenes: string (nullable = true)
|-- median_scenes: string (nullable = true)
|-- avg_scenes: string (nullable = true)
scala> val mdfa = dfa.select(struct($"*").as("some_statistics1")).withColumn("id",monotonically_increasing_id)
mdfa: org.apache.spark.sql.DataFrame = [some_statistics1: struct<max_scenes: string, median_scenes: string ... 1 more field>, id: bigint]
scala> mdfa.printSchema
root
|-- some_statistics1: struct (nullable = false)
| |-- max_scenes: string (nullable = true)
| |-- median_scenes: string (nullable = true)
| |-- avg_scenes: string (nullable = true)
|-- id: long (nullable = false)
scala> mdfa.show(false)
+----------------+---+
|some_statistics1|id |
+----------------+---+
|[10,8.9,7.9] |0 |
+----------------+---+
scala> val dfb = Seq(("11.2","sample")).toDF("max","type")
dfb: org.apache.spark.sql.DataFrame = [max: string, type: string]
scala> dfb.printSchema
root
|-- max: string (nullable = true)
|-- type: string (nullable = true)
scala> dfb.show(false)
+----+------+
|max |type |
+----+------+
|11.2|sample|
+----+------+
scala> val mdfb = dfb.select(struct($"*").as("some_statistics2")).withColumn("id",monotonically_increasing_id)
mdfb: org.apache.spark.sql.DataFrame = [some_statistics2: struct<max: string, type: string>, id: bigint]
scala> mdfb.printSchema
root
|-- some_statistics2: struct (nullable = false)
| |-- max: string (nullable = true)
| |-- type: string (nullable = true)
|-- id: long (nullable = false)
scala> mdfb.show(false)
+----------------+---+
|some_statistics2|id |
+----------------+---+
|[11.2,sample] |0 |
+----------------+---+
scala> mdfa.join(mdfb,Seq("id"),"inner").drop("id").printSchema
root
|-- some_statistics1: struct (nullable = false)
| |-- max_scenes: string (nullable = true)
| |-- median_scenes: string (nullable = true)
| |-- avg_scenes: string (nullable = true)
|-- some_statistics2: struct (nullable = false)
| |-- max: string (nullable = true)
| |-- type: string (nullable = true)
scala> mdfa.join(mdfb,Seq("id"),"inner").drop("id").show(false)
+----------------+----------------+
|some_statistics1|some_statistics2|
+----------------+----------------+
|[10,8.9,7.9] |[11.2,sample] |
+----------------+----------------+
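Since the question is tagged pyspark, here is the same idea sketched in Python (assuming stat1_df and stat2_df as in the question):
import pyspark.sql.functions as F

# Wrap each dataframe's columns in a struct, add a join key, then join
mdfa = stat1_df.select(F.struct(*stat1_df.columns).alias('some_statistics1')) \
               .withColumn('id', F.monotonically_increasing_id())
mdfb = stat2_df.select(F.struct(*stat2_df.columns).alias('some_statistics2')) \
               .withColumn('id', F.monotonically_increasing_id())
result_df = mdfa.join(mdfb, 'id', 'inner').drop('id')
Note that monotonically_increasing_id only lines the rows up when both dataframes are partitioned the same way; for arbitrary dataframes, a row_number over a window is the safer join key.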

Pyspark Dataframe Joins Incorrectly when there are multiple nested fields

I have a dataframe with a schema like this:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
I have a new dataframe with a schema like this:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: integer (containsNull = true)
I want to join these dataframes to get the following structure:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: long (containsNull = true)
But instead, after the join I get a dataframe like this:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Country: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: long (containsNull = true)
What should be done?
I have done an outer join on the field docId, and the above dataframe is what I get.
The dataframe is not 'joined incorrectly': a JOIN operation is not supposed to merge structs. You get seemingly duplicate columns because the JOIN takes the columns from both dataframes when combining them. You have to do the combination explicitly:
Initialization
import pyspark
from pyspark.sql import types as T
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
First, the data (I added only some columns for reference, extending it to your full example is trivial):
Country_schema1 = T.StructField("Country", T.StructType([T.StructField("s1", T.StringType(), nullable=True)]), nullable=True)
Gender_schema1 = T.StructField("Gender", T.StructType([T.StructField("s1", T.StringType(), nullable=True),
                                                       T.StructField("s2", T.StringType(), nullable=True)]))
schema1 = T.StructType([T.StructField("docId", T.StringType(), nullable=True),
                        Country_schema1,
                        Gender_schema1])
data1 = [("1",["1"], ["M", "X"])]
df1 = spark.createDataFrame(data1, schema=schema1)
df1.toJSON().collect()
Output:
['{"docId":"1","Country":{"s1":"1"},"Gender":{"s1":"M","s2":"X"}}']
Second dataframe:
Country_schema2 = T.StructField("Country", T.StructType([T.StructField("s6", T.StringType(), nullable=True)]), nullable=True)
Gender_schema2 = T.StructField("Gender", T.StructType([T.StructField("s6", T.StringType(), nullable=True),
                                                       T.StructField("s7", T.StringType(), nullable=True)]))
schema2 = T.StructType([T.StructField("docId", T.StringType(), nullable=True),
                        Country_schema2,
                        Gender_schema2])
data2 = [("1",["2"], ["F", "Z"])]
df2 = spark.createDataFrame(data2, schema=schema2)
df2.toJSON().collect()
Output:
['{"docId":"1","Country":{"s6":"2"},"Gender":{"s6":"F","s7":"Z"}}']
Now the logic. I think this is easier done using SQL. Create the temp views first:
df1.createOrReplaceTempView("df1")
df2.createOrReplaceTempView("df2")
This is the query to execute. It selects only the fields we need (instead of all of them) and wraps the fields from the two structs in new structs that combine them:
result = spark.sql("SELECT df1.docID, "
                   "STRUCT(df1.Country.s1 AS s1, df2.Country.s6 AS s6) AS Country, "
                   "STRUCT(df1.Gender.s2 AS s2, df2.Gender.s6 AS s6, df2.Gender.s7 AS s7) AS Gender "
                   "FROM df1 JOIN df2 ON df1.docID=df2.docID")
result.show()
Output:
+-----+-------+---------+
|docID|Country| Gender|
+-----+-------+---------+
| 1| [1, 2]|[X, F, Z]|
+-----+-------+---------+
It is better viewed in JSON:
result.toJSON().collect()
['{"docID":"1","Country":{"s1":"1","s6":"2"},"Gender":{"s2":"X","s6":"F","s7":"Z"}}']

Parse JSON in Spark containing a reserved character

I have a JSON input.txt file with data as follows:
2018-05-30.txt:{"Message":{"eUuid":"6e7d4890-9279-491a-ae4d-70416ef9d42d","schemaVersion":"1.0-AB1","timestamp":1527539376,"id":"XYZ","location":{"dim":{"x":2,"y":-7},"towards":121.0},"source":"a","UniqueId":"test123","code":"del","signature":"xyz","":{},"vel":{"ground":15},"height":{},"next":{"dim":{}},"sub":"del1"}}
2018-05-30.txt:{"Message":{"eUuid":"5e7d4890-9279-491a-ae4d-70416ef9d42d","schemaVersion":"1.0-AB1","timestamp":1627539376,"id":"ABC","location":{"dim":{"x":1,"y":-8},"towards":132.0},"source":"b","UniqueId":"hello123","code":"fra","signature":"abc","":{},"vel":{"ground":16},"height":{},"next":{"dim":{}},"sub":"fra1"}}
.
.
I tried to load the JSON into a DataFrame as follows:
>>val df = spark.read.json("<full path of input.txt file>")
I am receiving a _corrupt_record dataframe.
I am aware that each line contains "." (2018-05-30.txt) as a reserved character, which is causing the issue. How may I resolve this?
val rdd = sc.textFile("/Users/kishore/abc.json")
val jsonRdd = rdd.map(x => x.split("txt:")(1))
scala> df.show
+--------------------+
| Message|
+--------------------+
|[test123,del,6e7d...|
|[hello123,fra,5e7...|
+--------------------+
import org.apache.spark.sql.functions._
import sqlContext.implicits._
// val df = sqlContext.read.json(jsonRdd)
// df.show(false)
val df = sqlContext.read.json(jsonRdd).withColumn("eUuid", $"Message"("eUuid"))
.withColumn("schemaVersion", $"Message"("schemaVersion"))
.withColumn("timestamp", $"Message"("timestamp"))
.withColumn("id", $"Message"("id"))
.withColumn("source", $"Message"("source"))
.withColumn("UniqueId", $"Message"("UniqueId"))
.withColumn("location", $"Message"("location"))
.withColumn("dim", $"location"("dim"))
.withColumn("x", $"dim"("x"))
.withColumn("y", $"dim"("y"))
.drop("dim")
.withColumn("vel", $"Message"("vel"))
.withColumn("ground", $"vel"("ground"))
.withColumn("sub", $"Message"("sub"))
.drop("Message")
df.show()
+--------------------+-------------+----------+---+------+--------+------------+---+---+----+------+----+
| eUuid|schemaVersion| timestamp| id|source|UniqueId| location| x| y| vel|ground| sub|
+--------------------+-------------+----------+---+------+--------+------------+---+---+----+------+----+
|6e7d4890-9279-491...| 1.0-AB1|1527539376|XYZ| a| test123|[[2,-7],121]| 2| -7|[15]| 15|del1|
+--------------------+-------------+----------+---+------+--------+------------+---+---+----+------+----+
The problem is not a reserved character; it is that the file does not contain valid JSON. So you can strip the filename prefix before parsing:
val df = spark.read.textFile(...)
val json = spark.read.json(df.map(v => v.drop(15)))
json.printSchema()
root
|-- Message: struct (nullable = true)
| |-- UniqueId: string (nullable = true)
| |-- code: string (nullable = true)
| |-- eUuid: string (nullable = true)
| |-- id: string (nullable = true)
| |-- location: struct (nullable = true)
| | |-- dim: struct (nullable = true)
| | | |-- x: long (nullable = true)
| | | |-- y: long (nullable = true)
| | |-- towards: double (nullable = true)
| |-- schemaVersion: string (nullable = true)
| |-- signature: string (nullable = true)
| |-- source: string (nullable = true)
| |-- sub: string (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- vel: struct (nullable = true)
| | |-- ground: long (nullable = true)
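If you are working in PySpark rather than Scala, a similar sketch; it strips everything up to the first '{' instead of a fixed 15 characters, in case the filename prefix length varies (an assumption on my part):
raw = spark.read.text("input.txt")

# Keep everything from the first '{' onward, then let Spark infer the JSON schema
json_df = spark.read.json(raw.rdd.map(lambda r: r.value[r.value.index('{'):]))
json_df.printSchema()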
