In my PySpark DataFrame, there is a column that is being inferred as a string, but it actually contains a list of JSON objects. I need to convert it into a list.
Sample value of the column:
[{"#ID":"07D40CB5-273E-4814-B82C-B5DCA7145D20","#ProductName":"Event Registration","#ProductCode":null,"#Type":"Admission Item","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#Status":"Active"},{"CustomFieldDetail":{"#FieldName":"Description","#FieldType":"Open Ended Text - Comment Box","#FieldValue":null,"#FieldId":"C6D46AD1-9B3F-45FF-9331-27EA47811E37"},"#ID":"8EA83E8B-7573-4550-905D-D4320496AD89","#ProductName":"Test","#ProductCode":null,"#Type":"Session","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#StartTime":"2018-01-29T18:00:00","#EndTime":"2018-01-29T19:00:00","#Status":"Active"}]
Please let me know how to achieve this.
This is a dynamic solution: it first converts the JSON-string column to an RDD, then reads that RDD into a DataFrame to infer the schema, and finally uses the inferred schema to parse the JSON strings in the original DataFrame.
import pyspark.sql.functions as F
import pyspark.sql.types as T
This is not part of the solution, only the creation of the sample dataframe:
str1 = '''[{"#ID":"str1_1","a":11},{"#ID":"str1_2","d":[41,42,43]}]'''
str2 = '''[{"#ID":"str2_1","a":101},{"#ID":"str2_2","c":301},{"#ID":"str2_3","b":201,"nested":{"a":1001,"b":2001}}]'''
df = spark.createDataFrame([(1, str1,),(2, str2,)],['id', 'json_str'])
This is the actual solution:
df_json_str = spark.read.json(df.rdd.map(lambda x: x.json_str))
json_str_schema = T.ArrayType(df_json_str.schema)
df = df.withColumn('json_str', F.from_json('json_str', json_str_schema))
df_json_str.printSchema()
root
|-- #ID: string (nullable = true)
|-- a: long (nullable = true)
|-- b: long (nullable = true)
|-- c: long (nullable = true)
|-- d: array (nullable = true)
| |-- element: long (containsNull = true)
|-- nested: struct (nullable = true)
| |-- a: long (nullable = true)
| |-- b: long (nullable = true)
df.printSchema()
root
|-- id: long (nullable = true)
|-- json_str: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- #ID: string (nullable = true)
| | |-- a: long (nullable = true)
| | |-- b: long (nullable = true)
| | |-- c: long (nullable = true)
| | |-- d: array (nullable = true)
| | | |-- element: long (containsNull = true)
| | |-- nested: struct (nullable = true)
| | | |-- a: long (nullable = true)
| | | |-- b: long (nullable = true)
df.show(truncate=False)
+---+-----------------------------------------------------------------------------------------------------------------------------+
|id |json_str |
+---+-----------------------------------------------------------------------------------------------------------------------------+
|1 |[{str1_1, 11, null, null, null, null}, {str1_2, null, null, null, [41, 42, 43], null}] |
|2 |[{str2_1, 101, null, null, null, null}, {str2_2, null, null, 301, null, null}, {str2_3, null, 201, null, null, {1001, 2001}}]|
+---+-----------------------------------------------------------------------------------------------------------------------------+
A quick demonstration that the result is correct and the dataframe explodes as expected:
df_explode = df.selectExpr('id', 'inline(json_str)')
df_explode.printSchema()
root
|-- id: long (nullable = true)
|-- #ID: string (nullable = true)
|-- a: long (nullable = true)
|-- b: long (nullable = true)
|-- c: long (nullable = true)
|-- d: array (nullable = true)
| |-- element: long (containsNull = true)
|-- nested: struct (nullable = true)
| |-- a: long (nullable = true)
| |-- b: long (nullable = true)
df_explode.show(truncate=False)
+---+------+----+----+----+------------+------------+
|id |#ID |a |b |c |d |nested |
+---+------+----+----+----+------------+------------+
|1 |str1_1|11 |null|null|null |null |
|1 |str1_2|null|null|null|[41, 42, 43]|null |
|2 |str2_1|101 |null|null|null |null |
|2 |str2_2|null|null|301 |null |null |
|2 |str2_3|null|201 |null|null |{1001, 2001}|
+---+------+----+----+----+------------+------------+
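As a side note, inline(json_str) is shorthand for exploding the array and then expanding the struct fields; if you prefer, the same result can be produced in two explicit steps. A minimal sketch (the item alias is arbitrary):
import pyspark.sql.functions as F

# Equivalent to inline(): explode the array into one row per struct,
# then expand the struct's fields with a star expression.
df_explode2 = (
    df.select('id', F.explode('json_str').alias('item'))
      .select('id', 'item.*')
)
df_explode2.show(truncate=False)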
Demonstration on the OP's sample:
import pyspark.sql.functions as F
import pyspark.sql.types as T
This is not part of the solution, only the creation of the sample dataframe:
str1 = '''[{"#ID":"07D40CB5-273E-4814-B82C-B5DCA7145D20","#ProductName":"Event Registration","#ProductCode":null,"#Type":"Admission Item","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#Status":"Active"},{"CustomFieldDetail":{"#FieldName":"Description","#FieldType":"Open Ended Text - Comment Box","#FieldValue":null,"#FieldId":"C6D46AD1-9B3F-45FF-9331-27EA47811E37"},"#ID":"8EA83E8B-7573-4550-905D-D4320496AD89","#ProductName":"Test","#ProductCode":null,"#Type":"Session","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#StartTime":"2018-01-29T18:00:00","#EndTime":"2018-01-29T19:00:00","#Status":"Active"}]'''
df = spark.createDataFrame([(1, str1,)],['id', 'json_str'])
df_json_str = spark.read.json(df.rdd.map(lambda x: x.json_str))
df_json_str.printSchema()
root
|-- #EndTime: string (nullable = true)
|-- #ID: string (nullable = true)
|-- #ProductCode: string (nullable = true)
|-- #ProductDescription: string (nullable = true)
|-- #ProductName: string (nullable = true)
|-- #SessionCategoryId: string (nullable = true)
|-- #SessionCategoryName: string (nullable = true)
|-- #StartTime: string (nullable = true)
|-- #Status: string (nullable = true)
|-- #Type: string (nullable = true)
|-- CustomFieldDetail: struct (nullable = true)
| |-- #FieldId: string (nullable = true)
| |-- #FieldName: string (nullable = true)
| |-- #FieldType: string (nullable = true)
| |-- #FieldValue: string (nullable = true)
json_str_schema = T.ArrayType(df_json_str.schema)
df = df.withColumn('json_str', F.from_json('json_str', json_str_schema))
df.printSchema()
root
|-- id: long (nullable = true)
|-- json_str: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- #EndTime: string (nullable = true)
| | |-- #ID: string (nullable = true)
| | |-- #ProductCode: string (nullable = true)
| | |-- #ProductDescription: string (nullable = true)
| | |-- #ProductName: string (nullable = true)
| | |-- #SessionCategoryId: string (nullable = true)
| | |-- #SessionCategoryName: string (nullable = true)
| | |-- #StartTime: string (nullable = true)
| | |-- #Status: string (nullable = true)
| | |-- #Type: string (nullable = true)
| | |-- CustomFieldDetail: struct (nullable = true)
| | | |-- #FieldId: string (nullable = true)
| | | |-- #FieldName: string (nullable = true)
| | | |-- #FieldType: string (nullable = true)
| | | |-- #FieldValue: string (nullable = true)
df.show(truncate=False)
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |json_str |
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1 |[{null, 07D40CB5-273E-4814-B82C-B5DCA7145D20, null, null, Event Registration, 00000000-0000-0000-0000-000000000000, null, null, Active, Admission Item, null}, {2018-01-29T19:00:00, 8EA83E8B-7573-4550-905D-D4320496AD89, null, null, Test, 00000000-0000-0000-0000-000000000000, null, 2018-01-29T18:00:00, Active, Session, {C6D46AD1-9B3F-45FF-9331-27EA47811E37, Description, Open Ended Text - Comment Box, null}}]|
+---+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
df.selectExpr('id', 'inline(json_str)').show(truncate=False)
+---+-------------------+------------------------------------+------------+-------------------+------------------+------------------------------------+--------------------+-------------------+-------+--------------+----------------------------------------------------------------------------------------+
|id |#EndTime |#ID |#ProductCode|#ProductDescription|#ProductName |#SessionCategoryId |#SessionCategoryName|#StartTime |#Status|#Type |CustomFieldDetail |
+---+-------------------+------------------------------------+------------+-------------------+------------------+------------------------------------+--------------------+-------------------+-------+--------------+----------------------------------------------------------------------------------------+
|1 |null |07D40CB5-273E-4814-B82C-B5DCA7145D20|null |null |Event Registration|00000000-0000-0000-0000-000000000000|null |null |Active |Admission Item|null |
|1 |2018-01-29T19:00:00|8EA83E8B-7573-4550-905D-D4320496AD89|null |null |Test |00000000-0000-0000-0000-000000000000|null |2018-01-29T18:00:00|Active |Session |{C6D46AD1-9B3F-45FF-9331-27EA47811E37, Description, Open Ended Text - Comment Box, null}|
+---+-------------------+------------------------------------+------------+-------------------+------------------+------------------------------------+--------------------+-------------------+-------+--------------+----------------------------------------------------------------------------------------+
This solution is relevant if you know the schema in advance.
It is, of course, more efficient performance-wise than inferring the schema.
import pyspark.sql.functions as F
myval = '''[{"#ID":"07D40CB5-273E-4814-B82C-B5DCA7145D20","#ProductName":"Event Registration","#ProductCode":null,"#Type":"Admission Item","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#Status":"Active"},{"CustomFieldDetail":{"#FieldName":"Description","#FieldType":"Open Ended Text - Comment Box","#FieldValue":null,"#FieldId":"C6D46AD1-9B3F-45FF-9331-27EA47811E37"},"#ID":"8EA83E8B-7573-4550-905D-D4320496AD89","#ProductName":"Test","#ProductCode":null,"#Type":"Session","#ProductDescription":null,"#SessionCategoryName":null,"#SessionCategoryId":"00000000-0000-0000-0000-000000000000","#StartTime":"2018-01-29T18:00:00","#EndTime":"2018-01-29T19:00:00","#Status":"Active"}]'''
df = spark.createDataFrame([(1, myval,)], 'id int, myjson string')
myschema = F.schema_of_json(myval)
# -------------- or -----------------
# myschema = spark.sql(f"select schema_of_json('{myval}')").first()[0]
# print (myschema)
# ARRAY<STRUCT<`#EndTime`: STRING, `#ID`: STRING, `#ProductCode`: STRING, `#ProductDescription`: STRING, `#ProductName`: STRING, `#SessionCategoryId`: STRING, `#SessionCategoryName`: STRING, `#StartTime`: STRING, `#Status`: STRING, `#Type`: STRING, `CustomFieldDetail`: STRUCT<`#FieldId`: STRING, `#FieldName`: STRING, `#FieldType`: STRING, `#FieldValue`: STRING>>>
# -----------------------------------
df = df.withColumn('myjson', F.from_json('myjson', myschema))
df.show()
+---+--------------------+
| id| myjson|
+---+--------------------+
| 1|[{null, 07D40CB5-...|
+---+--------------------+
df.selectExpr('id', 'inline(myjson)').show()
+---+-------------------+--------------------+------------+-------------------+------------------+--------------------+--------------------+-------------------+-------+--------------+--------------------+
| id| #EndTime| #ID|#ProductCode|#ProductDescription| #ProductName| #SessionCategoryId|#SessionCategoryName| #StartTime|#Status| #Type| CustomFieldDetail|
+---+-------------------+--------------------+------------+-------------------+------------------+--------------------+--------------------+-------------------+-------+--------------+--------------------+
| 1| null|07D40CB5-273E-481...| null| null|Event Registration|00000000-0000-000...| null| null| Active|Admission Item| null|
| 1|2018-01-29T19:00:00|8EA83E8B-7573-455...| null| null| Test|00000000-0000-000...| null|2018-01-29T18:00:00| Active| Session|{C6D46AD1-9B3F-45...|
+---+-------------------+--------------------+------------+-------------------+------------------+--------------------+--------------------+-------------------+-------+--------------+--------------------+
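Alternatively, since the DDL string is already printed in the comment above, you can hard-code it and skip the schema_of_json call entirely; a minimal sketch using that exact DDL:
import pyspark.sql.functions as F

# DDL schema copied verbatim from the schema_of_json output above;
# from_json also accepts a DDL-formatted string directly.
myschema = "ARRAY<STRUCT<`#EndTime`: STRING, `#ID`: STRING, `#ProductCode`: STRING, `#ProductDescription`: STRING, `#ProductName`: STRING, `#SessionCategoryId`: STRING, `#SessionCategoryName`: STRING, `#StartTime`: STRING, `#Status`: STRING, `#Type`: STRING, `CustomFieldDetail`: STRUCT<`#FieldId`: STRING, `#FieldName`: STRING, `#FieldType`: STRING, `#FieldValue`: STRING>>>"
df = df.withColumn('myjson', F.from_json('myjson', myschema))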
I have a map field in my dataset with the below schema:
|-- party: map (nullable = true)
| |-- key: string
| |-- value: struct (valueContainsNull = true)
| | |-- partyName: string (nullable = true)
| | |-- cdrId: string (nullable = true)
| | |-- legalEntityId: string (nullable = true)
| | |-- customPartyId: string (nullable = true)
| | |-- partyIdScheme: string (nullable = true)
| | |-- customPartyIdScheme: string (nullable = true)
| | |-- bdrId: string (nullable = true)
I need to convert it to JSON. Please suggest how to do it. Thanks in advance.
Spark provides the to_json function for DataFrame operations:
import org.apache.spark.sql.functions._
import spark.implicits._
val df =
  List(
    ("key1", "party01", "cdrId01"),
    ("key2", "party02", "cdrId02")
  )
    .toDF("key", "partyName", "cdrId")
    .select(struct($"key", struct($"partyName", $"cdrId")).as("col1"))
    .agg(map_from_entries(collect_set($"col1")).as("map_col"))
    .select($"map_col", to_json($"map_col").as("json_col"))
I have a data frame with a schema like this:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
I have a new data frame with a schema like this:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = false)
| |-- s6: array (nullable = true)
| | |-- element: integer (containsNull = true)
I want to join these data frames and get a structure like:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: long (containsNull = true)
But instead, the data frame I get after the join looks like this:
root
|-- docId: string (nullable = true)
|-- Country: struct (nullable = true)
| |-- s1: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Country: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s1: string (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- s5: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- Gender: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: string (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s1: long (nullable = true)
| |-- s2: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s3: array (nullable = true)
| | |-- element: long (containsNull = true)
| |-- s4: array (nullable = true)
| | |-- element: long (containsNull = true)
|-- YOB: struct (nullable = true)
| |-- s6: array (nullable = true)
| | |-- element: long (containsNull = true)
What should be done?
I have done an outer join on the field docId, and the above data frame is the one that I get.
The DataFrame is not 'joined incorrectly'; a JOIN operation is not supposed to merge structs. You get seemingly duplicate columns because the JOIN takes the columns from both dataframes when combining. You have to do the combination explicitly:
Initialization
import pyspark
from pyspark.sql import types as T
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
First, the data (I added only some columns for reference; extending this to your full example is trivial):
Country_schema1 = T.StructField("Country", T.StructType([T.StructField("s1", T.StringType(), nullable=True)]), nullable=True)
Gender_schema1 = T.StructField("Gender", T.StructType([T.StructField("s1", T.StringType(), nullable=True),
T.StructField("s2", T.StringType(), nullable=True)]))
schema1 = T.StructType([T.StructField("docId", T.StringType(), nullable=True),
Country_schema1,
Gender_schema1
])
data1 = [("1",["1"], ["M", "X"])]
df1 = spark.createDataFrame(data1, schema=schema1)
df1.toJSON().collect()
Output:
['{"docId":"1","Country":{"s1":"1"},"Gender":{"s1":"M","s2":"X"}}']
Second dataframe:
Country_schema2 = T.StructField("Country", T.StructType([T.StructField("s6", T.StringType(), nullable=True)]), nullable=True)
Gender_schema2 = T.StructField("Gender", T.StructType([T.StructField("s6", T.StringType(), nullable=True),
T.StructField("s7", T.StringType(), nullable=True)]))
schema2 = T.StructType([T.StructField("docId", T.StringType(), nullable=True),
Country_schema2,
Gender_schema2
])
data2 = [("1",["2"], ["F", "Z"])]
df2 = spark.createDataFrame(data2, schema=schema2)
df2.toJSON().collect()
Output:
['{"docId":"1","Country":{"s6":"2"},"Gender":{"s6":"F","s7":"Z"}}']
Now the logic. I think this is easier if done using SQL. Create the tables first:
df1.createOrReplaceTempView("df1")
df2.createOrReplaceTempView("df2")
This is the query to execute. It indicates which fields are to be SELECTed (instead of all of them) and wraps the struct fields in a new struct that combines them:
result = spark.sql("SELECT df1.docID, "
"STRUCT(df1.Country.s1 AS s1, df2.Country.s6 AS s6) AS Country, "
"STRUCT(df1.Gender.s2 AS s2, df2.Gender.s6 AS s6, df2.Gender.s7 AS s7) AS Gender "
"FROM df1 JOIN df2 ON df1.docID=df2.docID")
result.show()
Output:
+-----+-------+---------+
|docID|Country| Gender|
+-----+-------+---------+
| 1| [1, 2]|[X, F, Z]|
+-----+-------+---------+
It is better viewed in JSON:
result.toJSON().collect()
['{"docID":"1","Country":{"s1":"1","s6":"2"},"Gender":{"s2":"X","s6":"F","s7":"Z"}}']
I have a JSON input.txt file with data as follows:
2018-05-30.txt:{"Message":{"eUuid":"6e7d4890-9279-491a-ae4d-70416ef9d42d","schemaVersion":"1.0-AB1","timestamp":1527539376,"id":"XYZ","location":{"dim":{"x":2,"y":-7},"towards":121.0},"source":"a","UniqueId":"test123","code":"del","signature":"xyz","":{},"vel":{"ground":15},"height":{},"next":{"dim":{}},"sub":"del1"}}
2018-05-30.txt:{"Message":{"eUuid":"5e7d4890-9279-491a-ae4d-70416ef9d42d","schemaVersion":"1.0-AB1","timestamp":1627539376,"id":"ABC","location":{"dim":{"x":1,"y":-8},"towards":132.0},"source":"b","UniqueId":"hello123","code":"fra","signature":"abc","":{},"vel":{"ground":16},"height":{},"next":{"dim":{}},"sub":"fra1"}}
.
.
I tried to load the JSON into a DataFrame as follows:
val df = spark.read.json("<full path of input.txt file>")
I am receiving a _corrupt_record dataframe.
I am aware that the JSON contains "." (2018-05-30.txt) as a reserved character, which is causing the issue. How may I resolve this?
val rdd = sc.textFile("/Users/kishore/abc.json")
val jsonRdd = rdd.map(x => x.split("txt:")(1))

import org.apache.spark.sql.functions._
import sqlContext.implicits._

// Reading the cleaned RDD directly yields a single nested Message column:
// val df = sqlContext.read.json(jsonRdd)
// df.show(false)
// +--------------------+
// |             Message|
// +--------------------+
// |[test123,del,6e7d...|
// |[hello123,fra,5e7...|
// +--------------------+

// Flatten the nested fields into top-level columns:
val df = sqlContext.read.json(jsonRdd).withColumn("eUuid", $"Message"("eUuid"))
.withColumn("schemaVersion", $"Message"("schemaVersion"))
.withColumn("timestamp", $"Message"("timestamp"))
.withColumn("id", $"Message"("id"))
.withColumn("source", $"Message"("source"))
.withColumn("UniqueId", $"Message"("UniqueId"))
.withColumn("location", $"Message"("location"))
.withColumn("dim", $"location"("dim"))
.withColumn("x", $"dim"("x"))
.withColumn("y", $"dim"("y"))
.drop("dim")
.withColumn("vel", $"Message"("vel"))
.withColumn("ground", $"vel"("ground"))
.withColumn("sub", $"Message"("sub"))
.drop("Message")
df.show()
+--------------------+-------------+----------+---+------+--------+------------+---+---+----+------+----+
| eUuid|schemaVersion| timestamp| id|source|UniqueId| location| x| y| vel|ground| sub|
+--------------------+-------------+----------+---+------+--------+------------+---+---+----+------+----+
|6e7d4890-9279-491...| 1.0-AB1|1527539376|XYZ| a| test123|[[2,-7],121]| 2| -7|[15]| 15|del1|
+--------------------+-------------+----------+---+------+--------+------------+---+---+----+------+----+
The problem is not a reserved character; it is that the file does not contain valid JSON (each line starts with a filename prefix). So you can strip the prefix before parsing:
import spark.implicits._

val df = spark.read.textFile(...)
// drop the leading "2018-05-30.txt:" prefix (15 characters) from each line
val json = spark.read.json(df.map(v => v.drop(15)))
json.printSchema()
root
|-- Message: struct (nullable = true)
| |-- UniqueId: string (nullable = true)
| |-- code: string (nullable = true)
| |-- eUuid: string (nullable = true)
| |-- id: string (nullable = true)
| |-- location: struct (nullable = true)
| | |-- dim: struct (nullable = true)
| | | |-- x: long (nullable = true)
| | | |-- y: long (nullable = true)
| | |-- towards: double (nullable = true)
| |-- schemaVersion: string (nullable = true)
| |-- signature: string (nullable = true)
| |-- source: string (nullable = true)
| |-- sub: string (nullable = true)
| |-- timestamp: long (nullable = true)
| |-- vel: struct (nullable = true)
| | |-- ground: long (nullable = true)
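From here, the nested Message struct can be flattened in one step by expanding all of its fields with a star expression, rather than the long withColumn chain from the other answer; a short PySpark sketch of the idea, assuming the json DataFrame from above:
# Expanding Message.* promotes every field of the struct to a top-level column.
flat = json.select("Message.*")
flat.printSchema()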