PySpark input read schema for array and struct to read JSON - apache-spark

I am trying to load some JSON files into PySpark with only specific columns, like below:
df = spark.read.json("sample/json/", schema=schema)
So I started writing an input read schema for the main schema below:
|-- test_name: string (nullable = true)
|-- test_file: string (nullable = true)
|-- test_id: string (nullable = true)
|-- test_type: string (nullable = true)
|-- test_url: string (nullable = true)
|-- test_ids: array (nullable = true)
| |-- element: string (containsNull = true)
|-- value: struct (nullable = true)
| |-- ct: long (nullable = true)
| |-- dimmingSetting: long (nullable = true)
| |-- hue: double (nullable = true)
| |-- modeId: string (nullable = true)
I managed to write it for the plain string fields, but I am not able to write it for the array and struct types:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField('test_name', StringType()),
    StructField('test_file', StringType()),
    StructField('test_id', StringType()),
    StructField('test_type', StringType()),
    StructField('test_url', StringType()),
])
How can I extend this schema for
|-- test_ids: array (nullable = true)
|-- value: struct (nullable = true)

The extended version should be:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, LongType, DoubleType

schema = StructType([
    StructField('test_name', StringType(), True),
    StructField('test_file', StringType(), True),
    StructField('test_id', StringType(), True),
    StructField('test_type', StringType(), True),
    StructField('test_url', StringType(), True),
    StructField('test_ids', ArrayType(StringType(), True), True),
    StructField('value', StructType([
        StructField('ct', LongType(), True),
        StructField('dimmingSetting', LongType(), True),
        StructField('hue', DoubleType(), True),
        StructField('modeId', StringType(), True)
    ]), True)
])
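As a quick check (a minimal sketch, assuming the JSON files live under sample/json/ as in the question):
df = spark.read.json("sample/json/", schema=schema)
df.printSchema()
# Nested fields are addressable with dot notation once the schema is applied.
df.select("test_name", "test_ids", "value.hue").show(truncate=False)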
I hope the answer is helpful.

Related

Hash of nested (struct) data

Suppose we have the following data:
from pyspark.sql.types import StructType, StructField, StringType

data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M")
]

schema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True)
])

df = spark.createDataFrame(data=data, schema=schema)
with the following schema:
root
|-- name: struct (nullable = true)
| |-- firstname: string (nullable = true)
| |-- middlename: string (nullable = true)
| |-- lastname: string (nullable = true)
|-- state: string (nullable = true)
|-- gender: string (nullable = true)
So the name column looks like this:
+----------------------+
|name |
+----------------------+
|[James,, Smith] |
|[Anna, Rose, ] |
|[Julia, , Williams] |
|[Maria, Anne, Jones] |
|[Jen, Mary, Brown] |
|[Mike, Mary, Williams]|
+----------------------+
Is there an easy way to get the hash value of each of the rows in the name column? Or does hashing only work for unnested data?
In order to create a hash from the struct-type column, you first need to convert the struct to e.g. a string; to_json does the job. After that you can use a hash function like md5:
F.md5(F.to_json('name'))
Using your example df:
from pyspark.sql import functions as F

df = df.withColumn('md5', F.md5(F.to_json('name')))
df.show(truncate=0)
# +----------------------+-----+------+--------------------------------+
# |name |state|gender|md5 |
# +----------------------+-----+------+--------------------------------+
# |{James, null, Smith} |OH |M |ad4f22b4a03070026957a65b3b8e5bf9|
# |{Anna, Rose, } |NY |F |c8dcb8f6f52c2e382c33bd92819cd500|
# |{Julia, , Williams} |OH |F |63a7c53d21f53e37b3724312b14a8e97|
# |{Maria, Anne, Jones} |NY |M |a0f2d3962be4941828a2b6f4a02d0ac5|
# |{Jen, Mary, Brown} |NY |M |cae64ee19dd2a0c9745a20e759a527e9|
# |{Mike, Mary, Williams}|OH |M |5e882c033be16bd679f450889e97be6d|
# +----------------------+-----+------+--------------------------------+
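If a non-cryptographic hash is enough, Spark's built-in hash (or xxhash64) accepts struct columns directly, so the to_json step can be skipped; note it returns an integer rather than a hex digest. A small sketch on the same df:
# 32-bit murmur3 hash computed directly on the struct column.
df = df.withColumn('name_hash', F.hash('name'))
df.select('name', 'name_hash').show(truncate=0)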

Spark Dataframe returns NULL for entire row when one column value of that row is NULL

Input data -
{"driverId":1,"driverRef":"hamilton","number":44,"code":"HAM","name":{"forename":"Lewis","surname":"Hamilton"},"dob":"1985-01-07","nationality":"British","url":"http://en.wikipedia.org/wiki/Lewis_Hamilton"}
{"driverId":2,"driverRef":"heidfeld","number":"\\N","code":"HEI","name":{"forename":"Nick","surname":"Heidfeld"},"dob":"1977-05-10","nationality":"German","url":"http://en.wikipedia.org/wiki/Nick_Heidfeld"}
{"driverId":3,"driverRef":"rosberg","number":6,"code":"ROS","name":{"forename":"Nico","surname":"Rosberg"},"dob":"1985-06-27","nationality":"German","url":"http://en.wikipedia.org/wiki/Nico_Rosberg"}
{"driverId":4,"driverRef":"alonso","number":14,"code":"ALO","name":{"forename":"Fernando","surname":"Alonso"},"dob":"1981-07-29","nationality":"Spanish","url":"http://en.wikipedia.org/wiki/Fernando_Alonso"}
{"driverId":5,"driverRef":"kovalainen","number":"\\N","code":"KOV","name":{"forename":"Heikki","surname":"Kovalainen"},"dob":"1981-10-19","nationality":"Finnish","url":"http://en.wikipedia.org/wiki/Heikki_Kovalainen"}
{"driverId":6,"driverRef":"nakajima","number":"\\N","code":"NAK","name":{"forename":"Kazuki","surname":"Nakajima"},"dob":"1985-01-11","nationality":"Japanese","url":"http://en.wikipedia.org/wiki/Kazuki_Nakajima"}
{"driverId":7,"driverRef":"bourdais","number":"\\N","code":"BOU","name":{"forename":"Sébastien","surname":"Bourdais"},"dob":"1979-02-28","nationality":"French","url":"http://en.wikipedia.org/wiki/S%C3%A9bastien_Bourdais"}
After reading this data into a Spark DataFrame and displaying it, I could see that the entire rows for driverId 2, 5, 6, and 7 are NULL. I could see the number column value is NULL for those driver ids.
Here is my code. Any mistakes here?
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

name_field = StructType(fields=[
    StructField("forename", StringType(), True),
    StructField("surname", StringType(), True)
])

driver_schema = StructType(fields=[
    StructField("driverId", IntegerType(), False),
    StructField("driverRef", StringType(), True),
    StructField("number", IntegerType(), True),
    StructField("code", StringType(), True),
    StructField("name", name_field),
    StructField("dob", DateType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True)
])

driver_df = spark.read\
    .schema(driver_schema)\
    .json('dbfs:/mnt/databrickslearnf1azure/raw/drivers.json')

driver_df.printSchema()
root
|-- driverId: integer (nullable = true)
|-- driverRef: string (nullable = true)
|-- number: integer (nullable = true)
|-- code: string (nullable = true)
|-- name: struct (nullable = true)
| |-- forename: string (nullable = true)
| |-- surname: string (nullable = true)
|-- dob: date (nullable = true)
|-- nationality: string (nullable = true)
|-- url: string (nullable = true)
display(driver_df)
You can change your initial schema to be as follows, which assumes number to be of type string:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

name_field = StructType(fields=[
    StructField("forename", StringType(), True),
    StructField("surname", StringType(), True)
])

driver_schema = StructType(fields=[
    StructField("driverId", IntegerType(), False),
    StructField("driverRef", StringType(), True),
    StructField("number", StringType(), True),
    StructField("code", StringType(), True),
    StructField("name", name_field),
    StructField("dob", DateType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True)
])
Then you can read the data from the JSON file using the same code that you are already using:
driver_df = spark.read\
    .schema(driver_schema)\
    .json('dbfs:/mnt/databrickslearnf1azure/raw/drivers.json')

driver_df.printSchema()
Once you have read the data, you can apply the logic to convert "\N" to null and then change the data type of the column from string to integer, as below:
from pyspark.sql.functions import when

# Replace the "\N" sentinel with a real null, then cast the column to integer.
df = driver_df.withColumn("number", when(driver_df.number == "\\N", None).otherwise(driver_df.number))
finaldf = df.withColumn("number", df.number.cast(IntegerType()))
finaldf.printSchema()
Now if you run display or show on the DataFrame, you can see that the rows are fully populated and only the number column is null where the source had "\N".
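As a quick sanity check (using the finaldf built above):
# Rows whose number was "\N" should now have a null integer, not a fully null row.
finaldf.filter(finaldf.number.isNull()).select("driverId", "driverRef", "number").show()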
You are seeing this because, according to the official Databricks docs:
Cause: Spark 3.0 and above (Databricks Runtime 7.3 LTS and above) cannot parse JSON arrays as structs; the schema should be passed as ArrayType instead of StructType.
Solution: Pass the schema as ArrayType instead of StructType.
driver_schema = ArrayType(StructType(fields=[
    StructField("driverId", IntegerType(), False),
    StructField("driverRef", StringType(), True),
    StructField("number", IntegerType(), True),
    StructField("code", StringType(), True),
    StructField("name", name_field),
    StructField("dob", DateType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True)
]))
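For reference, the ArrayType-vs-StructType distinction is easiest to demonstrate with from_json on a column holding a JSON array string (a hypothetical sketch, not the question's file):
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

# Hypothetical one-column DataFrame whose value is a JSON array string.
arr_df = spark.createDataFrame([('[{"driverRef": "hamilton"}, {"driverRef": "rosberg"}]',)], ["raw"])
arr_schema = ArrayType(StructType([StructField("driverRef", StringType(), True)]))
# Parsing with an ArrayType schema succeeds; a plain StructType here yields null on Spark 3.0+.
arr_df.select(F.from_json("raw", arr_schema).alias("parsed")).show(truncate=False)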

spark data frame Schema With Data Definitions

I'm trying to add comments to the fields (schema with data definitions); below is the implementation I'm trying.
I tried with StructType.add() (code in the comments) and also with StructType([StructField("field", dtype, nullable, metadata)]), and got the error below. I'm not sure this implementation works. Can someone help me here? I'm new to Spark.
I'm looking for output (schema with data definitions) like:
df.printSchema()
root
|-- firstname: string (nullable = true) comments:val1
|-- middlename: string (nullable = true) comments:val2
|-- lastname: string (nullable = true) comments:val3
|-- id: string (nullable = true) comments:val4
|-- gender: string (nullable = true) comments:val5
|-- salary: integer (nullable = true) comments:val6
error:
IllegalArgumentException: Failed to convert the JSON string '{"metadata":"val1","name":"firstname","nullable":true,"type":"string"}' to a field.
Code with which I'm trying to add comments to the fields:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.master("local[1]") \
    .appName('SparkByExamples.com') \
    .getOrCreate()

data = [("James", "", "Smith", "36636", "M", 3000),
        ("Michael", "Rose", "", "40288", "M", 4000),
        ("Robert", "", "Williams", "42114", "M", 4000),
        ("Maria", "Anne", "Jones", "39192", "F", 4000),
        ("Jen", "Mary", "Brown", "", "F", -1)
]

schema = StructType([
    StructField("firstname", StringType(), True, 'val1'),
    StructField("middlename", StringType(), True, 'val2'),
    StructField("lastname", StringType(), True, 'val3'),
    StructField("id", StringType(), True, 'val4'),
    StructField("gender", StringType(), True, 'val5'),
    StructField("salary", IntegerType(), True, 'val6')
])

# schema = StructType().add("firstname", StringType(), True, 'val1').add("middlename", StringType(), True, 'val2') \
#     .add("lastname", StringType(), True, 'val3').add("id", StringType(), True, 'val4').add("gender", StringType(), True, 'val5').add("salary", IntegerType(), True, 'val6')

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show(truncate=False)
StructField's metadata parameter needs a dictionary argument. It would be something like this:
StructField("firstname", StringType(), True, {"comment": "val1"})

Python 3 function to loop over pandas data frame to change schema

I'm converting a bunch of pandas data frames into Spark DataFrames and then writing them to HDFS. I'm also explicitly specifying the schema to change all data types into string, to avoid conflicts when types are merged.
I'm trying to write a function that will loop through all the pandas df columns and create the schema, which I can then use to convert to Spark.
Here is what I have so far:
def creating_schema(df):
    for columnName in df.columns:
        schema = StructType([(StructField('"' + columnName + '"', StringType(), True))])
        print(schema)
    return(schema)
This outputs:
StructType(List(StructField("column_1",StringType,true)))
StructType(List(StructField("column_2",StringType,true)))
StructType(List(StructField("column_3",StringType,true)))
StructType(List(StructField("column_4",StringType,true)))
StructType(List(StructField("column_5",StringType,true)))
However, I believe I need something in this format for it to work:
schema = StructType([StructField("column_1" , StringType(), True),
StructField("column_2" , StringType(), True),
StructField("column_3" , StringType(), True),
StructField("column_4" , StringType(), True),
StructField("column_5" , StringType(), True)
])
Any help in writing this function would be helpful!
Thanks!
Try:
from pyspark.sql.types import StructType, StructField, StringType

def creating_schema(df):
    sf = []
    for columnName in df.columns:
        # Build one StructField per pandas column, all typed as string.
        sf.append(StructField(columnName, StringType(), True))
    return StructType(sf)
Proof:
import pandas as pd

pdf = pd.DataFrame(columns=["column_1", "column_2", "column_3", "column_4", "column_5"])
schema = creating_schema(pdf)
sdf = sqlContext.createDataFrame(sc.emptyRDD(), schema)
sdf.printSchema()
root
|-- column_1: string (nullable = true)
|-- column_2: string (nullable = true)
|-- column_3: string (nullable = true)
|-- column_4: string (nullable = true)
|-- column_5: string (nullable = true)
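To convert a populated pandas frame with this schema, the pandas values themselves also need to be strings (a sketch with made-up columns, assuming an existing SparkSession named spark):
import pandas as pd

pdf = pd.DataFrame({"column_1": [1, 2], "column_2": ["a", "b"]})
# Cast the pandas values to str so they match the all-string schema.
sdf = spark.createDataFrame(pdf.astype(str), schema=creating_schema(pdf))
sdf.printSchema()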

create DataFrame of struct PySpark

How can I create a DataFrame of empty structs, please?
Thank you.
dataxx = []
schema = StructType(
    [
        StructField('Info1',
            StructType([
                StructField('fld', IntegerType(), True),
                StructField('fld1', IntegerType(), True),
                StructField('fld2', IntegerType(), True),
                StructField('fld3', IntegerType(), True),
                StructField('fld4', IntegerType(), True),
            ])
        ),
    ]
)
df = sqlCtx.createDataFrame(dataxx, schema)
Thank you for your help
If you want to create a DataFrame that has a specific schema but contains no data, you can do it simply by providing an empty list to the createDataFrame function:
from pyspark.sql.types import *

schema = StructType(
    [
        StructField('Info1',
            StructType([
                StructField('fld', IntegerType(), True),
                StructField('fld1', IntegerType(), True),
                StructField('fld2', IntegerType(), True),
                StructField('fld3', IntegerType(), True),
                StructField('fld4', IntegerType(), True),
            ])
        ),
    ]
)
df = spark.createDataFrame([], schema)
df.printSchema()
df.printSchema()
root
|-- Info1: struct (nullable = true)
| |-- fld: integer (nullable = true)
| |-- fld1: integer (nullable = true)
| |-- fld2: integer (nullable = true)
| |-- fld3: integer (nullable = true)
| |-- fld4: integer (nullable = true)
Here spark is the SparkSession.
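Once the schema is in place, populated rows can be created the same way by passing nested tuples that match the struct (a small sketch with example values):
# One row whose Info1 struct holds five example integers.
data = [((1, 2, 3, 4, 5),)]
df2 = spark.createDataFrame(data, schema)
df2.show(truncate=False)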
