create DataFrame of struct PySpark - python-3.x

How can I create a DataFrame of empty structs, please?
Thank you.
dataxx = []
schema = StructType([
    StructField('Info1', StructType([
        StructField('fld', IntegerType(), True),
        StructField('fld1', IntegerType(), True),
        StructField('fld2', IntegerType(), True),
        StructField('fld3', IntegerType(), True),
        StructField('fld4', IntegerType(), True),
    ])),
])
df = sqlCtx.createDataFrame(dataxx, schema)
Thank you for your help

If you want to create a DataFrame that has a specific schema but contains no data, you can do so simply by passing an empty list to the createDataFrame function:
from pyspark.sql.types import *

schema = StructType([
    StructField('Info1', StructType([
        StructField('fld', IntegerType(), True),
        StructField('fld1', IntegerType(), True),
        StructField('fld2', IntegerType(), True),
        StructField('fld3', IntegerType(), True),
        StructField('fld4', IntegerType(), True),
    ])),
])
df = spark.createDataFrame([], schema)
df.printSchema()
root
|-- Info1: struct (nullable = true)
| |-- fld: integer (nullable = true)
| |-- fld1: integer (nullable = true)
| |-- fld2: integer (nullable = true)
| |-- fld3: integer (nullable = true)
| |-- fld4: integer (nullable = true)
Here spark is a SparkSession instance.
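If you don't already have a session in scope, here is a minimal sketch for creating one (the app name is an arbitrary label):
from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession to call createDataFrame on.
spark = SparkSession.builder.appName('empty-struct-df').getOrCreate()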

Related

Hash of nested (struct) data

Suppose we have the following data:
from pyspark.sql.types import StructType, StructField, StringType

data = [
    (("James", None, "Smith"), "OH", "M"),
    (("Anna", "Rose", ""), "NY", "F"),
    (("Julia", "", "Williams"), "OH", "F"),
    (("Maria", "Anne", "Jones"), "NY", "M"),
    (("Jen", "Mary", "Brown"), "NY", "M"),
    (("Mike", "Mary", "Williams"), "OH", "M")
]
schema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True)
    ])),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True)
])
df = spark.createDataFrame(data=data, schema=schema)
with the following schema:
root
|-- name: struct (nullable = true)
| |-- firstname: string (nullable = true)
| |-- middlename: string (nullable = true)
| |-- lastname: string (nullable = true)
|-- state: string (nullable = true)
|-- gender: string (nullable = true)
So the name column looks like this:
+----------------------+
|name |
+----------------------+
|[James,, Smith] |
|[Anna, Rose, ] |
|[Julia, , Williams] |
|[Maria, Anne, Jones] |
|[Jen, Mary, Brown] |
|[Mike, Mary, Williams]|
+----------------------+
Is there an easy way to get the hash value of each of the rows in the name column? Or does hashing only work for unnested data?
In order to create a hash from a struct-type column, you first need to convert the struct to a string, e.g. with to_json. After that you can apply a hash function such as md5:
F.md5(F.to_json('name'))
Using your example df (importing pyspark.sql.functions as F):
from pyspark.sql import functions as F

df = df.withColumn('md5', F.md5(F.to_json('name')))
df.show(truncate=0)
# +----------------------+-----+------+--------------------------------+
# |name |state|gender|md5 |
# +----------------------+-----+------+--------------------------------+
# |{James, null, Smith} |OH |M |ad4f22b4a03070026957a65b3b8e5bf9|
# |{Anna, Rose, } |NY |F |c8dcb8f6f52c2e382c33bd92819cd500|
# |{Julia, , Williams} |OH |F |63a7c53d21f53e37b3724312b14a8e97|
# |{Maria, Anne, Jones} |NY |M |a0f2d3962be4941828a2b6f4a02d0ac5|
# |{Jen, Mary, Brown} |NY |M |cae64ee19dd2a0c9745a20e759a527e9|
# |{Mike, Mary, Williams}|OH |M |5e882c033be16bd679f450889e97be6d|
# +----------------------+-----+------+--------------------------------+
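If a cryptographic digest isn't required, a possible alternative (a sketch, not part of the original answer) is Spark's built-in hash function, which accepts struct columns directly, without converting to JSON first:
# Sketch: a non-cryptographic Murmur3 hash computed straight from the struct.
df = df.withColumn('name_hash', F.hash('name'))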

PySpark 'from_json': DataFrame returns null for all the JSON values

I have the logs below, which contain plain text and a JSON string:
2020-09-24T08:03:01.633Z 11.21.23.1 {"EventTime":"2020-09-24 13:33:01","Hostname":"abc-cde.india.local","Keywords":-1234}
I created a DataFrame for the above logs, as seen below:
| Date       | Source IP  | Event Type        |
| 2020-09-24 | 11.21.23.1 | {"EventTime":"202 |
and created a schema for converting the JSON string into another DataFrame:
json_schema = StructType([
    StructField("EventTime", StringType()),
    StructField("Hostname", StringType()),
    StructField("Keywords", IntegerType())
])
json_converted_df = df.select(
    F.from_json(F.col('Event Type'), json_schema).alias("data")
).select("data.*").show()
but the DataFrame returns null for all the JSON fields:
+---------+--------+--------+
|EventTime|Hostname|Keywords|
+---------+--------+--------+
|     null|    null|    null|
+---------+--------+--------+
How to resolve this issue?
Works fine for me...
# Preparation of test dataset
a = [
    (
        "2020-09-24T08:03:01.633Z",
        "11.21.23.1",
        '{"EventTime":"2020-09-24 13:33:01","Hostname":"abc-cde.india.local","Keywords":-1234}',
    ),
]
b = ["Date", "Source IP", "Event Type"]
df = spark.createDataFrame(a, b)
df.show()
#+--------------------+----------+--------------------+
#| Date| Source IP| Event Type|
#+--------------------+----------+--------------------+
#|2020-09-24T08:03:...|11.21.23.1|{"EventTime":"202...|
#+--------------------+----------+--------------------+
df.printSchema()
#root
# |-- Date: string (nullable = true)
# |-- Source IP: string (nullable = true)
# |-- Event Type: string (nullable = true)
# Your code executed
from pyspark.sql.types import *
from pyspark.sql import functions as F

json_schema = StructType([
    StructField("EventTime", StringType()),
    StructField("Hostname", StringType()),
    StructField("Keywords", IntegerType()),
])
json_converted_df = df.select(
    F.from_json(F.col("Event Type"), json_schema).alias("data")
).select("data.*")
json_converted_df.show()
#+-------------------+-------------------+--------+
#| EventTime| Hostname|Keywords|
#+-------------------+-------------------+--------+
#|2020-09-24 13:33:01|abc-cde.india.local| -1234|
#+-------------------+-------------------+--------+
json_converted_df.printSchema()
#root
# |-- EventTime: string (nullable = true)
# |-- Hostname: string (nullable = true)
# |-- Keywords: integer (nullable = true)
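One common reason for all-null output is input that doesn't actually parse against the schema: from_json returns a null struct when parsing fails, so check the raw strings for truncation or stray characters. A hedged sketch with made-up input to illustrate:
# Sketch: a string that is not valid JSON parses to a null struct.
bad = spark.createDataFrame([("not valid json",)], ["Event Type"])
bad.select(F.from_json(F.col("Event Type"), json_schema).alias("data")).show()
# +----+
# |data|
# +----+
# |null|
# +----+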

Pyspark dataframe write and read changes schema

I have a Spark DataFrame which contains both string and int columns. But when I write the DataFrame to a CSV file and then load it later, all the columns are loaded as strings.
from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
df = spark.createDataFrame([("Alberto", 2), ("Dakota", 2)],
                           ["Name", "count"])
Before:
df.printSchema()
Output:
root
|-- Name: string (nullable = true)
|-- count: long (nullable = true)
df.write.mode('overwrite').option('header', True).csv(filepath)
new_df = spark.read.option('header', True).csv(filepath)
After:
new_df.printSchema()
Output:
root
|-- Name: string (nullable = true)
|-- count: string (nullable = true)
How do I specify to store the schema as well while writing?
CSV files don't store type information, so you can't persist the schema while writing, but you can specify the schema while reading.
Example:
from pyspark.sql.types import *
from pyspark.sql.functions import *

schema = StructType([
    StructField('Name', StringType(), True),
    StructField('count', LongType(), True)
])

# Specify the schema while reading
new_df = spark.read.schema(schema).option('header', True).csv(filepath)
new_df.printSchema()

# Or use the inferSchema option, though specifying the schema explicitly is more robust
new_df = spark.read.option('header', True).option("inferSchema", True).csv(filepath)
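If you control the file format, another option (a sketch, not part of the original answer) is to write to a format that stores the schema alongside the data, such as Parquet; parquet_path below is a placeholder:
# Sketch: Parquet preserves column types across a write/read round trip.
df.write.mode('overwrite').parquet(parquet_path)
new_df = spark.read.parquet(parquet_path)
new_df.printSchema()  # count comes back as long, not string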

Using a when clause within a nested structure

I'm creating a DataFrame of structs. I want to create two more structs depending on the value of my field x2.field3: if x2.field3 == 4, then struct_1 should be created; if x2.field3 == 3, then struct_2 should be created.
when(col("x2.field3").cast(IntegerType()) == lit(4),
     struct(col("x1.field1").alias("index2")).alias("struct_1"))\
.when(col("x2.field3").cast(IntegerType()) == lit(3),
      struct(col("x1.field1").alias("Index1")).alias("struct_2"))
I tried different solutions and didn't succeed; I always get the same error:
Py4JJavaError: An error occurred while calling o21058.withColumn. :
org.apache.spark.sql.AnalysisException: cannot resolve 'CASE WHEN
(CAST(x2.field3 AS INT) = 4) THEN named_struct('index2',
x1.field1) WHEN (CAST(x2.field3 AS INT) = 3) THEN
named_struct('Index1', x1.field1) END' due to data type mismatch:
THEN and ELSE expressions should all be same type or coercible to a
common type;; 'Project [x1#5751, x2#5752, named_struct(gen1,
x1#5751.field1, gen2, x1#5751.field1, NamePlaceholder,
named_struct(gen3.1, x1#5751.field1, gen3.2, x1#5751.field1, gen3.3,
x1#5751.field1, gen3.4, x1#5751.field1, gen3.5, x1#5751.field1,
gen3.6, x1#5751.field1, NamePlaceholder, named_struct(gen3.7.1,
named_struct(gen3.7.1.1, 11, gen3.7.1.2, 40), col2, CASE WHEN
(cast(x2#5752.field3 as int) = 4) THEN named_struct(index2,
x1#5751.field1) WHEN (cast(x2#5752.field3 as int) = 3) THEN
named_struct(Index1, x1#5751.field1) END))) AS General#5772]
+- LogicalRDD [x1#5751, x2#5752], false
My entire code is below
schema = StructType([
    StructField('x1', StructType([
        StructField('field1', IntegerType(), True),
        StructField('field2', IntegerType(), True),
        StructField('x12', StructType([
            StructField('field5', IntegerType(), True)
        ])),
    ])),
    StructField('x2', StructType([
        StructField('field3', IntegerType(), True),
        StructField('field4', BooleanType(), True)
    ]))
])
df1 = sqlCtx.createDataFrame([Row(Row(1, 3, Row(23)), Row(3, True))], schema)
df1.printSchema()
df = df1.withColumn("General", struct(
    col("x1.field1").alias("gen1"),
    col("x1.field1").alias("gen2"),
    struct(
        col("x1.field1").alias("gen3.1"),
        col("x1.field1").alias("gen3.2"),
        col("x1.field1").alias("gen3.3"),
        col("x1.field1").alias("gen3.4"),
        col("x1.field1").alias("gen3.5"),
        col("x1.field1").alias("gen3.6"),
        struct(
            struct(lit(11).alias("gen3.7.1.1"),
                   lit(40).alias("gen3.7.1.2")).alias("gen3.7.1"),
            when(col("x2.field3").cast(IntegerType()) == lit(4),
                 struct(col("x1.field1").alias("index2")).alias("struct_1"))
            .when(col("x2.field3").cast(IntegerType()) == lit(3),
                  struct(col("x1.field1").alias("Index1")).alias("struct_2"))
        ).alias("gen3.7")
    ).alias("gen3")
)).drop('x1', 'x2')
df.printSchema()
The error occurs because all branches of a single when chain must return the same type, and your two structs have different field names (index2 vs Index1), so Spark cannot coerce them to a common type. Since struct_1 and struct_2 are exclusive anyway, I recommend the following piece of code, which builds each struct as its own when column (null when its condition doesn't match):
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *

schema = StructType([
    StructField('x1', StructType([
        StructField('field1', IntegerType(), True),
        StructField('field2', IntegerType(), True),
        StructField('x12', StructType([
            StructField('field5', IntegerType(), True)
        ])),
    ])),
    StructField('x2', StructType([
        StructField('field3', IntegerType(), True),
        StructField('field4', BooleanType(), True)
    ]))
])
df1 = sqlCtx.createDataFrame([Row(Row(1, 3, Row(23)), Row(1, True))], schema)
df = df1.withColumn("General", struct(
    col("x1.field1").alias("gen1"),
    col("x1.field1").alias("gen2"),
    struct(
        col("x1.field1").alias("gen3.1"),
        col("x1.field1").alias("gen3.2"),
        col("x1.field1").alias("gen3.3"),
        col("x1.field1").alias("gen3.4"),
        col("x1.field1").alias("gen3.5"),
        col("x1.field1").alias("gen3.6"),
        struct(
            struct(lit(11).alias("gen3.7.1.1"),
                   lit(40).alias("gen3.7.1.2")).alias("gen3.7.1"),
            when(col("x2.field3").cast(IntegerType()) == lit(4),
                 struct(col("x1.field1").alias("index2"))).alias("struct_1"),
            when(col("x2.field3").cast(IntegerType()) == lit(3),
                 struct(col("x1.field1").alias("Index1"))).alias("struct_2")
        ).alias("gen3.7")
    ).alias("gen3")
)).drop('x1', 'x2')
df.printSchema()
Output:
root
|-- General: struct (nullable = false)
| |-- gen1: integer (nullable = true)
| |-- gen2: integer (nullable = true)
| |-- gen3: struct (nullable = false)
| | |-- gen3.1: integer (nullable = true)
| | |-- gen3.2: integer (nullable = true)
| | |-- gen3.3: integer (nullable = true)
| | |-- gen3.4: integer (nullable = true)
| | |-- gen3.5: integer (nullable = true)
| | |-- gen3.6: integer (nullable = true)
| | |-- gen3.7: struct (nullable = false)
| | | |-- gen3.7.1: struct (nullable = false)
| | | | |-- gen3.7.1.1: integer (nullable = false)
| | | | |-- gen3.7.1.2: integer (nullable = false)
| | | |-- struct_1: struct (nullable = true)
| | | | |-- index2: integer (nullable = true)
| | | |-- struct_2: struct (nullable = true)
| | | | |-- Index1: integer (nullable = true)
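For reference, the underlying rule can be shown in isolation. A minimal standalone sketch (hypothetical one-column data, assuming a spark session is available):
from pyspark.sql import functions as F

d = spark.createDataFrame([(4,), (3,)], ["flag"])
# Fails: both branches of one when() chain must return the same type,
# and these structs have different field names ('a' vs 'b').
# d.select(F.when(F.col("flag") == 4, F.struct(F.lit(1).alias("a")))
#           .when(F.col("flag") == 3, F.struct(F.lit(1).alias("b"))))
# Works: one when() per column; each is null when its condition is false.
d.select(
    F.when(F.col("flag") == 4, F.struct(F.lit(1).alias("a"))).alias("s1"),
    F.when(F.col("flag") == 3, F.struct(F.lit(1).alias("b"))).alias("s2"),
).show()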

PySpark input read schema for array and struct to read JSON

I am trying to load some JSON files into PySpark with only specific columns, like below:
df = spark.read.json("sample/json/", schema=schema)
So I started writing an input read schema for the main schema below:
|-- test_name: string (nullable = true)
|-- test_file: string (nullable = true)
|-- test_id: string (nullable = true)
|-- test_type: string (nullable = true)
|-- test_url: string (nullable = true)
|-- test_ids: array (nullable = true)
| |-- element: string (containsNull = true)
|-- value: struct (nullable = true)
| |-- ct: long (nullable = true)
| |-- dimmingSetting: long (nullable = true)
| |-- hue: double (nullable = true)
| |-- modeId: string (nullable = true)
I managed to write the schema for the plain string fields, but I am not able to write it for the array and struct types:
schema = StructType([
    StructField('test_name', StringType()),
    StructField('test_file', StringType()),
    StructField('test_id', StringType()),
    StructField('test_type', StringType()),
    StructField('test_url', StringType()),
])
How do I extend this schema for
|-- test_ids: array (nullable = true)
|-- value: struct (nullable = true)
The extended version should be:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, LongType, DoubleType

schema = StructType([
    StructField('test_name', StringType(), True),
    StructField('test_file', StringType(), True),
    StructField('test_id', StringType(), True),
    StructField('test_type', StringType(), True),
    StructField('test_url', StringType(), True),
    StructField('test_ids', ArrayType(StringType(), True), True),
    StructField('value', StructType([
        StructField('ct', LongType(), True),
        StructField('dimmingSetting', LongType(), True),
        StructField('hue', DoubleType(), True),
        StructField('modeId', StringType(), True)
    ]))
])
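As a quick sanity check, you could parse an inline JSON record against the schema. A sketch with invented sample values; it reads JSON from an RDD of strings, an older but still-working API:
# Sketch: verify the schema by parsing one made-up JSON record.
sample = ['{"test_name":"t1","test_ids":["a","b"],"value":{"ct":1,"hue":0.5}}']
df = spark.read.json(spark.sparkContext.parallelize(sample), schema=schema)
df.printSchema()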
I hope the answer is helpful
