Creating a Hive schema in PySpark - apache-spark

Syntax for creating a schema in PySpark.
data.csv
id,name
1,sam
2,smith
// Explicit schema for data.csv: an integer `id` column and a string `name` column.
// Spark SQL's integer type object is `IntegerType` (there is no `IntType`).
val schema = new StructType()
  .add("id", IntegerType)
  .add("name", StringType)

// Read the CSV with the schema above; the "header" option is set to "true".
val ds = spark.read
  .schema(schema)
  .option("header", "true")
  .csv("data.csv")

ds.show()

Define a StructType as a list of StructField(name, dataType, nullable=True) entries.
You can import the data types from pyspark.sql.types:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,FloatType,BooleanType
# One StructField per column: (name, data type, nullable flag).
fields = [
    StructField("col_a", StringType(), nullable=True),
    StructField("col_b", IntegerType(), nullable=True),
    StructField("col_c", FloatType(), nullable=True),
    StructField("col_d", BooleanType(), nullable=True),
]
schema = StructType(fields)

Related

pyspark not finding database in spark-warehouse

I currently have a database called "bronze" with one table inside it that was created using almost the same code as below (just changing the TABLE_NAME and SCHEMA).
import findspark
findspark.init()
import delta
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType
from pyspark.sql import SparkSession, window
from pyspark.sql import functions as F
from os.path import abspath
def upsertToDelta(df, batchId):
    """Merge one micro-batch of raw matches into the bronze Delta table.

    To guarantee there are no duplicated matches, a window partitioned by
    ``GameId`` and ordered by ``UpdatedUtc`` numbers the rows of each game,
    and only the first row per game is kept. Any other row found for the
    same game is not saved.

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        batchId: Identifier of the micro-batch (not used here).

    Note:
        Reads the module-level ``bronzeDeltaTable``, which must be assigned
        before the stream that calls this function is started.
    """
    windowSpec = window.Window.partitionBy("GameId").orderBy("UpdatedUtc")
    df_new = (
        df.withColumn("row_number", F.row_number().over(windowSpec))
        .filter("row_number = 1")
        # Drop the helper column, as the full load does, so it never
        # becomes part of the merge source.
        .drop("row_number")
    )
    (
        bronzeDeltaTable.alias("bronze")
        .merge(df_new.alias("raw"), "bronze.GameId = raw.GameId")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
# Absolute path of the directory passed to spark.sql.warehouse.dir.
warehouse_location = abspath('spark-warehouse')

# Local session configured with the Delta Lake SQL extension and catalog.
builder = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = delta.configure_spark_with_delta_pip(builder).getOrCreate()
# Schema of the leaderboards table, built in three groups that keep the
# original column order: non-nullable identifying columns, nullable string
# columns, and nullable float statistics.
leaderboards_schema = StructType(
    [
        StructField(column, data_type(), False)
        for column, data_type in (
            ("PlayerId", IntegerType),
            ("TeamId", IntegerType),
            ("Name", StringType),
            ("MatchName", StringType),
            ("Team", StringType),
            ("IsClosed", BooleanType),
            ("GameId", IntegerType),
            ("OpponentId", IntegerType),
            ("Opponent", StringType),
        )
    ]
    + [
        StructField(column, StringType(), True)
        for column in ("Day", "DateTime", "Updated", "UpdatedUtc", "Games")
    ]
    + [
        StructField(column, FloatType(), True)
        for column in (
            "Maps",
            "FantasyPoints",
            "Kills",
            "Assists",
            "Deaths",
            "Headshots",
            "AverageDamagePerRound",
            "Kast",
            "Rating",
            "EntryKills",
            "QuadKills",
            "Aces",
            "Clutch1v2s",
            "Clutch1v3s",
            "Clutch1v4s",
            "Clutch1v5s",
        )
    ]
)
# Schema of the maps table; every column is nullable.
# NOTE(review): there is no GameId or UpdatedUtc column here, yet the
# de-duplication window and the merge condition in this script use both.
# Confirm that tb_maps can actually go through that path.
map_schema = StructType(
    [
        StructField(column, data_type(), True)
        for column, data_type in (
            ("Number", IntegerType),
            ("Name", StringType),
            ("Status", StringType),
            ("CurrentRound", IntegerType),
            ("TeamAScore", IntegerType),
            ("TeamBScore", IntegerType),
        )
    ]
)
# Table name -> schema applied when streaming that table's raw parquet files.
SCHEMAS = dict(
    tb_leaderboards=leaderboards_schema,
    tb_maps=map_schema,
)
# The session is built without Hive support, so its catalog lives in memory
# and does not remember the "bronze" database between runs, even though the
# spark-warehouse folder stays on disk. Creating the database only when that
# folder is missing therefore fails on later runs with
# "Database 'bronze' not found"; register it on every run instead.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

try:
    for TABLE_NAME in SCHEMAS:
        # Full load: the first time a table is seen, keep the first row per
        # GameId (ordered by UpdatedUtc) and write the result to bronze.
        if TABLE_NAME not in os.listdir('spark-warehouse/bronze.db'):
            df = spark.read.parquet(f"raw/{TABLE_NAME}")
            windowSpec = window.Window.partitionBy("GameId").orderBy("UpdatedUtc")
            df_new = (
                df.withColumn("row_number", F.row_number().over(windowSpec))
                .filter("row_number = 1")
                .drop("row_number")
            )
            df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")

        # Module-level name read by upsertToDelta for the current table.
        bronzeDeltaTable = delta.tables.DeltaTable.forPath(spark, f"spark-warehouse/bronze.db/{TABLE_NAME}")

        # Incremental load: when new matches land in raw, a stream is
        # responsible for saving these new matches in bronze.
        df_stream = (
            spark.readStream
            .format("parquet")
            .schema(SCHEMAS[TABLE_NAME])
            .load(f"raw/{TABLE_NAME}")
        )
        stream = (
            df_stream.writeStream
            .foreachBatch(upsertToDelta)
            .option("checkpointLocation", f"spark-warehouse/bronze.db/{TABLE_NAME}_checkpoint")
            .outputMode("update")
            .start()
        )
        # Process everything currently available, then stop the stream before
        # moving on to the next table.
        stream.processAllAvailable()
        stream.stop()
finally:
    spark.stop()
But when I execute the code above I'm getting the error pyspark.sql.utils.AnalysisException: Database 'bronze' not found. The error occurs when trying to execute df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")
This is the current directory structure
I've already tried prefixing "bronze" with "spark-warehouse.", as well as adding backquotes around "spark-warehouse", "bronze" and "{TABLE_NAME}", but nothing seems to work.
I'm running the code on Windows 10 with PySpark 3.3.1, Hadoop 3, delta-spark 2.2.0 and Java 11.0.16, but I also tested on Ubuntu 22.04 with the same config.
------------
Edit #1:
I asked ChatGPT for a solution to my problem, and it suggested using save() instead of saveAsTable(). Changing df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}") to df_new.write.mode("overwrite").format("delta").save(f"spark-warehouse/bronze.db/{TABLE_NAME}") does save the data inside the bronze database folder. However, if I run spark.sql("USE bronze") it still gives the same AnalysisException: Database 'bronze' not found error. Also, spark.sql("SHOW DATABASES").show() doesn't show the bronze database; it only shows default.
------------
Any solutions to my problem ?
If anyone wants to test in your local machine, here's the repository.
Not too sure, but I think for saveAsTable, you need to set the write mode inside the method as an argument (pyspark.sql.DataFrameWriter.saveAsTable).
Try this:
# Pass the format and the write mode directly as saveAsTable() arguments.
df.write.saveAsTable(f"bronze.{TABLE_NAME}", format="delta", mode="overwrite")

pyspark json not able to inferschema for empty

In PySpark, whenever I read a JSON file that contains an empty element, the entire element is ignored in the resulting DataFrame.
Sample json :
{logs :[],pagination:{}}
It only ignores the second element, i.e. pagination in the above example. Is there any way to read the JSON with the proper schema?
Yes, you can do this in two ways, with a schema and without a schema:
Reading Json with schema:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType
# Explicit schema built field by field; every column is nullable.
schema = (
    StructType()
    .add('email', StringType(), True)
    .add('first_name', StringType(), True)
    .add('gender', StringType(), True)
    .add('id', LongType(), True)
    .add('last_name', StringType(), True)
)
# Read the JSON file using the schema above.
json_reader = spark.read.schema(schema)
df = json_reader.json(r'dbfs:/FileStore/MOCK_DATA__1_.json')
Reading Json Without schema
# No schema is supplied to the reader here.
d1 = spark.read.format('json').load(r'dbfs:/FileStore/MOCK_DATA__1_.json')
d1.show()

Unable to read data from kafka topic

I'm a beginner in Kafka. I'm trying to write a Spark application that reads data from a Kafka topic I created. The Kafka topic, topic1, is up and running.
Is there any problem with the code provided below:
// Address of the Kafka broker to connect to.
val kafka_bootstrap_servers = "localhost:9092"

// Read the topic through the "kafka" data source.
// NOTE(review): kafka_topic_name is not defined in this snippet; it is
// presumably declared earlier in the application. Confirm.
val users_df = spark.read
  .format("kafka")
  .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
  .option("subscribe", kafka_topic_name)
  .load()

// Keep only the message payload (cast to text) and the record timestamp.
val users_df_1 = users_df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS TIMESTAMP)")

// Schema used to parse the JSON payload.
val user_schema = StructType(
  List(
    StructField("RecordNumber", IntegerType, true),
    StructField("Zipcode", StringType, true),
    StructField("ZipCodeType", StringType, true),
    StructField("City", StringType, true),
    StructField("State", StringType, true),
    StructField("LocationType", StringType, true),
    StructField("Lat", StringType, true),
    StructField("Long", StringType, true),
    StructField("Xaxis", StringType, true),
    StructField("Yaxis", StringType, true),
    StructField("Zaxis", StringType, true),
    StructField("WorldRegion", StringType, true),
    StructField("Country", StringType, true),
    StructField("LocationText", StringType, true),
    StructField("Location", StringType, true),
    StructField("Decommisioned", StringType, true)
  )
)

// users_df_1 only selects `value` and `timestamp`, so the JSON has to be
// parsed from `value`; fields such as RecordNumber and Zipcode only exist
// inside the parsed struct.
val users_df_2 = users_df_1.select(
  from_json(col("value"), user_schema).as("user_detail"),
  col("timestamp"))

// Expand the parsed struct into top-level columns next to the timestamp.
val users_df_3 = users_df_2.select("user_detail.*", "timestamp")

users_df_3.printSchema()
users_df_3.show(numRows = 10, truncate = false)
spark.stop()
println("Apache spark application completed.")
}
}
json data sample below
{"RecordNumber":76511,"Zipcode":27007,"ZipCodeType":"STANDARD","City":"ASH HILL","State":"NC","LocationType":"NOT ACCEPTABLE","Lat":36.4,"Long":-80.56,"Xaxis":0.13,"Yaxis":-0.79,"Zaxis":0.59,"WorldRegion":"NA","Country":"US","LocationText":"Ash Hill, NC","Location":"NA-US-NC-ASH HILL","Decommisioned":false,"TaxReturnsFiled":842,"EstimatedPopulation":1666,"TotalWages":28876493}
Error msg below
Exception in thread "main" org.apache.spark.sql.AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of "Structured Streaming + Kafka Integration Guide".;
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:652)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:194)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
at streamingApp$.main(streamingApp.scala:25)
at streamingApp.main(streamingApp.scala)
Need help to read data from kafka topic.
Please follow the guide for spark streaming + kafka integration
https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html
You might be missing the artifact "spark-sql-kafka-0-10_2.12"

How does the Databricks Delta Lake `mergeSchema` option handle differing data types?

What does the Databricks Delta Lake mergeSchema option do if a pre-existing column is appended with a different data type?
For example, given a Delta Lake table with schema foo INT, bar INT, what would happen when trying to write-append new data with schema foo INT, bar DOUBLE when specifying the option mergeSchema = true?
The write fails. (as of Delta Lake 0.5.0 on Databricks 6.3)
I think this is what you are looking for.
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType};
import org.apache.spark.sql.functions.input_file_name
// Seven nullable string columns, in file order.
val customSchema = StructType(
  Seq("field1", "field2", "field3", "field4", "field5", "field6", "field7")
    .map(fieldName => StructField(fieldName, StringType, nullable = true)))
// Reader for pipe-separated files, "header" set to "false", with the explicit schema.
val csvReader = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "false")
  .option("sep", "|")
  .schema(customSchema)

// Load the files matching the wildcard and add the input_file_name() column.
val df = csvReader
  .load("mnt/rawdata/corp/ABC*.gz")
  .withColumn("file_name", input_file_name())
Just rename 'field1', 'field2', etc., to your actual field names. Also, 'ABC*.gz' is a wildcard pattern: it matches files whose names begin with a specific string, like 'ABC' or whatever you choose, followed by any combination of characters (the '*' character), and ending in '.gz', which means the file is gzip-compressed. Yours could be different, of course, so just change that convention to meet your specific needs.

Pyspark: spark-submit not working like CLI

I have a PySpark script that loads data from a TSV file and saves it as a Parquet file as well as a persistent SQL table.
When I run it line by line through the pyspark CLI, it works exactly as expected. When I run it as an application using spark-submit, it runs without any errors but I get strange results: 1. The data is overwritten instead of appended. 2. When I run SQL queries against it, no data is returned even though the Parquet files are several gigabytes in size (which is what I expect). Any suggestions?
Code:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Input TSV file and output Parquet directory.
csv_file = '/srv/spark/data/input/ipfixminute2018-03-28.tsv'
parquet_dir = '/srv/spark/data/parquet/ipfixminute'

# A plain SQLContext created under spark-submit has no Hive support, so
# tables written with saveAsTable() land in a catalog that does not persist
# between applications. Build the session with Hive support instead (the
# SparkSession equivalent of HiveContext).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('import-ipfixminute')
    .enableHiveSupport()
    .getOrCreate()
)
# Keep `sc` bound to the underlying SparkContext, as before.
sc = spark.sparkContext
# (column name, Spark SQL type) pairs in file order; every column is nullable.
fields = [
    StructField(column, data_type(), True)
    for column, data_type in (
        ('time_stamp', TimestampType),
        ('subscriberId', StringType),
        ('sourceIPv4Address', StringType),
        ('destinationIPv4Address', StringType),
        ('service', StringType),
        ('baseService', StringType),
        ('serverHostname', StringType),
        ('rat', StringType),
        ('userAgent', StringType),
        ('accessPoint', StringType),
        ('station', StringType),
        ('device', StringType),
        ('contentCategories', StringType),
        ('incomingOctets', LongType),
        ('outgoingOctets', LongType),
        ('incomingShapingDrops', IntegerType),
        ('outgoingShapingDrops', IntegerType),
        ('qoeIncomingInternal', DoubleType),
        ('qoeIncomingExternal', DoubleType),
        ('qoeOutgoingInternal', DoubleType),
        ('qoeOutgoingExternal', DoubleType),
        ('incomingShapingLatency', DoubleType),
        ('outgoingShapingLatency', DoubleType),
        ('internalRtt', DoubleType),
        ('externalRtt', DoubleType),
        ('HttpUrl', StringType),
    )
]
schema = StructType(fields)
# Load the tab-separated file with the explicit schema, then derive a `date`
# column from `time_stamp`.
# NOTE(review): drop('all') removes a column named "all"; the schema declares
# no such column, so this looks like a no-op. If the intent was to drop rows
# whose values are all null, that is a different call. Confirm.
df = (
    spark.read.load(
        csv_file,
        format='csv',
        sep='\t',
        header=True,
        schema=schema,
        timestampFormat='yyyy-MM-dd HH:mm:ss',
    )
    .drop('all')
    .withColumn('date', to_date('time_stamp'))
)

# Append to the `test2` table, partitioned by `date`, stored under parquet_dir.
table_writer = df.write.mode('append').partitionBy('date').option('path', parquet_dir)
table_writer.saveAsTable('test2')
As @user8371915 suggested, this is similar to the following question:
Spark can access Hive table from pyspark but not from spark-submit
I needed to replace
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
with
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
This resolved this issue.

Resources