I currently have a database called "bronze" with one table inside it that was created using almost the same code as below (just changing the TABLE_NAME and SCHEMA).
import findspark
findspark.init()
import delta
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType
from pyspark.sql import SparkSession, window
from pyspark.sql import functions as F
from os.path import abspath
def upsertToDelta(df, batchId):
    '''
    To guarantee there are no duplicated matches, a Window is used to filter matches based on their GameId and UpdatedUtc.
    GameId is used as the partition key and UpdatedUtc as the ordering column.
    If a duplicated match is found, the duplicate is not saved.
    '''
    windowSpec = window.Window.partitionBy("GameId").orderBy("UpdatedUtc")  # .orderBy(1)
    df_new = df.withColumn("row_number", F.row_number().over(windowSpec)).filter("row_number = 1")

    ( bronzeDeltaTable.alias("bronze")
        .merge(df_new.alias("raw"), "bronze.GameId = raw.GameId")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
warehouse_location = abspath('spark-warehouse')
builder = SparkSession.builder \
.master('local[*]') \
.config("spark.sql.warehouse.dir", warehouse_location) \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = delta.configure_spark_with_delta_pip(builder) \
.getOrCreate()
leaderboards_schema = StructType([
StructField("PlayerId", IntegerType(), False),
StructField("TeamId", IntegerType(), False),
StructField("Name", StringType(), False),
StructField("MatchName", StringType(), False),
StructField("Team", StringType(), False),
StructField("IsClosed", BooleanType(), False),
StructField("GameId", IntegerType(), False),
StructField("OpponentId", IntegerType(), False),
StructField("Opponent", StringType(), False),
StructField("Day", StringType(), True),
StructField("DateTime", StringType(), True),
StructField("Updated", StringType(), True),
StructField("UpdatedUtc", StringType(), True),
StructField("Games", StringType(), True),
StructField("Maps", FloatType(), True),
StructField("FantasyPoints", FloatType(), True),
StructField("Kills", FloatType(), True),
StructField("Assists", FloatType(), True),
StructField("Deaths", FloatType(), True),
StructField("Headshots", FloatType(), True),
StructField("AverageDamagePerRound", FloatType(), True),
StructField("Kast", FloatType(), True),
StructField("Rating", FloatType(), True),
StructField("EntryKills", FloatType(), True),
StructField("QuadKills", FloatType(), True),
StructField("Aces", FloatType(), True),
StructField("Clutch1v2s", FloatType(), True),
StructField("Clutch1v3s", FloatType(), True),
StructField("Clutch1v4s", FloatType(), True),
StructField("Clutch1v5s", FloatType(), True),
])
map_schema = StructType([
StructField("Number", IntegerType(), True),
StructField("Name", StringType(), True),
StructField("Status", StringType(), True),
StructField("CurrentRound", IntegerType(), True),
StructField("TeamAScore", IntegerType(), True),
StructField("TeamBScore", IntegerType(), True),
])
SCHEMAS = {
"tb_leaderboards": leaderboards_schema,
"tb_maps": map_schema
}
if "spark-warehouse" not in os.listdir():
spark.sql("CREATE DATABASE bronze")
try:
    for TABLE_NAME in list(SCHEMAS.keys()):
        '''
        Full load
        '''
        if TABLE_NAME not in os.listdir('spark-warehouse/bronze.db'):
            df = spark.read.parquet(f"raw/{TABLE_NAME}")

            windowSpec = window.Window.partitionBy("GameId").orderBy("UpdatedUtc")  # .orderBy(1)
            df_new = df.withColumn("row_number", F.row_number().over(windowSpec)).filter("row_number = 1").drop("row_number")

            df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")  # the overwrite isn't actually overwriting because it creates a file with a different name
            # df_new.write.format("delta").saveAsTable(name=f"{warehouse_location}.bronze.{TABLE_NAME}", mode="overwrite")
            # df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")

        bronzeDeltaTable = delta.tables.DeltaTable.forPath(spark, f"spark-warehouse/bronze.db/{TABLE_NAME}")  # "bronze"

        '''
        When new matches land in raw, a stream is responsible for saving these new matches in bronze.
        '''
        df_stream = ( spark.readStream
                      .format("parquet")
                      .schema(SCHEMAS[TABLE_NAME])
                      .load(f"raw/{TABLE_NAME}")
                    )

        stream = ( df_stream.writeStream
                   .foreachBatch(upsertToDelta)
                   .option("checkpointLocation", f"spark-warehouse/bronze.db/{TABLE_NAME}_checkpoint")
                   .outputMode("update")
                   .start()
                 )

        stream.processAllAvailable()
        stream.stop()
finally:
    spark.stop()
But when I execute the code above, I get the error pyspark.sql.utils.AnalysisException: Database 'bronze' not found. The error occurs when trying to execute df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")
This is the current directory structure:
I've already tried to prepend "spark-warehouse." to "bronze", as well as adding backticks around "spark-warehouse", "bronze" and "{TABLE_NAME}", but nothing seems to work.
I'm running the code on Windows 10 with PySpark 3.3.1, Hadoop 3, delta-spark 2.2.0 and Java 11.0.16, but I also tested on Ubuntu 22.04 with the same config.
------------
Edit #1:
Asking ChatGPT for a solution to my problem, it suggested using save() instead of saveAsTable(). So, changing df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}") to df_new.write.mode("overwrite").format("delta").save(f"spark-warehouse/bronze.db/{TABLE_NAME}") actually saves inside the bronze database folder. However, if I run spark.sql("USE bronze") it still gives the same AnalysisException: Database 'bronze' not found error. Also, spark.sql("SHOW DATABASES").show() doesn't show the bronze database, it only shows default.
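One thing I haven't ruled out yet (this is an assumption on my part, not something I've confirmed): the CREATE DATABASE call is gated on the spark-warehouse folder existing on disk, but a new session without a persistent metastore starts with an empty catalog, so the bronze database created in a previous run may never get re-registered. A sketch of what I mean, creating the database unconditionally and registering the path-based Delta folder as a table (the path is the one from my layout above):

# Always make sure the database exists in the current session's catalog.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# If the Delta files were written with save() instead of saveAsTable(), the table
# can still be registered on top of that location.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS bronze.{TABLE_NAME}
    USING DELTA
    LOCATION 'spark-warehouse/bronze.db/{TABLE_NAME}'
""")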
------------
Any solutions to my problem?
If anyone wants to test it on their local machine, here's the repository.
Not too sure, but I think for saveAsTable, you need to set the write mode inside the method as an argument (pyspark.sql.DataFrameWriter.saveAsTable).
Try this:
df.write.format("delta").saveAsTable(
name=f"bronze.{TABLE_NAME}",
mode="overwrite"
)
Related
I am trying to write a stream from Kafka using PySpark.
It gives me a warning like this:
WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0-1, groupId=spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0-1, groupId=spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected
and so on ...
I have checked my server.properties and zookeeper.properties, and everything seems fine.
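Before digging further into the Spark side, it may be worth confirming that something is actually listening on localhost:9092, since that warning usually just means the TCP connection is refused. A quick check (this snippet is mine, not part of the job):

import socket

def broker_reachable(host="localhost", port=9092, timeout=3.0):
    # Returns True if a TCP connection to the broker endpoint can be opened.
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

print(broker_reachable())  # False -> the broker is not running, or it advertises a different address/port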
Here is the Python code that I am using:
import findspark
import requests


def retrieve_schema(Topic: str) -> str:
    # schema registry
    schema_registry_server = "localhost"
    schema_registry_port = "8081"
    schema_registry_url = "http://" + schema_registry_server + ":" + schema_registry_port
    # subjects conventionally follow the "<topic>-value" naming
    response_schema = requests.get('{}/subjects/{}-value/versions/latest/schema'.format(schema_registry_url, Topic))
    response_schema.raise_for_status()
    schema = response_schema.text
    return schema
if __name__ == '__main__':
    sparkDir = r"C:\spark\spark-3.1.3-bin-hadoop2.7"
    findspark.init(sparkDir)

    # pyspark imports go after findspark.init() so the Spark installation is on sys.path
    from pyspark.sql import SparkSession
    from pyspark.sql.avro.functions import from_avro
    from pyspark.sql.functions import col

    spark = SparkSession.builder.appName("Kafka Pyspark Streaming").getOrCreate()

    topic_name = "ora-WALLET_TRANSACTIONS"
    kafka_bootstrap_server = "localhost:9092"

    # retrieve the schema as a json string
    mySchema_json = retrieve_schema(topic_name)

    sampleDataFrame = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_server) \
        .option("subscribe", topic_name) \
        .option("startingOffsets", "earliest") \
        .load()
    sampleDataFrame.printSchema()

    avroValues = sampleDataFrame.select(from_avro(data=col("value"), jsonFormatSchema=mySchema_json).alias("value"))
    avroData = avroValues.select("value.TRX_ID", "value.ACCOUNT_NUMBER",
                                 "value.TRANSACTION_AMOUNT", "value.TRANSACTION_DATE")
    avroData.printSchema()

    print("\nStart writing the stream")
    dStream = avroData \
        .writeStream \
        .queryName("data") \
        .format("console") \
        .start()

    dStream.awaitTermination()
EDIT:
I switched from the Avro to the JSON converter and added to the previous code a StructType schema that matches the records in the topic:
mySchema = StructType([
StructField("TRX_ID", StringType(), True),
StructField("ACCOUNT_NUMBER", StringType(), True),
StructField("TRANSACTION_AMOUNT", DoubleType(), True),
StructField("TRANSACTION_CURRENCY", StringType(), True),
StructField("TRANSACTION_DATE", TimestampType(), True),
StructField("TRANSACTION_REFERENCE_ARN", StringType(), True),
StructField("REFERENCE_NUMBER", StringType(), True),
StructField("WALLET_NUMBER", StringType(), True),
StructField("SOURCE_ACCOUNT_NUMBER", StringType(), True),
StructField("DESTINATION", StringType(), True),
StructField("FC_REFERENCE_NUMBER", StringType(), True),
StructField("TRANSACTION_TYPE", StringType(), True),
StructField("PROVIDER", StringType(), True),
StructField("DEST_BANK_CODE", StringType(), True),
StructField("BILL_KEY", StringType(), True),
StructField("GHIPS_REF", StringType(), True),
StructField("USER_CODE", StringType(), True),
StructField("BANK_CODE", StringType(), True)
])
When I used from_json instead of from_avro, the stream I get back contains only null values.
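If the topic is produced by Kafka Connect's JSON converter with value.converter.schemas.enable=true, each record is wrapped in a schema/payload envelope, and applying from_json with the flat schema above returns null for every field. A sketch of parsing under that assumption (the envelope layout is an assumption, not something confirmed from the topic):

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField

# Assumed Kafka Connect envelope: {"schema": {...}, "payload": {...actual record...}}
envelope_schema = StructType([
    StructField("payload", mySchema, True)  # mySchema is the flat schema defined above
])

parsed = (sampleDataFrame
          .selectExpr("CAST(value AS STRING) AS json_value")  # the Kafka value column is binary
          .select(from_json(col("json_value"), envelope_schema).alias("data"))
          .select("data.payload.*"))

If schemas.enable is false, all-null output usually means the field names or types in mySchema don't exactly match the JSON keys, since from_json matching is case-sensitive.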
I am trying to read a JSON file and convert it to CSV in PySpark as below.
df = spark.read.json(inputdir)
I have the below schema which I am imposing on my dataframe.
mechanic_schema = StructType([
StructField("name", StringType(), True),
StructField("some_other_column", StringType(), True),
StructField("url", StringType(), True),
StructField("image", StringType(), True),
StructField("startTime", StringType(), True),
StructField("recipeYield", StringType(), True),
StructField("datePublished", StringType(), True),
StructField("endTime", StringType(), True),
StructField("description", StringType(), True)
])
I am saving the dataframe df to an output directory as below.
df.select(mechanic_schema.names).write.format('csv').option("header","true").save('/Users/bobby/Desktop/output/', header='true')
This is what the output looks like:
df.show()
Now in another script, I am reading the same csv file that I saved in output path of df as below:
df = spark.read.format('csv').option('header', True).load('/Users/bobby/Desktop/output/')
df.show()
But strangely, the output contains many columns that are entirely null, which looks like this:
So I checked my output CSV file and the data looks exactly fine there.
I have never come across this phenomenon before and don't understand what I did wrong here.
Could anyone let me know what is causing this issue and how I can fix it?
Any help is appreciated.
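One common cause (an assumption here, since the files themselves aren't shown): some string fields, such as description, contain commas or embedded newlines, so the rows get split apart when the CSV is read back. Writing and reading with explicit quote/escape/multiLine options usually makes the round trip lossless. A sketch:

# Write with explicit quoting and escaping so embedded commas, quotes and newlines survive.
(df.select(mechanic_schema.names)
   .write.format('csv')
   .option("header", "true")
   .option("quoteAll", "true")
   .option("escape", '"')
   .save('/Users/bobby/Desktop/output/'))

# Read back with matching options; multiLine lets a quoted field span line breaks.
df_back = (spark.read.format('csv')
           .option("header", "true")
           .option("escape", '"')
           .option("multiLine", "true")
           .load('/Users/bobby/Desktop/output/'))
df_back.show()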
I am trying to read a CSV (tilde-separated) in which one of the columns is JSON. When I try to convert it to a struct type I get the exception below.
hashes_event_struct = StructType([
StructField("calculatedHash",StringType(), True),
StructField("canonHash",StringType(), True),
StructField("match",BooleanType(), True)])
hashes_metadata_struct = StructType([
StructField("calculatedHash",StringType(), True),
StructField("canonHash",StringType(), True),
StructField("match",BooleanType(), True)
])
hashes_originator_struct = StructType([
StructField("calculatedHash",StringType(), True),
StructField("canonHash",StringType(), True),
StructField("match",BooleanType(), True)
])
hashes = StructType([
StructField("events",StructType(hashes_event_struct), True),
StructField("metadata",StructType(hashes_metadata_struct), True),
StructField("originator",StructType(hashes_originator_struct), True)
])
cryptoDetail = StructType([
StructField("OTPSource",StringType(), True),
StructField("hashes",StructType(hashes), True),
StructField("highWaterMark",IntegerType(), True),
StructField("highWaterMarkOK",BooleanType(), True),
StructField("keyIndex",IntegerType(), True),
StructField("payloadManifestJec",StringType(), True),
StructField("version",StringType(), True)
])
crypto = StructType([
StructField("cryptoDetail",StructType(cryptoDetail), True),
StructField("cryptoDetailType",StringType(), True)
])
ip_schema_n = StructType([
StructField("crypto",StructType(crypto), True),
StructField("eventDetailType",StringType(), True),
StructField("originatorDetailError",StringType(), True),
StructField("rawJson",StringType(), True)
])
ipdata = 'C:\\Users\\snds\\Downloads\\input-data\\posts_data_crypto.csv'
df_ds_1 = spark.read.format("csv").option("header","True").schema(ip_schema_n).option("sep","~").load(ipdata)
Below is the exception:
135 # Hide where the exception came from that shows a non-Pythonic
136 # JVM exception message.
--> 137 raise_from(converted)
138 else:
139 raise
c:\Spark3\python\pyspark\sql\utils.py in raise_from(e)
AnalysisException: CSV data source does not support struct<cryptoDetail:struct<OTPSource:string,hashes:struct<events:struct<calculatedHash:string,canonHash:string,match:boolean>,metadata:struct<calculatedHash:string,canonHash:string,match:boolean>,originator:struct<calculatedHash:string,canonHash:string,match:boolean>>,highWaterMark:int,highWaterMarkOK:boolean,keyIndex:int,payloadManifestJec:string,version:string>,cryptoDetailType:string> data type.;
The input data is as below:
event,eventDetailType,originatorDetailError,rawJson
{"crypto":{"cryptoDetail":{"OTPSource":"cached","hashes":{"events":{"calculatedHash":"yyyyyyyyyy","canonHash":"yyyyyyyyy","match":true},"metadata":{"calculatedHash":"iiiiiiiiiiiii","canonHash":"kkkkkkkkkkkk","match":true},"originator":{"calculatedHash":"eeeeeeeeeee","canonHash":"ttttttttt","match":true}},"highWaterMark":24,"highWaterMarkOK":true,"keyIndex":24,"payloadManifestJec":"yyyyyy","version":"0.0.1"},"cryptoDetailType":"com.pk.KKK.domain.fish.service.version.1"}}~com.pk.fishing.domain.version.1~null~null
Is there any way I can convert it to a proper struct? Thank you for your help.
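Since Spark's CSV source only supports flat, atomic column types, one workaround (a sketch, assuming the first tilde-separated field holds the JSON document) is to read that column as a plain string and parse it afterwards with from_json:

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Read every column as a string first; the CSV reader cannot materialise struct columns.
flat_schema = StructType([
    StructField("event", StringType(), True),
    StructField("eventDetailType", StringType(), True),
    StructField("originatorDetailError", StringType(), True),
    StructField("rawJson", StringType(), True)
])

df_flat = (spark.read.format("csv")
           .option("header", "True")
           .option("sep", "~")
           .schema(flat_schema)
           .load(ipdata))

# Parse the JSON column into the nested structs defined above ("crypto" wraps cryptoDetail).
event_schema = StructType([StructField("crypto", crypto, True)])
df_parsed = df_flat.withColumn("event", from_json(col("event"), event_schema))
df_parsed.printSchema()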
I have a PySpark script that loads data from a TSV file and saves it as a parquet file as well as a persistent SQL table.
When I run it line by line through the pyspark CLI, it works exactly as expected. When I run it as an application using spark-submit, it runs without any errors but I get strange results:
1. The data is overwritten instead of appended.
2. When I run SQL queries against it, I get no data returned even though the parquet files are several gigabytes in size (which is what I expect).
Any suggestions?
Code:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

csv_file = '/srv/spark/data/input/ipfixminute2018-03-28.tsv'
parquet_dir = '/srv/spark/data/parquet/ipfixminute'

sc = SparkContext(appName='import-ipfixminute')
spark = SQLContext(sc)
fields = [StructField('time_stamp', TimestampType(), True),
StructField('subscriberId', StringType(), True),
StructField('sourceIPv4Address', StringType(), True),
StructField('destinationIPv4Address', StringType(), True),
StructField('service',StringType(), True),
StructField('baseService',StringType(), True),
StructField('serverHostname', StringType(), True),
StructField('rat', StringType(), True),
StructField('userAgent', StringType(), True),
StructField('accessPoint', StringType(), True),
StructField('station', StringType(), True),
StructField('device', StringType(), True),
StructField('contentCategories', StringType(), True),
StructField('incomingOctets', LongType(), True),
StructField('outgoingOctets', LongType(), True),
StructField('incomingShapingDrops', IntegerType(), True),
StructField('outgoingShapingDrops', IntegerType(), True),
StructField('qoeIncomingInternal', DoubleType(), True),
StructField('qoeIncomingExternal', DoubleType(), True),
StructField('qoeOutgoingInternal', DoubleType(), True),
StructField('qoeOutgoingExternal', DoubleType(), True),
StructField('incomingShapingLatency', DoubleType(), True),
StructField('outgoingShapingLatency', DoubleType(), True),
StructField('internalRtt', DoubleType(), True),
StructField('externalRtt', DoubleType(), True),
StructField('HttpUrl',StringType(), True)]
schema = StructType(fields)
df = spark.read.load(csv_file, format='csv',sep='\t',header=True,schema=schema,timestampFormat='yyyy-MM-dd HH:mm:ss')
df = df.drop('all')
df = df.withColumn('date',to_date('time_stamp'))
df.write.saveAsTable('test2',mode='append',partitionBy='date',path=parquet_dir)
As #user8371915 suggested, it is similar to this:
Spark can access Hive table from pyspark but not from spark-submit
I needed to replace
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
with
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
This resolved this issue.
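For newer PySpark versions the same fix can be expressed with a Hive-enabled SparkSession instead of the legacy HiveContext (a sketch, not part of the original answer):

from pyspark.sql import SparkSession

# enableHiveSupport() gives saveAsTable a persistent metastore, so tables written
# by spark-submit stay visible to later SQL queries.
spark = (SparkSession.builder
         .appName('import-ipfixminute')
         .enableHiveSupport()
         .getOrCreate())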
For some reason, Spark is writing blank files. I'm not sure what I'm doing incorrectly.
from pyspark.sql import SparkSession, DataFrame, DataFrameWriter, functions as F
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, TimestampType
if __name__ == "__main__":
print('start')
spark = SparkSession \
.builder \
.appName("testing") \
.config("spark.ui.enabled", "true") \
.master("yarn-client") \
.getOrCreate()
myschema = StructType([\
StructField("field1", TimestampType(), True), \
StructField("field2", TimestampType(), True), \
StructField("field3", StringType(), True),
StructField("field4", StringType(), True), \
StructField("field5", StringType(), True), \
StructField("field6", IntegerType(), True), \
])
df = spark.read.load("s3a://bucket/file.csv",\
format="csv", \
sep=",", \
# inferSchema="true", \
timestampFormat="MM/dd/yyyy HH:mm:ss",
header="true",
schema=myschema
)
print(df.count()) #output is 50
df.write.csv(path="s3a://bucket/folder",\
header="true"
)
Output from the print statement is 50, which is correct. But the output on S3 is just a file with headers and no data. Should I be adding another option to the write function? I'm not sure why I'm not seeing any data being written.
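One thing worth checking (an assumption, since the bucket listing isn't shown): df.write.csv produces one part file per partition, and a header-only part file is simply what an empty partition looks like, so the 50 rows may be sitting in other part-* objects in the same folder. A sketch that reads the whole folder back and, if a single file is wanted, coalesces before writing (the folder_single path is just an example):

# Read the whole output folder back; Spark picks up every part file in it.
check = spark.read.csv("s3a://bucket/folder", header=True, schema=myschema)
print(check.count())  # should print 50 if the data really was written

# Optional: force a single part file so there is only one object to inspect.
(df.coalesce(1)
   .write.mode("overwrite")
   .csv(path="s3a://bucket/folder_single", header=True))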