Spark is writing blank files - apache-spark

For some reason, Spark is writing blank files. Not sure what I've been doing incorrectly.
from pyspark.sql import SparkSession, DataFrame, DataFrameWriter, functions as F
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, TimestampType
if __name__ == "__main__":
print('start')
spark = SparkSession \
.builder \
.appName("testing") \
.config("spark.ui.enabled", "true") \
.master("yarn-client") \
.getOrCreate()
myschema = StructType([\
StructField("field1", TimestampType(), True), \
StructField("field2", TimestampType(), True), \
StructField("field3", StringType(), True),
StructField("field4", StringType(), True), \
StructField("field5", StringType(), True), \
StructField("field6", IntegerType(), True), \
])
df = spark.read.load("s3a://bucket/file.csv",\
format="csv", \
sep=",", \
# inferSchema="true", \
timestampFormat="MM/dd/yyyy HH:mm:ss",
header="true",
schema=myschema
)
print(df.count()) #output is 50
df.write.csv(path="s3a://bucket/folder",\
header="true"
)
Output from the print statement is 50, which is correct. But the output file on S3 has just a file with headers without any data. Should I be adding another option to the write function? I'm not sure why I'm not seeing any data being written

Related

pyspark not finding database in spark-warehouse

I currently have a database called "bronze" with one table inside it that was created using almost the same code as below (just changing the TABLE_NAME and SCHEMA).
import findspark
findspark.init()
import delta
import os
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType
from pyspark.sql import SparkSession, window
from pyspark.sql import functions as F
from os.path import abspath
def upsertToDelta(df, batchId):
'''
In order to guarantee there aren't any duplicated matches, a Window is used to filter matches based on its GameId and UpdatedUtc.
The GameId is used as a group by and UpdatedUtc is used as an order by.
If it's found a duplicated match, the duplicate will be not be saved.
'''
windowSpec = window.Window.partitionBy("GameId").orderBy("UpdatedUtc") # .orderBy(1)
df_new = df.withColumn("row_number", F.row_number().over(windowSpec)).filter("row_number = 1")
( bronzeDeltaTable.alias("bronze")
.merge(df_new.alias("raw"), "bronze.GameId = raw.GameId")
.whenMatchedUpdateAll()
.whenNotMatchedInsertAll()
.execute()
)
warehouse_location = abspath('spark-warehouse')
builder = SparkSession.builder \
.master('local[*]') \
.config("spark.sql.warehouse.dir", warehouse_location) \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
spark = delta.configure_spark_with_delta_pip(builder) \
.getOrCreate()
leaderboards_schema = StructType([
StructField("PlayerId", IntegerType(), False),
StructField("TeamId", IntegerType(), False),
StructField("Name", StringType(), False),
StructField("MatchName", StringType(), False),
StructField("Team", StringType(), False),
StructField("IsClosed", BooleanType(), False),
StructField("GameId", IntegerType(), False),
StructField("OpponentId", IntegerType(), False),
StructField("Opponent", StringType(), False),
StructField("Day", StringType(), True),
StructField("DateTime", StringType(), True),
StructField("Updated", StringType(), True),
StructField("UpdatedUtc", StringType(), True),
StructField("Games", StringType(), True),
StructField("Maps", FloatType(), True),
StructField("FantasyPoints", FloatType(), True),
StructField("Kills", FloatType(), True),
StructField("Assists", FloatType(), True),
StructField("Deaths", FloatType(), True),
StructField("Headshots", FloatType(), True),
StructField("AverageDamagePerRound", FloatType(), True),
StructField("Kast", FloatType(), True),
StructField("Rating", FloatType(), True),
StructField("EntryKills", FloatType(), True),
StructField("QuadKills", FloatType(), True),
StructField("Aces", FloatType(), True),
StructField("Clutch1v2s", FloatType(), True),
StructField("Clutch1v3s", FloatType(), True),
StructField("Clutch1v4s", FloatType(), True),
StructField("Clutch1v5s", FloatType(), True),
])
map_schema = StructType([
StructField("Number", IntegerType(), True),
StructField("Name", StringType(), True),
StructField("Status", StringType(), True),
StructField("CurrentRound", IntegerType(), True),
StructField("TeamAScore", IntegerType(), True),
StructField("TeamBScore", IntegerType(), True),
])
SCHEMAS = {
"tb_leaderboards": leaderboards_schema,
"tb_maps": map_schema
}
if "spark-warehouse" not in os.listdir():
spark.sql("CREATE DATABASE bronze")
try:
for TABLE_NAME in list(SCHEMAS.keys()):
'''
Full load
'''
if TABLE_NAME not in os.listdir('spark-warehouse/bronze.db'):
df = spark.read.parquet(f"raw/{TABLE_NAME}")
windowSpec = window.Window.partitionBy("GameId").orderBy("UpdatedUtc") # .orderBy(1)
df_new = df.withColumn("row_number", F.row_number().over(windowSpec)).filter("row_number = 1").drop("row_number")
df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}") # overwriting it's not overwrititng because it creates a different file name
# df_new.write.format("delta").saveAsTable(name=f"{warehouse_location}.bronze.{TABLE_NAME}", mode="overwrite")
# df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")
bronzeDeltaTable = delta.tables.DeltaTable.forPath(spark, f"spark-warehouse/bronze.db/{TABLE_NAME}") #"bronze"
'''
When new matches lands in raw, a stream is responsible for saving these new matches in bronze.
'''
df_stream = ( spark.readStream
.format("parquet")
.schema(SCHEMAS[TABLE_NAME])
.load(f"raw/{TABLE_NAME}")
)
stream = ( df_stream.writeStream
.foreachBatch(upsertToDelta)
.option("checkpointLocation", f"spark-warehouse/bronze.db/{TABLE_NAME}_checkpoint")
.outputMode("update")
.start()
)
stream.processAllAvailable()
stream.stop()
finally:
spark.stop()
But when I execute the code above I'm getting the error pyspark.sql.utils.AnalysisException: Database 'bronze' not found. The error occurs when trying to execute df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}")
This is the current directory structure
I've already tried to include "spark-warehouse." before "bronze" as also add backquotes on "spark-warehouse", "bronze" and "{TABLE_NAME}" but nothing seems to work.
I'm running the code on Windows 10 with PySpark 3.3.1, Hadoop 3, delta-spark 2.2.0 and Java 11.0.16, but I also tested on Ubuntu 22.04 with the same config.
------------
Edit #1:
Asking ChatGPT for a solution to my problem, it suggested to use save() instead of saveAsTable(). So, changing df_new.write.mode("overwrite").format("delta").saveAsTable(f"bronze.{TABLE_NAME}") to df_new.write.mode("overwrite").format("delta").save(f"spark-warehouse/bronze.db/{TABLE_NAME}") actually saves inside bronze database folder. However, if I run spark.sql("USE bronze") it still gives the same AnalysisException: Database 'bronze' not found error. Also, spark.sql("SHOW DATABASES").show() doesn't show bronze database, it only shows default.
------------
Any solutions to my problem ?
If anyone wants to test in your local machine, here's the repository.
Not too sure, but I think for saveAsTable, you need to set the write mode inside the method as an argument (pyspark.sql.DataFrameWriter.saveAsTable).
Try this:
df.write.format("delta").saveAsTable(
name=f"bronze.{TABLE_NAME}",
mode="overwrite"
)

Issue in writing records in into MYSQL from Spark Structured Streaming Dataframe

I am using below code to write spark Streaming dataframe into MQSQL DB .Below is the kafka topic JSON data format and MYSQL table schema.Column name and types are same to same.
But I am unable to see records written in MYSQL table. Table is empty with zero records.Please suggest.
Kafka Topic Data Fomat
ssingh#RENLTP2N073:/mnt/d/confluent-6.0.0/bin$ ./kafka-console-consumer --topic sarvtopic --from-beginning --bootstrap-server localhost:9092
{"id":1,"firstname":"James ","middlename":"","lastname":"Smith","dob_year":2018,"dob_month":1,"gender":"M","salary":3000}
{"id":2,"firstname":"Michael ","middlename":"Rose","lastname":"","dob_year":2010,"dob_month":3,"gender":"M","salary":4000}
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.appName("SSKafka") \
.getOrCreate()
dsraw = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "sarvtopic") \
.option("startingOffsets", "earliest") \
.load()
ds = dsraw.selectExpr("CAST(value AS STRING)")
dsraw.printSchema()
from pyspark.sql.types import StructField, StructType, StringType,LongType
from pyspark.sql.functions import *
custom_schema = StructType([
StructField("id", LongType(), True),
StructField("firstname", StringType(), True),
StructField("middlename", StringType(), True),
StructField("lastname", StringType(), True),
StructField("dob_year", StringType(), True),
StructField("dob_month", LongType(), True),
StructField("gender", StringType(), True),
StructField("salary", LongType(), True),
])
Person_details_df2 = ds\
.select(from_json(col("value"), custom_schema).alias("Person_details"))
Person_details_df3 = Person_details_df2.select("Person_details.*")
from pyspark.sql import DataFrameWriter
def foreach_batch_function(df, epoch_id):
Person_details_df3.write.jdbc(url='jdbc:mysql://172.16.23.27:30038/securedb', driver='com.mysql.jdbc.Driver', dbtable="sparkkafka", user='root',password='root$1234')
pass
query = Person_details_df3.writeStream.trigger(processingTime='20 seconds').outputMode("append").foreachBatch(foreach_batch_function).start()
query
Out[14]: <pyspark.sql.streaming.StreamingQuery at 0x1fb25503b08>
MYSQL table Schema:
create table sparkkafka(
id int,
firstname VARCHAR(40) NOT NULL,
middlename VARCHAR(40) NOT NULL,
lastname VARCHAR(40) NOT NULL,
dob_year int(40) NOT NULL,
dob_month int(40) NOT NULL,
gender VARCHAR(40) NOT NULL,
salary int(40) NOT NULL,
PRIMARY KEY (id)
);
I presume Person_details_df3 is your streaming dataframe and your spark version is above 2.4.0 version.
To use foreachBatch API write as below:
db_target_properties = {"user":"xxxx", "password":"yyyyy"}
def foreach_batch_function(df, epoch_id):
df.write.jdbc(url='jdbc:mysql://172.16.23.27:30038/securedb', table="sparkkafka", properties=db_target_properties)
pass
query = Person_details_df3.writeStream.outputMode("append").foreachBatch(foreach_batch_function).start()
query.awaitTermination()

Removing Blank fields from Spark Dataframe

I use spark structured streaming to consume a kafka topic which has several type of message(different schema of each type). I define a schema which has all fields for different kind of message.
How can i filter empty fields from dataframe for each row, or how can i read dataframe from kafka with dynamic schema.
val inputDS = spark
.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "localhost:9092")
.option("subscribe", "overview")
.load()
val schemaa: StructType = StructType(
Array(
StructField("title", StringType, true),
StructField("url", StringType, true),
StructField("content", StringType, true),
StructField("collect_time", StringType, true),
StructField("time", StringType, true),
StructField("user_head", StringType, true),
StructField("image", StringType, true)
)
)
inputDS.withColumn("value", from_json($"value".cast(StringType), schemaa))
//.filter() // todo filter empty field
.writeStream
.format("console")
.start()
.awaitTermination()

How to check if n consecutive events from kafka stream is greater or less than threshold limit

I an new to pyspark. I have written a pyspark program to read kafka stream using window operation. I am publishing the below message to kafka every second with different sources and temperatures along with the timestamp.
{"temperature":34,"time":"2019-04-17 12:53:02","source":"1010101"}
{"temperature":29,"time":"2019-04-17 12:53:03","source":"1010101"}
{"temperature":28,"time":"2019-04-17 12:53:04","source":"1010101"}
{"temperature":34,"time":"2019-04-17 12:53:05","source":"1010101"}
{"temperature":45,"time":"2019-04-17 12:53:06","source":"1010101"}
{"temperature":34,"time":"2019-04-17 12:53:07","source":"1010102"}
{"temperature":29,"time":"2019-04-17 12:53:08","source":"1010102"}
{"temperature":28,"time":"2019-04-17 12:53:09","source":"1010102"}
{"temperature":34,"time":"2019-04-17 12:53:10","source":"1010102"}
{"temperature":45,"time":"2019-04-17 12:53:11","source":"1010102"}
How do I check if n consecutive temperature records for a source crosses threshold limit (<30 and >40) and then publish the alerts to Kafka. Also please let me know if the below program is efficient to read the kafka stream or require any changes?
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
from pyspark.sql.functions import avg, window, from_json, from_unixtime, unix_timestamp
import uuid
schema = StructType([
StructField("source", StringType(), True),
StructField("temperature", FloatType(), True),
StructField("time", StringType(), True)
])
spark = SparkSession \
.builder.master("local[8]") \
.appName("test-app") \
.getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", 5)
df1 = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "test") \
.load() \
.selectExpr("CAST(value AS STRING)")
df2 = df1.select(from_json("value", schema).alias(
"sensors")).select("sensors.*")
df3 = df2.select(df2.source, df2.temperature, from_unixtime(
unix_timestamp(df2.time, 'yyyy-MM-dd HH:mm:ss')).alias('time'))
df4 = df3.groupBy(window(df3.time, "2 minutes", "1 minutes"),
df3.source).agg(avg("temperature"))
query1 = df4.writeStream \
.outputMode("complete") \
.format("console") \
.option("checkpointLocation", "/tmp/temporary-" + str(uuid.uuid4())) \
.start()
query1.awaitTermination()

Pyspark: spark-submit not working like CLI

I have a pyspark to load data from a TSV file and save it as parquet file as well save it as a persistent SQL table.
When I run it line by line through pyspark CLI, it works exactly like expected. When I run it as as an application using spark-submit it runs without any errors but I get strange results: 1. the data is overwritten instead of appended. 2. When I run SQL queries against it I get no data returned even though the parquet files are several gigabytes in size (what I expect). Any suggestions?
Code:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import *
csv_file = '/srv/spark/data/input/ipfixminute2018-03-28.tsv'
parquet_dir = '/srv/spark/data/parquet/ipfixminute'
sc = SparkContext(appName='import-ipfixminute')
spark = SQLContext(sc)
fields = [StructField('time_stamp', TimestampType(), True),
StructField('subscriberId', StringType(), True),
StructField('sourceIPv4Address', StringType(), True),
StructField('destinationIPv4Address', StringType(), True),
StructField('service',StringType(), True),
StructField('baseService',StringType(), True),
StructField('serverHostname', StringType(), True),
StructField('rat', StringType(), True),
StructField('userAgent', StringType(), True),
StructField('accessPoint', StringType(), True),
StructField('station', StringType(), True),
StructField('device', StringType(), True),
StructField('contentCategories', StringType(), True),
StructField('incomingOctets', LongType(), True),
StructField('outgoingOctets', LongType(), True),
StructField('incomingShapingDrops', IntegerType(), True),
StructField('outgoingShapingDrops', IntegerType(), True),
StructField('qoeIncomingInternal', DoubleType(), True),
StructField('qoeIncomingExternal', DoubleType(), True),
StructField('qoeOutgoingInternal', DoubleType(), True),
StructField('qoeOutgoingExternal', DoubleType(), True),
StructField('incomingShapingLatency', DoubleType(), True),
StructField('outgoingShapingLatency', DoubleType(), True),
StructField('internalRtt', DoubleType(), True),
StructField('externalRtt', DoubleType(), True),
StructField('HttpUrl',StringType(), True)]
schema = StructType(fields)
df = spark.read.load(csv_file, format='csv',sep='\t',header=True,schema=schema,timestampFormat='yyyy-MM-dd HH:mm:ss')
df = df.drop('all')
df = df.withColumn('date',to_date('time_stamp'))
df.write.saveAsTable('test2',mode='append',partitionBy='date',path=parquet_dir)
As #user8371915 suggested it is similar to this:
Spark can access Hive table from pyspark but not from spark-submit
I needed to replace
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
with
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
This resolved this issue.

Resources