Pyspark Structured streaming processing - apache-spark

I am trying to make a structured streaming application with spark the main idea is to read from a kafka source, process the input, write back to another topic. i have successfully made spark read and write from and to kafka however my problem is with the processing part. I have tried the foreach function to capture every row and process it before writing back to kafka however it always only does the foreach part and never writes back to kafka. If i however remove the foreach part from the writestream it would continue writing but now i lost my processing.
if anyone can give me an example on how to do this with an example i would be extremely grateful.
here is my code
spark = SparkSession \
.builder \
.appName("StructuredStreamingTrial") \
.getOrCreate()
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "KafkaStreamingSource") \
.load()
ds = df \
.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")\
.writeStream \
.outputMode("update") \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("topic", "StreamSink") \
.option("checkpointLocation", "./testdir")\
.foreach(foreach_function)
.start().awaitTermination()
and the foreach_function simply is
def foreach_function(df):
try:
print(df)
except:
print('fail')
pass

Processing the data before writing into Kafka sink in Pyspark based Structured Streaming API,we can easily handle with UDF function for any kind of complex transformation .
example code is in below . This code is trying to read the JSON format message Kafka topic and parsing the message to convert the message from JSON into CSV format and rewrite into another topic. You can handle any processing transformation in place of 'json_formatted' function .
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext
from pyspark.sql.column import Column, _to_java_column
from pyspark.sql.functions import col, struct
from pyspark.sql.functions import udf
import json
import csv
import time
import os
# Spark Streaming context :
spark = SparkSession.builder.appName('pda_inst_monitor_status_update').getOrCreate()
sc = spark.sparkContext
ssc = StreamingContext(sc, 20)
# Creating readstream DataFrame :
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "KafkaStreamingSource") \
.load()
df1 = df.selectExpr( "CAST(value AS STRING)")
df1.registerTempTable("test")
def json_formatted(s):
val_dict = json.loads(s)
return str([
val_dict["after"]["ID"]
, val_dict["after"]["INST_NAME"]
, val_dict["after"]["DB_UNIQUE_NAME"]
, val_dict["after"]["DBNAME"]
, val_dict["after"]["MON_START_TIME"]
, val_dict["after"]["MON_END_TIME"]
]).strip('[]').replace("'","").replace('"','')
spark.udf.register("JsonformatterWithPython", json_formatted)
squared_udf = udf(json_formatted)
df1 = spark.table("test")
df2 = df1.select(squared_udf("value"))
# Declaring the Readstream Schema DataFrame :
df2.coalesce(1).writeStream \
.writeStream \
.outputMode("update") \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("topic", "StreamSink") \
.option("checkpointLocation", "./testdir")\
.start()
ssc.awaitTermination()

Related

Unable to Connect to Apache Spark Kafka Streaming Using SSL on EMR Notebooks

I'm trying to connect to a Kafka consumer secured by SSL using spark structured streaming but I am having issues. I have the Kafka consumer working and reading events using confluent_kafka with the following code:
conf = {'bootstrap.servers': 'url:port',
'group.id': 'group1',
'enable.auto.commit': False,
'security.protocol': 'SSL',
'ssl.key.location': 'file1.key',
'ssl.ca.location': 'file2.pem',
'ssl.certificate.location': 'file3.cert',
'auto.offset.reset': 'earliest'
}
consumer = Consumer(conf)
consumer.subscribe(['my_topic'])
# Reads events without issues
msg = consumer.poll(timeout=0)
I'm having issues replicating this code with Spark Structured Streaming on EMR Notebooks.
This is the current setup I have on EMR Notebooks:
%%configure -f
{
"conf": {
"spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
"livy.rsc.server.connect.timeout":"600s",
"spark.pyspark.python": "python3",
"spark.pyspark.virtualenv.enabled": "true",
"spark.pyspark.virtualenv.type":"native",
"spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv"
}
}
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkFiles
cert_file = 'file3.cert'
pem_file = 'file2.pem'
key_file = 'file3.key'
sc.addFile(f's3://.../{cert_file}')
sc.addFile(f's3://.../{pem_file}')
sc.addFile(f's3://.../{key_file}')
spark = SparkSession\
.builder \
.getOrCreate()
kafka_df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "url:port") \
.option("kafka.group.id", "group1") \
.option("enable.auto.commit", "False") \
.option("kafka.security.protocol", "SSL") \
.option("kafka.ssl.key.location", SparkFiles.get(key_file)) \ # SparkFiles.get() works
.option("kafka.ssl.ca.location", SparkFiles.get(pem_file)) \
.option("kafka.ssl.certificate.location", SparkFiles.get(cert_file)) \
.option("startingOffsets", "earliest") \
.option("subscribe", "my_topic") \
.load()
query = kafka_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
.writeStream \
.outputMode("append") \
.format("console") \
.start()
And no rows appear in the Structured Streaming tab in the spark UI even though I expect the rows to show up instantly since I am using the earliest startingOffsets.
My hypothesis is that readStream doesn't work because the SSL information is not set up correctly. I've looked and haven't found .option() parameters that directly correspond to the confluent_kafka API.
Any help would be appreciated.

Spark streaming: get the max values

Hi I am triying to get the most repeated values from a stream data.
In order to do this I have the following code:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import regexp_extract, col
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'
spark = SparkSession \
.builder \
.appName("SSKafka") \
.getOrCreate()
df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", 'localhost:9092') \
.option("subscribe", 'twitter') \
.option("startingTimestamp", 1000) \
.option("startingOffsets", "earliest") \
.load()
ds = df \
.selectExpr("CAST(value AS STRING)", "timestamp") \
.select(regexp_extract(col('value'), '#(\w+)', 1).alias('hashtags'), 'timestamp')
df_group = ds.withWatermark("timestamp", "5 seconds") \
.groupBy(
'timestamp',
'hashtags'
).agg(
F.count(col('hashtags')).alias('total')
)
query = df_group \
.writeStream \
.outputMode("append") \
.format("console") \
.option("truncate", "False") \
.start()
query.awaitTermination()
The idea is to process a batch of 5 seconds, and show when each batch is processed the current top hashtags most used.
The main idea was using this code without group by timestamp, but I got an error, about that if ds doesn't use timestamp then df_group doesn't use outputMode("append"), and I want to show the update.
Is this possible, how can I do it?
Thanks.

Kafka and pyspark program: Unable to determine why dataframe is empty

Below is my first program working with kafka and pyspark. The code seems to run without exceptions, but the output of my query is empty.
I am initiating spark and kafka. Later, in Kafka initiation, I subscribed the topic = "quickstart-events" and from terminal produced messages for this topic. But when I run this code, it gives me blank dataframes.
How do I resolve?
Code:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession, DataFrame
from pyspark.sql.types import StructType, ArrayType, StructField, IntegerType, StringType, DoubleType
spark = SparkSession.builder \
.appName("Spark-Kafka-Integration") \
.master("local[2]") \
.getOrCreate()
dsraw = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "kafka:9092") \
.option("subscribe", "quickstart-events") \
.load()
ds = dsraw.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
print(type(ds))
rawQuery = dsraw \
.writeStream \
.queryName("query1")\
.format("memory")\
.start()
raw = spark.sql("select * from query1")
raw.show() # empty output
rawQuery = ds \
.writeStream \
.queryName("query2")\
.format("memory")\
.start()
raw = spark.sql("select * from query2")
raw.show() # empty output
print("complete")
Output:
+---+-----+-----+---------+------+---------+-------------+
|key|value|topic|partition|offset|timestamp|timestampType|
+---+-----+-----+---------+------+---------+-------------+
+---+-----+-----+---------+------+---------+-------------+
+---+-----+
|key|value|
+---+-----+
+---+-----+
if you are learning and experimenting with kafka spark streaming then it is fine.
just use:
while (True):
time.sleep(5)
print("queryresult")
raw.show() # it will start printing the result
instead of
raw.show() # it will run only once that's why not printig the result.
DO NOT USE for Production code.
Better to write like:
spark = SparkSession.builder \
.appName("Spark-Kafka-Integration") \
.master("local[2]") \
.getOrCreate()
dsraw = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "kafka:9092") \
.option("subscribe", "quickstart-events") \
.load()
ds = dsraw.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
rawQuery = \
ds \
.writeStream \
.format("console") \
.outputMode("append") \
.start()
rawQuery.awaitTermination()
it will automatically print the result on the console.

Spark Structred Streaming Pyspark Sink Csv Does'nt Append

Write json to Kafka Topic and read json from kafka topic. Actually I subscribe topic and write console line by line. But I have to sink/write file csv. But I can't. I write csv one time but doesn't append.
You can see my code bellow.
Thank you!
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as func
spark = SparkSession.builder\
.config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0') \
.appName('kafka_stream_test')\
.getOrCreate()
ordersSchema = StructType() \
.add("a", StringType()) \
.add("b", StringType()) \
.add("c", StringType()) \
.add("d", StringType())\
.add("e", StringType())\
.add("f", StringType())
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "product-views") \
.load()\
df_query = df \
.selectExpr("cast(value as string)") \
.select(func.from_json(func.col("value").cast("string"),ordersSchema).alias("parsed"))\
.select("parsed.a","parsed.b","parsed.c","parsed.d","parsed.e","parsed.f")\
df = df_query \
.writeStream \
.format("csv")\
.trigger(processingTime = "5 seconds")\
.option("path", "/var/kafka_stream_test_out/")\
.option("checkpointLocation", "/user/kafka_stream_test_out/chk") \
.start()
df.awaitTermination()
Yes, because you need this extra option .option("format", "append") :
aa = df_query \
.writeStream \
.format("csv")\
.option("format", "append")\
.trigger(processingTime = "5 seconds")\
.option("path", "/var/kafka_stream_test_out/")\
.option("checkpointLocation", "/user/kafka_stream_test_out/chk") \
.outputMode("append") \
.start()

Spark Structural Streaming with Confluent Cloud Kafka connectivity issue

I am writing a Spark structured streaming application in PySpark to read data from Kafka in Confluent Cloud. The documentation for the spark readstream() function is too shallow and didn't specify much on the optional parameter part especially on the auth mechanism part. I am not sure what parameter goes wrong and crash the connectivity. Can anyone have experience in Spark help me to start this connection?
Required Parameter
> Consumer({'bootstrap.servers':
> 'cluster.gcp.confluent.cloud:9092',
> 'sasl.username':'xxx',
> 'sasl.password': 'xxx',
> 'sasl.mechanisms': 'PLAIN',
> 'security.protocol': 'SASL_SSL',
> 'group.id': 'python_example_group_1',
> 'auto.offset.reset': 'earliest' })
Here is my pyspark code:
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "cluster.gcp.confluent.cloud:9092") \
.option("subscribe", "test-topic") \
.option("kafka.sasl.mechanisms", "PLAIN")\
.option("kafka.security.protocol", "SASL_SSL")\
.option("kafka.sasl.username","xxx")\
.option("kafka.sasl.password", "xxx")\
.option("startingOffsets", "latest")\
.option("kafka.group.id", "python_example_group_1")\
.load()
display(df)
However, I keep getting an error:
kafkashaded.org.apache.kafka.common.KafkaException: Failed to
construct kafka consumer
DataBrick Notebook- for testing
https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/4673082066872014/3543014086288496/1802788104169533/latest.html
Documentation
https://home.apache.org/~pwendell/spark-nightly/spark-branch-2.0-docs/latest/structured-streaming-kafka-integration.html
This error indicates that JAAS configuration is not visible to your Kafka consumer. To solve this issue include JASS based on the follow steps:
Step01: Create a file for below JAAS file : /home/jass/path
KafkaClient {
com.sun.security.auth.module.Krb5LoginModule required
useTicketCache=true
renewTicket=true
serviceName="kafka";
};
Step02: Call that JASS file path in spark-submit based on the below conf parameter .
--conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=/home/jass/path"
Full spark-submit command :
/usr/hdp/2.6.1.0-129/spark2/bin/spark-submit --packages com.databricks:spark-avro_2.11:3.2.0,org.apache.spark:spark-avro_2.11:2.4.0,org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 --conf spark.ui.port=4055 --files /home/jass/path,/home/bdpda/bdpda.headless.keytab --conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=/home/jass/path" --conf "spark.driver.extraJavaOptions=-Djava.security.auth.login.config=/home/jass/path" pysparkstructurestreaming.py
Pyspark Structured streaming sample code :
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext
import time
# Spark Streaming context :
spark = SparkSession.builder.appName('PythonStreamingDirectKafkaWordCount').getOrCreate()
sc = spark.sparkContext
ssc = StreamingContext(sc, 20)
# Kafka Topic Details :
KAFKA_TOPIC_NAME_CONS = "topic_name"
KAFKA_OUTPUT_TOPIC_NAME_CONS = "topic_to_hdfs"
KAFKA_BOOTSTRAP_SERVERS_CONS = 'kafka_server:9093'
# Creating readstream DataFrame :
df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS_CONS) \
.option("subscribe", KAFKA_TOPIC_NAME_CONS) \
.option("startingOffsets", "earliest") \
.option("kafka.security.protocol","SASL_SSL")\
.option("kafka.client.id" ,"Clinet_id")\
.option("kafka.sasl.kerberos.service.name","kafka")\
.option("kafka.ssl.truststore.location", "/home/path/kafka_trust.jks") \
.option("kafka.ssl.truststore.password", "password_rd") \
.option("kafka.sasl.kerberos.keytab","/home/path.keytab") \
.option("kafka.sasl.kerberos.principal","path") \
.load()
df1 = df.selectExpr( "CAST(value AS STRING)")
# Creating Writestream DataFrame :
df1.writeStream \
.option("path","target_directory") \
.format("csv") \
.option("checkpointLocation","chkpint_directory") \
.outputMode("append") \
.start()
ssc.awaitTermination()
We need to specified kafka.sasl.jaas.config to add the username and password for the Confluent Kafka SASL-SSL auth method. Its parameter looks a bit odd, but it's working.
df = spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "pkc-43n10.us-central1.gcp.confluent.cloud:9092") \
.option("subscribe", "wallet_txn_log") \
.option("startingOffsets", "earliest") \
.option("kafka.security.protocol","SASL_SSL") \
.option("kafka.sasl.mechanism", "PLAIN") \
.option("kafka.sasl.jaas.config", """kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="xxx" password="xxx";""").load()
display(df)

Resources