How to write the same stream with different dataframes in console format? - apache-spark

I am new to Spark Structured Streaming and am facing an issue with a simple scenario:
I am trying to write one stream with two different dataframes.
from pyspark.sql import functions as f

df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "topic1") \
    .option("failOnDataLoss", False) \
    .option("startingOffsets", "earliest") \
    .load()

data1 = df.filter(f.col('status') == 'true')

data2 = df.filter(f.col('status') == 'false')
data2 = data2.select(df.id, f.struct(df.col1, df.col2, df.col3).alias('value'))
data2 = data2.groupBy("id").agg(f.collect_set('value').alias('history'))

data1 = data1.writeStream.format("console").option("truncate", "False").trigger(processingTime='15 seconds').start()
data2 = data2.writeStream.format("console").option("truncate", "False").trigger(processingTime='15 seconds').start()

spark.streams.awaitAnyTermination()
spark.streams.awaitAnyTermination()
I am getting the below error:
Traceback (most recent call last):
File "/home/adarshbajpai/Downloads/spark-2.4.4-bin-hadoop2.7/python/lib/pyspark.zip/pyspark /sql/utils.py", line 63, in deco
File "/home/adarshbajpai/Downloads/spark-2.4.4-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o186.start.
: org.apache.spark.sql.AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;;
Aggregate [customerid#93L], [customerid#93L, collect_set(hist_value#278, 0, 0) AS histleadstatus#284]
+- Project [customerid#93L, named_struct(islaststatus, islaststatus#46, statusid, statusid#43, status, statusname#187, createdOn, statusCreatedDate#59, updatedOn, statusUpdatedDate#60) AS hist_value#278]
+- Filter (islaststatus#46 = 0)
I think I should not need a watermark, as my streaming has no delay or latency.
Please suggest! Thanks in advance.
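For reference, a minimal sketch of one common way past this error (an addition, not the asker's code): keep the non-aggregated query in the default append mode, and write the aggregated query with update (or complete) output mode, which supports streaming aggregations without a watermark. Only the last three lines of the script above would change:

query1 = data1.writeStream \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='15 seconds') \
    .start()

# Aggregations without a watermark are allowed in "update" (and "complete") mode,
# at the cost of keeping the aggregation state for all ids indefinitely.
query2 = data2.writeStream \
    .outputMode("update") \
    .format("console") \
    .option("truncate", "false") \
    .trigger(processingTime='15 seconds') \
    .start()

spark.streams.awaitAnyTermination()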

Related

How to run pyspark code that gets data from Kafka and converts it to a dataframe?

I am trying to read a Kafka topic into a Spark dataframe; the code is the following:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create a SparkSession
spark = SparkSession.builder \
    .appName("KafkaStreamToDataFrame") \
    .getOrCreate()

# Define the schema for the data in the Kafka stream
schema = StructType([
    StructField("key", StringType()),
    StructField("value", StringType())
])

# Read the data from the Kafka stream
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka_host:9092") \
    .option("subscribe", "ext_device-measurement_10121") \
    .load() \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

# Start the stream and display the data in the DataFrame
query = df \
    .writeStream \
    .format("console") \
    .start()

query.awaitTermination()
I'm trying to execute the code by using spark-submit:
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 dev_ev.py
Spark Version: 3.3.1
Getting the following error when trying to execute:
File "/home/avs/avnish_spark/dev_ev.py", line 21, in <module>
.option("subscribe", "ext_device-measurement_10121") \
File "/opt/avnish/spark-3.3.1-bin-hadoop3/python/lib/pyspark.zip/pyspark/sql/streaming.py", line 469, in load
File "/opt/avnish/spark-3.3.1-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1322, in __call__
File "/opt/avnish/spark-3.3.1-bin-hadoop3/python/lib/pyspark.zip/pyspark/sql/utils.py", line 190, in deco
File "/opt/avnish/spark-3.3.1-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o35.load.
: java.lang.NoClassDefFoundError: scala/$less$colon$less
at org.apache.spark.sql.kafka010.KafkaSourceProvider.org$apache$spark$sql$kafka010$KafkaSourceProvider$$validateStreamOptions(KafkaSourceProvider.scala:338)
at org.apache.spark.sql.kafka010.KafkaSourceProvider.sourceSchema(KafkaSourceProvider.scala:71)
at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:236)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:34)
at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:168)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:144)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.ClassNotFoundException: scala.$less$colon$less
at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 20 more
Not sure what's wrong; the Kafka topic is reachable and is pushing JSON records.
I also tried downloading the jar file manually, saving it in the jars directory under SPARK_HOME, and then executing:
spark-submit --jars $SPARK_HOME/jars/org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.1 dev_ev.py
I was expecting a dataframe to be displayed.
I added startingOffsets in your code and it worked for me without any exceptions.
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "test.topic") \
    .option("startingOffsets", "latest") \
    .load() \
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
    .select(from_json(col("value"), schema).alias("data")) \
    .select("data.*")

Python Proxy Error when streaming XML files from Azure Event Hub using Databricks

I've got the below piece of code to retrieve XML files, extract some of the tags, and save them as CSV files. As tag values need to be saved as separate files, I'm using the foreachBatch method of df.writeStream to extract and save them separately. See below for the environment/version, the code used, and the error returned when executed on Azure Databricks.
Environment:
Databricks Runtime version: 10.4 LTS
Apache Spark 3.2.1,
Scala 2.12
Event hubs library from maven: com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.22
Code:
# Databricks notebook source
import lxml.etree as ET
import pyspark.sql.types as T
from os.path import dirname, join
from pyspark.sql.functions import udf  # explicit import of udf, used below for extract_udf

# Define namespaces found in the xml files to pick elements from the default ("") namespace or specific namespace
namespaces = {
    "": "http://www.fpml.org/FpML-5/reporting",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance"
}

# trade date **********************************
trade_header = T.StructType([
    T.StructField("messageId", T.StringType(), False),
    T.StructField("tradeDate", T.StringType(), False)
])

def to_xml_message_trade_date(xml_string):
    root = ET.fromstring(xml_string)
    messageId = root.find(".//messageId", namespaces).text
    tradeDate = root.find(".//tradeDate", namespaces).text
    return [messageId, tradeDate]

extract_udf = udf(to_xml_message_trade_date, trade_header)
# **********************************************

connectionString = "Endpoint=sb://xxxxxx.servicebus.windows.net/;SharedAccessKeyName=xxxx;SharedAccessKey=xxxxxxx;EntityPath=xxxxx"
ehConf = {
    'eventhubs.connectionString': sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(connectionString)
}

stream_data = spark \
    .readStream \
    .format("eventhubs") \
    .options(**ehConf) \
    .option('multiLine', True) \
    .option('mode', 'PERMISSIVE') \
    .load()

df_str = stream_data.withColumn("data", stream_data["body"].cast("string"))

def write2csv(df, epoch_id):
    df.persist()
    df_tuples = df.select(extract_udf("data").alias("extracted_data"))
    df_parsed = df_tuples.select("extracted_data.*")
    df_parsed \
        .write \
        .format("csv") \
        .mode(SaveMode.Append) \
        .option("header", True) \
        .save("dbfs:/FileStore/Incoming/trade_date/")
    df.unpersist()

query = df_str \
    .writeStream \
    .outputMode("append") \
    .foreachBatch(write2csv) \
    .trigger(processingTime="1 seconds") \
    .start()

query.awaitTermination()
Error returned:
StreamingQueryException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
StreamingQueryException Traceback (most recent call last)
<command-1879221600357983> in <module>
6 .start()
7
----> 8 query.awaitTermination()
9
10 # .format("csv") \
/databricks/spark/python/pyspark/sql/streaming.py in awaitTermination(self, timeout)
101 return self._jsq.awaitTermination(int(timeout * 1000))
102 else:
--> 103 return self._jsq.awaitTermination()
104
105 #property
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
I can normally stream and save tag values in a single file using the below code snippet, but the issue occurs when I use foreachBatch to save tag values in separate files.
df_trade_date \
    .writeStream \
    .format("csv") \
    .trigger(processingTime="30 seconds") \
    .option("checkpointLocation", "dbfs:/FileStore/checkpoint/") \
    .option("path", "dbfs:/FileStore/Incoming/trade_date/") \
    .option("header", True) \
    .outputMode("append") \
    .start() \
    .awaitTermination()
What am I missing here? Are there any suggestions?
Changing the write2csv function as below fixed the issue:
def write2csv(df, epoch_id):
    df.persist()
    df_tuples = df.select(extract_udf("data").alias("extracted_data"))
    df_parsed = df_tuples.select("extracted_data.*")
    df_parsed \
        .write \
        .format("csv") \
        .mode("append") \
        .option("header", True) \
        .save("dbfs:/FileStore/Incoming/trade_date/")
    df.unpersist()
Note the .mode("append") line, where I replaced SaveMode.Append with the plain string "append".
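For context (an addition, not part of the original answer): in PySpark, DataFrameWriter.mode() takes a string such as "append", "overwrite", "ignore" or "error"/"errorifexists"; the SaveMode enum belongs to the Scala/Java API, so referencing it from a Python notebook raises a NameError inside the foreachBatch function, which presumably surfaced as the Python proxy error above. A small sketch of equivalent ways to request append semantics from the Python API (df_parsed standing for the per-batch dataframe from the function above):

# Both calls below ask for append semantics from the Python API.
df_parsed.write.format("csv").mode("append").option("header", True) \
    .save("dbfs:/FileStore/Incoming/trade_date/")

df_parsed.write.format("csv").option("header", True) \
    .save("dbfs:/FileStore/Incoming/trade_date/", mode="append")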

Attempting to write a pyspark function to connect to SQL Server with Databricks on Apache Spark

I am trying to write a function that I can use to connect to SQL Server from Databricks.
My attempt is as follows:
def readFromDb():
    jdbcDF = (spark.read
        .format("jdbc")
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
        .option("url", "jdbc:sqlserver://mysqlserver.database.windows.net;database=mydatabase")
        .option("user", 'myusername')
        .option("query", 'query')
        .option("password", 'myquery')
        .load()
    )
But I keep on getting the error:
com.microsoft.sqlserver.jdbc.SQLServerException: Incorrect syntax near ')'.
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-2138842992712231> in <module>
----> 1 readFromDb()
<command-2138842992712230> in readFromDb()
1 def readFromDb():
----> 2 jdbcDF = (spark.read
Any thoughts on the error?
Replace the user and password options with the actual values for your DB, and the query option with the query you want to run.
Also don't forget to add a return at the end of the function.
e.g:
.option("query", 'select * from information_schema.tables')

Unable to send Pyspark data frame to Kafka topic

I am trying to send data from a daily batch to a Kafka topic using pyspark, but I currently receive the following error:
Traceback (most recent call last):
  File "", line 5, in <module>
  File "/usr/local/rms/lib/hdp26_c5000/spark2/python/pyspark/sql/readwriter.py", line 548, in save
    self._jwrite.save()
  File "/usr/local/rms/lib/hdp26_c5000/spark2/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1133, in __call__
  File "/usr/local/rms/lib/hdp26_c5000/spark2/python/pyspark/sql/utils.py", line 71, in deco
    raise AnalysisException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.AnalysisException: u"Invalid call to toAttribute on unresolved object, tree: unresolvedalias('shop_id, None)"
The code I am using is as follows:
from pyspark.sql import SparkSession
from pyspark.sql import functions

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.debug.maxToStringFields", 100000) \
    .getOrCreate()

df = spark.sql('''select distinct shop_id, item_id
                  from sale.data
               ''')

df.selectExpr("shop_id", "item_id") \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "myserver.local:443") \
    .option("topic", "test_topic_01") \
    .save()
Currently used versions are:
- Spark 2.1.1.2.6.2.0-205
- Kafka Broker 0.11
Kafka expects a key and a value to be written into its topic, although the key is not mandatory. The Kafka sink determines them by looking at the names of the dataframe columns, which should be "key" and "value".
In your query there is no column named "key" or "value" (you only select "shop_id" and "item_id"). The error message "unresolvedalias('shop_id, None)" tells you that the column "shop_id" is selected as the key (as it is the first column), but nothing can be interpreted as the mandatory value.
You can solve your issue by renaming the column to "value", something like:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

df = spark.sql('''select distinct shop_id, item_id
                  from sale.data
               ''')

# cast() needs a DataType instance (or the string "string"), hence StringType()
df.withColumn("value", col("shop_id").cast(StringType())) \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "myserver.local:443") \
    .option("topic", "test_topic_01") \
    .save()
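As a variation (an addition, not part of the original answer), if both shop_id and item_id should end up in the message, they can be packed into a single JSON "value" column, with shop_id optionally kept as the message key:

from pyspark.sql.functions import col, struct, to_json

df.select(
        col("shop_id").cast("string").alias("key"),           # optional message key
        to_json(struct("shop_id", "item_id")).alias("value")  # mandatory message value
    ) \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "myserver.local:443") \
    .option("topic", "test_topic_01") \
    .save()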

How to connect to a secured Kafka cluster from Zeppelin ("Failed to construct kafka consumer")?

I am trying to read some data from a Kafka broker using Structured Streaming and display it in a Zeppelin note. I am using Spark 2.4.3, Scala 2.11, Python 2.7, Java 9 and Kafka 2.2 with SSL enabled, hosted on Heroku, but I get StreamingQueryException: 'Failed to construct kafka consumer'.
I am using the following dependencies (set in the Spark interpreter settings):
org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.3
org.apache.spark:spark-streaming_2.11:2.4.3
org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.3
I have tried older and newer versions, but these should match Spark/Scala versions I am using.
I have successfully written and read from Kafka using simple Python producer and consumer.
The code I am using:
%pyspark
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when

schema = StructType().add("power", IntegerType()).add("colorR", IntegerType()).add("colorG", IntegerType()).add("colorB", IntegerType()).add("colorW", IntegerType())

df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", brokers) \
    .option("kafka.security.protocol", "SSL") \
    .option("kafka.ssl.truststore.location", "/home/ubuntu/kafka/truststore.jks") \
    .option("kafka.ssl.keystore.location", "/home/ubuntu/kafka/keystore.jks") \
    .option("kafka.ssl.keystore.password", password) \
    .option("kafka.ssl.truststore.password", password) \
    .option("kafka.ssl.endpoint.identification.algorithm", "") \
    .option("startingOffsets", "earliest") \
    .option("subscribe", topic) \
    .load()

schema = ArrayType(
    StructType([StructField("power", IntegerType()),
                StructField("colorR", IntegerType()),
                StructField("colorG", IntegerType()),
                StructField("colorB", IntegerType()),
                StructField("colorW", IntegerType())]))

readDF = df.select(
    col("key").cast("string"),
    from_json(col("value").cast("string"), schema))

query = readDF.writeStream.format("console").start()
query.awaitTermination()
And the error I get:
Fail to execute line 43: query.awaitTermination()
Traceback (most recent call last):
File "/tmp/zeppelin_pyspark-2171412221151055324.py", line 380, in <module>
exec(code, _zcUserQueryNameSpace)
File "<stdin>", line 43, in <module>
File "/home/ubuntu/spark/python/lib/pyspark.zip/pyspark/sql/streaming.py", line 103, in awaitTermination
return self._jsq.awaitTermination()
File "/home/ubuntu/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/ubuntu/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 75, in deco
raise StreamingQueryException(s.split(': ', 1)[1], stackTrace)
StreamingQueryException: u'Failed to construct kafka consumer\n=== Streaming Query ===\nIdentifier: [id = 2ee20c47-8293-469a-bc0b-ef71a1f118bc, runId = 72422290-090a-4b6d-bd66-088a5a534240]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nProject [cast(key#7 as string) AS key#22, jsontostructs(ArrayType(StructType(StructField(power,IntegerType,true), StructField(colorR,IntegerType,true), StructField(colorG,IntegerType,true), StructField(colorB,IntegerType,true), StructField(colorW,IntegerType,true)),true), cast(value#8 as string), Some(Etc/UTC)) AS jsontostructs(CAST(value AS STRING))#21]\n+- StreamingExecutionRelation KafkaV2[Subscribe[tanana-44614.lightbulb]], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n'
When I use read and write instead of readStream and writeStream I do not get any errors, but nothing appears on the console when I send some data to Kafka.
What else should I try?
It looks like the Kafka Consumer cannot access ~/kafka/truststore.jks and hence the exception. Replace ~ with the fully-specified path (without the tilde) and the issue should go away.
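As an illustration of that suggestion (an addition, not part of the original answer), the JKS paths can be sanity-checked from Python before building the reader; the JVM does not expand a leading tilde, so the fully-qualified path must exist and be readable on the host running the Spark interpreter:

import os

# Placeholder paths taken from the question; adjust to your own files.
truststore = "/home/ubuntu/kafka/truststore.jks"
keystore = "/home/ubuntu/kafka/keystore.jks"
for path in (truststore, keystore):
    assert os.path.isfile(path), "not readable from this host: %s" % path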
