Why can't PySpark show any data? - apache-spark

When I use a local Spark installation on Windows, as shown below, it works and I can see the output of "df.count()":
# Local session that uses every available core on this machine.
spark = (
    SparkSession.builder
    .appName("Structured Streaming ")
    .master("local[*]")
    .getOrCreate()
)

# Subscribe to the Kafka topic; "latest" means only records produced after the
# query starts are read.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic_name)
    .option("startingOffsets", "latest")
    .load()
)

flower_df1 = df.selectExpr("CAST(value AS STRING)", "timestamp")

# Each CSV field needs its own column name; declaring "sepal_length" four times
# leaves the parsed columns impossible to tell apart.
# NOTE(review): assumes the usual iris column order - confirm against the producer.
flower_schema_string = (
    "sepal_length DOUBLE,sepal_width DOUBLE,"
    "petal_length DOUBLE,petal_width DOUBLE,species STRING"
)

# Parse the CSV payload into a struct, then flatten it next to the timestamp.
flower_df2 = (
    flower_df1
    .select(from_csv(col("value"), flower_schema_string).alias("flower"), "timestamp")
    .select("flower.*", "timestamp")
)
flower_df2.createOrReplaceTempView("flower_find")
song_find_text = spark.sql("SELECT * FROM flower_find")

# Write the stream to the memory sink under the query name "testedTable",
# which is the table name the polling loop below selects from.
flower_agg_write_stream = (
    song_find_text.writeStream
    .option("truncate", "false")
    .format("memory")
    .outputMode("update")
    .queryName("testedTable")
    .start()
)

# Poll the in-memory table once per second and print how many rows it holds.
while True:
    df = spark.sql("SELECT * FROM testedTable")
    print(df.count())
    time.sleep(1)
But when I use the Spark installation on my VirtualBox Ubuntu VM, I NEVER SEE any data.
Below are the modifications I made when using Spark on Ubuntu:
Changed the SparkSession's master URL to "spark://192.168.15.2:7077"
Inserted the code flower_agg_write_stream.awaitTermination() above "while True:"
Did I do something wrong?
ADDED:
When I run the modified code, the log shows the following:
...
org.apache.spark.sql.AnalysisException: Table or view not found: testedTable;
...
Unfortunately, I have already tried createOrReplaceGlobalTempView(), but it doesn't work either.

Related

Reading from Azure Event hub with Kafka driver doesn't seem to get any data

I'm running the following code in an Azure Databricks python notebook:
# Event Hubs Kafka-endpoint settings used by the reader below.
TOPIC = "myeventhub"
BOOTSTRAP_SERVERS = "myeventhubns.servicebus.windows.net:9093"
EH_SASL = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"Endpoint=sb://myeventhubns.servicebus.windows.net/;SharedAccessKeyName=MyKeyName;SharedAccessKey=myaccesskey;\";"

# Streaming reader over the Kafka endpoint, starting from the earliest offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "earliest")
    .load()
)

# Start an append-mode console query and block on it; df_write receives
# whatever awaitTermination() returns, not the query handle.
console_query = df.writeStream.outputMode("append").format("console").start()
df_write = console_query.awaitTermination()
This shows no output in the notebook. How could I debug what the problem is?
If you use .format("console") then output won't be in the notebook, it will be in the driver & executor logs - it's a difference between Spark and Databricks.
If you want to see the data, just use the display function:
display(df)
This code is now writing data with quite low latency. Newest datapoint is around 10 seconds old when I do a select in a sql warehouse. The problem still is that foreachBatch is not run, but otherwise it's working.
# Event Hubs Kafka-endpoint settings (the connection string carries an EntityPath).
TOPIC = "myeventhub"
BOOTSTRAP_SERVERS = "myeventhub.servicebus.windows.net:9093"
EH_SASL = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"Endpoint=sb://myeventhub.servicebus.windows.net/;SharedAccessKeyName=mykeyname;SharedAccessKey=mykey;EntityPath=myentitypath;\";"

# Streaming reader over the Kafka endpoint, starting from the earliest offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "earliest")
    .load()
)
# Run the OPTIMIZE statement on every n-th invocation of run_command.
n = 100
# Number of times run_command has been invoked so far.
count = 0


def run_command(batchDF, epoch_id):
    """Count invocations and issue an OPTIMIZE statement on every n-th one.

    Written with the two-argument signature that ``foreachBatch`` callbacks
    take; neither argument is read here.

    Args:
        batchDF: The micro-batch DataFrame (unused).
        epoch_id: The micro-batch identifier (unused).
    """
    global count
    count += 1
    if count % n == 0:
        spark.sql("OPTIMIZE firstcatalog.bronze.factorydatas3 ZORDER BY (readtimestamp)")
...Omitted code where I transform the data in the value column to strongly typed data...
# Append the typed stream to a Delta table, partitioned by "somecolumn".
# NOTE(review): foreachBatch(...) and format("delta")/toTable(...) each choose a
# sink for the same writer; presumably the later format("delta") wins, which
# would explain run_command never being invoked - confirm against the
# DataStreamWriter documentation.
myTypedDF.writeStream \
.foreachBatch(run_command) \
.format("delta") \
.outputMode("append") \
.option("checkpointLocation", "/tmp/delta/events/_checkpoints/") \
.partitionBy("somecolumn") \
.toTable("myunitycatalog.bronze.mytable")

Spark streaming: get the max values

Hi, I am trying to get the most repeated values from streaming data.
In order to do this I have the following code:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import regexp_extract, col
# Request the Kafka connector packages before the session is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 pyspark-shell'

spark = (
    SparkSession.builder
    .appName("SSKafka")
    .getOrCreate()
)

# Stream the 'twitter' topic from the local broker.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", 'localhost:9092')
    .option("subscribe", 'twitter')
    .option("startingTimestamp", 1000)
    .option("startingOffsets", "earliest")
    .load()
)

# Extract the first hashtag of each message. The pattern is a raw string so
# that \w is passed to the regex engine as written instead of relying on
# Python tolerating an invalid escape sequence.
ds = (
    df
    .selectExpr("CAST(value AS STRING)", "timestamp")
    .select(regexp_extract(col('value'), r'#(\w+)', 1).alias('hashtags'), 'timestamp')
)

# Count occurrences per (timestamp, hashtag) under a 5-second watermark.
df_group = (
    ds.withWatermark("timestamp", "5 seconds")
    .groupBy('timestamp', 'hashtags')
    .agg(F.count(col('hashtags')).alias('total'))
)

# Print the counts to the console in append mode and block until the query ends.
query = (
    df_group.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "False")
    .start()
)
query.awaitTermination()
The idea is to process the data in 5-second batches and, each time a batch is processed, show the current most-used hashtags.
My original idea was to use this code without grouping by timestamp, but I got an error: if the aggregation does not group by the timestamp column, then df_group cannot be written with outputMode("append"), and I want to show the updated counts.
Is this possible? How can I do it?
Thanks.

Kafka and pyspark program: Unable to determine why dataframe is empty

Below is my first program working with kafka and pyspark. The code seems to run without exceptions, but the output of my query is empty.
I start Spark and Kafka. In the Kafka setup I subscribed to the topic "quickstart-events" and produced messages for this topic from a terminal. But when I run this code, it gives me empty DataFrames.
How do I resolve?
Code:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession, DataFrame
from pyspark.sql.types import StructType, ArrayType, StructField, IntegerType, StringType, DoubleType
# Local two-core session for the Kafka integration experiment.
spark = (
    SparkSession.builder
    .appName("Spark-Kafka-Integration")
    .master("local[2]")
    .getOrCreate()
)

# Unparsed Kafka records from the "quickstart-events" topic.
dsraw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "quickstart-events")
    .load()
)

# Same stream with key and value cast to strings.
ds = dsraw.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
print(type(ds))

# First memory-sink query: the raw records, queried once right after start().
rawQuery = dsraw.writeStream.queryName("query1").format("memory").start()
raw = spark.sql("select * from query1")
raw.show() # empty output

# Second memory-sink query: the string-cast records, again queried only once.
rawQuery = ds.writeStream.queryName("query2").format("memory").start()
raw = spark.sql("select * from query2")
raw.show() # empty output
print("complete")
Output:
+---+-----+-----+---------+------+---------+-------------+
|key|value|topic|partition|offset|timestamp|timestampType|
+---+-----+-----+---------+------+---------+-------------+
+---+-----+-----+---------+------+---------+-------------+
+---+-----+
|key|value|
+---+-----+
+---+-----+
If you are just learning and experimenting with Kafka and Spark streaming, then this is fine.
Just use:
import time

# Re-run the query every 5 seconds so rows that reach the memory table after
# start-up are eventually displayed.
while True:
    time.sleep(5)
    print("queryresult")
    raw.show()  # it will start printing the result
instead of
raw.show() # it runs only once, which is why no result is printed.
DO NOT USE this in production code.
It is better to write it like this:
# Local two-core session.
spark = (
    SparkSession.builder
    .appName("Spark-Kafka-Integration")
    .master("local[2]")
    .getOrCreate()
)

# Stream the "quickstart-events" topic from the broker.
dsraw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "quickstart-events")
    .load()
)

# Cast key and value to strings.
ds = dsraw.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Send the stream to the console sink in append mode and block until the
# query terminates.
rawQuery = (
    ds.writeStream
    .format("console")
    .outputMode("append")
    .start()
)
rawQuery.awaitTermination()
It will automatically print the result to the console.

Spark Structured Streaming PySpark CSV Sink Doesn't Append

I write JSON to a Kafka topic and read JSON from the Kafka topic. Currently I subscribe to the topic and write the records to the console line by line. But I need to sink/write them to a CSV file, and I can't: the CSV is written once, but later data is not appended.
You can see my code below.
Thank you!
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as func
# Session configured with the Kafka SQL connector package.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0')
    .appName('kafka_stream_test')
    .getOrCreate()
)

# Schema of the JSON payload: six string fields, "a" through "f".
ordersSchema = (
    StructType()
    .add("a", StringType())
    .add("b", StringType())
    .add("c", StringType())
    .add("d", StringType())
    .add("e", StringType())
    .add("f", StringType())
)

# Stream the "product-views" topic. The chain is parenthesized so that no
# trailing backslash can splice the next assignment onto this statement.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "product-views")
    .load()
)

# Parse the JSON value and flatten the parsed struct into six columns.
df_query = (
    df
    .selectExpr("cast(value as string)")
    .select(func.from_json(func.col("value").cast("string"), ordersSchema).alias("parsed"))
    .select("parsed.a", "parsed.b", "parsed.c", "parsed.d", "parsed.e", "parsed.f")
)

# Write CSV files every 5 seconds. The running query gets its own name rather
# than rebinding `df`, which still refers to the source stream.
query = (
    df_query.writeStream
    .format("csv")
    .trigger(processingTime="5 seconds")
    .option("path", "/var/kafka_stream_test_out/")
    .option("checkpointLocation", "/user/kafka_stream_test_out/chk")
    .start()
)
query.awaitTermination()
Yes, because you need this extra option .option("format", "append") :
# Start the CSV file sink with a 5-second processing-time trigger in append mode.
# NOTE(review): "format" is not among the documented file-sink options; the
# append behaviour presumably comes from .outputMode("append") - confirm
# against the Structured Streaming output-sink documentation.
aa = df_query \
.writeStream \
.format("csv")\
.option("format", "append")\
.trigger(processingTime = "5 seconds")\
.option("path", "/var/kafka_stream_test_out/")\
.option("checkpointLocation", "/user/kafka_stream_test_out/chk") \
.outputMode("append") \
.start()

Spark Structural Streaming with Confluent Cloud Kafka connectivity issue

I am writing a Spark Structured Streaming application in PySpark to read data from Kafka in Confluent Cloud. The documentation for Spark's readStream is too shallow and doesn't say much about the optional parameters, especially those for the authentication mechanism. I am not sure which parameter is wrong and breaks the connection. Can anyone with Spark experience help me set up this connection?
Required Parameter
> Consumer({'bootstrap.servers':
> 'cluster.gcp.confluent.cloud:9092',
> 'sasl.username':'xxx',
> 'sasl.password': 'xxx',
> 'sasl.mechanisms': 'PLAIN',
> 'security.protocol': 'SASL_SSL',
> 'group.id': 'python_example_group_1',
> 'auto.offset.reset': 'earliest' })
Here is my pyspark code:
# Streaming reader for "test-topic" on the Confluent Cloud cluster, passing the
# SASL settings as individual kafka.* options.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "cluster.gcp.confluent.cloud:9092")
    .option("subscribe", "test-topic")
    .option("kafka.sasl.mechanisms", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.username", "xxx")
    .option("kafka.sasl.password", "xxx")
    .option("startingOffsets", "latest")
    .option("kafka.group.id", "python_example_group_1")
    .load()
)
display(df)
However, I keep getting an error:
kafkashaded.org.apache.kafka.common.KafkaException: Failed to
construct kafka consumer
Databricks notebook (for testing)
https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/4673082066872014/3543014086288496/1802788104169533/latest.html
Documentation
https://home.apache.org/~pwendell/spark-nightly/spark-branch-2.0-docs/latest/structured-streaming-kafka-integration.html
This error indicates that the JAAS configuration is not visible to your Kafka consumer. To solve this issue, include the JAAS configuration by following the steps below:
Step 01: Create the JAAS file below at /home/jass/path:
KafkaClient {
com.sun.security.auth.module.Krb5LoginModule required
useTicketCache=true
renewTicket=true
serviceName="kafka";
};
Step 02: Pass that JAAS file path to spark-submit using the conf parameter below:
--conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=/home/jass/path"
Full spark-submit command :
/usr/hdp/2.6.1.0-129/spark2/bin/spark-submit --packages com.databricks:spark-avro_2.11:3.2.0,org.apache.spark:spark-avro_2.11:2.4.0,org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 --conf spark.ui.port=4055 --files /home/jass/path,/home/bdpda/bdpda.headless.keytab --conf "spark.executor.extraJavaOptions=-Djava.security.auth.login.config=/home/jass/path" --conf "spark.driver.extraJavaOptions=-Djava.security.auth.login.config=/home/jass/path" pysparkstructurestreaming.py
Pyspark Structured streaming sample code :
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext
import time
# Spark session. Structured Streaming needs only the SparkSession; no DStream
# StreamingContext is created, because nothing here would ever start one.
spark = SparkSession.builder.appName('PythonStreamingDirectKafkaWordCount').getOrCreate()

# Kafka Topic Details :
KAFKA_TOPIC_NAME_CONS = "topic_name"
KAFKA_OUTPUT_TOPIC_NAME_CONS = "topic_to_hdfs"
KAFKA_BOOTSTRAP_SERVERS_CONS = 'kafka_server:9093'

# Creating readstream DataFrame :
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS_CONS)
    .option("subscribe", KAFKA_TOPIC_NAME_CONS)
    .option("startingOffsets", "earliest")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.client.id", "Clinet_id")
    .option("kafka.sasl.kerberos.service.name", "kafka")
    .option("kafka.ssl.truststore.location", "/home/path/kafka_trust.jks")
    .option("kafka.ssl.truststore.password", "password_rd")
    .option("kafka.sasl.kerberos.keytab", "/home/path.keytab")
    .option("kafka.sasl.kerberos.principal", "path")
    .load()
)
df1 = df.selectExpr("CAST(value AS STRING)")

# Creating Writestream DataFrame :
query = (
    df1.writeStream
    .option("path", "target_directory")
    .format("csv")
    .option("checkpointLocation", "chkpint_directory")
    .outputMode("append")
    .start()
)

# Block on the streaming query itself, so the script exits when the query
# stops and a query failure is raised here.
query.awaitTermination()
We need to specify kafka.sasl.jaas.config to supply the username and password for the Confluent Kafka SASL_SSL authentication method. Its value looks a bit odd, but it works.
# Streaming reader for the Confluent Cloud topic; the credentials are supplied
# through the kafka.sasl.jaas.config option.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "pkc-43n10.us-central1.gcp.confluent.cloud:9092")
    .option("subscribe", "wallet_txn_log")
    .option("startingOffsets", "earliest")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.sasl.jaas.config", """kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="xxx" password="xxx";""")
    .load()
)
display(df)

Resources