How do I write to Kafka using pyspark? - apache-spark

I am trying to write to Kafka using PySpark.
I got stuck on stage zero:
[Stage 0:> (0 + 8) / 9]
Then I get a timeout error:
org.apache.kafka.common.errors.TimeoutException: Failed to update metadata after 60000 ms.
Code is:
import os
import sys

# Must be set before pyspark is imported: it tells the launcher which extra
# package (the Kafka source/sink for Structured Streaming) to fetch.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 pyspark-shell'
)

from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *


def main():
    """Stream CSV files from a directory into a Kafka topic.

    The directory is taken from ``sys.argv[1]``. Each row is written with
    ``id`` (cast to string) as the Kafka key and the whole row serialized
    as JSON as the Kafka value.

    Returns:
        The handle returned by ``start()`` so the caller can wait on or stop
        the running query.
    """
    spark = (
        SparkSession.builder
        .master("local")
        .appName("Spark CSV Reader")
        .getOrCreate()
    )

    dirpath = os.path.abspath(sys.argv[1])
    # The relative checkpoint path below is resolved against this directory.
    os.chdir(dirpath)

    mySchema = StructType([
        StructField("id", IntegerType()),
        StructField("name", StringType()),
        StructField("year", IntegerType()),
        StructField("rating", DoubleType()),
        StructField("duration", IntegerType()),
    ])

    streamingDataFrame = (
        spark.readStream
        .schema(mySchema)
        .csv('file://' + dirpath + "/")
    )

    # NOTE(review): "localhost:9092" only resolves to a broker when every
    # process running this query is on the broker's host -- confirm for the
    # target cluster.
    return (
        streamingDataFrame
        .selectExpr("CAST(id AS STRING) AS key",
                    "to_json(struct(*)) AS value")
        .writeStream
        .format("kafka")
        .option("topic", "topicName")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("checkpointLocation", "./chkpt")
        .start()
    )
I am running HDP 2.6.

As I mentioned in the comments, Spark runs on multiple machines, and it is highly unlikely that all these machines will be Kafka brokers.
Use the external address(es) for the Kafka cluster
.option("kafka.bootstrap.servers", "<kafka-broker-1>:9092,<kafka-broker-2>:9092")\

Related

PySpark Structured Streaming Query - query visibility in dashboard

I wrote some example code which connects to a Kafka broker, reads data from a topic and sinks it to a SnappyData table.
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, Row, SparkSession
from pyspark.sql.snappy import SnappySession
from pyspark.rdd import RDD
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col, explode, split
import time
import sys
def main(snappy):
    """Copy records from a Kafka topic into a SnappyData table as a stream.

    Args:
        snappy: Object exposing ``newSession()``; this script passes a
            ``SnappySession``.

    Subscribes to the ``test_import3`` topic, casts key and value to strings
    and writes them through the ``snappysink`` format into table
    ``devices2``. Blocks until the streaming query terminates.
    """
    # Imported locally: the script's import header never brings in `logging`,
    # so the original first statement failed with a NameError.
    import logging

    logger = logging.getLogger('py4j')
    logger.info("My test info statement")

    sns = snappy.newSession()
    df = (
        sns.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "10.0.0.4:9092")
        .option("subscribe", "test_import3")
        .option("failOnDataLoss", "false")
        .option("startingOffsets", "latest")
        .load()
    )

    bdf = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

    streamingQuery = (
        bdf.writeStream
        .format("snappysink")
        .queryName("Devices3")
        .trigger(processingTime="30 seconds")
        .option("tablename", "devices2")
        .option("checkpointLocation", "/tmp")
        .start()
    )
    streamingQuery.awaitTermination()
if __name__ == "__main__":
    from pyspark import SparkContext, SparkConf
    from pyspark.sql.snappy import SnappySession

    # Despite its name, `sc` holds a SparkSession (built via
    # SparkSession.builder), which is then wrapped in a SnappySession.
    sc = (
        SparkSession.builder
        .master("local[*]")
        .appName("test")
        .config("snappydata.connection", "10.0.0.4:1527")
        .getOrCreate()
    )
    snc = SnappySession(sc)
    main(snc)
I'm submitting it with the command:
/opt/snappydata/bin/spark-submit --master spark://10.0.0.4:1527 /path_to/file.py --conf snappydata.connection=10.0.0.4:1527
Everything works: data is read from the Kafka topic and written to the SnappyData table.
I don't understand why I don't see this streaming query in the SnappyData dashboard UI. After submitting the PySpark code, I saw in the console that a new Spark Master UI had started.
How can I connect to SnappyData's internal Spark Master from PySpark? Is that possible?
SnappyData supports Python jobs to be submitted only in Smart Connector mode, which means it'll always be launched via a separate Spark Cluster to talk to SnappyData cluster. Hence, you see that your Python job is seen on this Spark cluster's UI and not on SnappyData's dashboard.

Spark: 'writeStream' can be called only on streaming Dataset/DataFrame

I'm trying to retrieve tweets from my Kafka cluster to Spark Streaming in which I perform some analysis to store them in an ElasticSearch Index.
Versions :
Spark - 2.3.0
Pyspark - 2.3.0
Kafka - 2.3.0
Elastic Search - 7.9
Elastic Search Hadoop - 7.6.2
I run the following code in my Jupyter env to write the streaming dataframe into Elastic Search .
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.0,org.elasticsearch:elasticsearch-hadoop:7.6.2 pyspark-shell'
from pyspark import SparkContext
# Spark Streaming
from pyspark.streaming import StreamingContext
# Kafka
from pyspark.streaming.kafka import KafkaUtils
# json parsing
import json
import nltk
import logging
from datetime import datetime
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def getSqlContextInstance(sparkContext):
    """Return the module-wide SQLContext, creating it on first use.

    Args:
        sparkContext: Context handed to ``SQLContext`` when the singleton
            has to be created; ignored on later calls.

    Returns:
        The object stored under ``sqlContextSingletonInstance`` in this
        module's globals.
    """
    module_globals = globals()
    if 'sqlContextSingletonInstance' not in module_globals:
        module_globals['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return module_globals['sqlContextSingletonInstance']
def analyze_sentiment(tweet):
    """Score one piece of text and return the scores as a JSON string.

    Args:
        tweet: Text passed to ``SentimentIntensityAnalyzer.polarity_scores``.

    Returns:
        JSON object with the keys ``pos``, ``neu``, ``neg`` and ``compound``
        (in that order). A key missing from the analyzer output is reported
        as 0; keys other than these four are ignored instead of raising
        ``KeyError``.
    """
    # A new analyzer is built on every call, as in the original code.
    sentiment_analyzer = SentimentIntensityAnalyzer()
    score = sentiment_analyzer.polarity_scores(tweet)
    scores = {k: score.get(k, 0) for k in ('pos', 'neu', 'neg', 'compound')}
    return json.dumps(scores)
def process(time,rdd):
    """Handle one batch RDD: filter, add a sentiment column, write it out.

    Args:
        time: Batch identifier; only printed in the banner line.
        rdd: RDD whose records are read as JSON into a DataFrame.

    Every exception raised inside (including the deliberate 'Empty' ones)
    is caught at the bottom and only printed.
    """
    print("========= %s =========" % str(time))
    try:
        # Exceptions double as control flow: an empty batch jumps straight
        # to the handler below.
        if rdd.count()==0:
            raise Exception('Empty')
        sqlContext = getSqlContextInstance(rdd.context)
        df = sqlContext.read.json(rdd)
        df = df.filter("text not like 'RT #%'")
        if df.count() == 0:
            raise Exception('Empty')
        # Wrap analyze_sentiment as a string-returning UDF and apply it to
        # the `text` column.
        udf_func = udf(lambda x: analyze_sentiment(x),returnType=StringType())
        df = df.withColumn("Sentiment",lit(udf_func(df.text)))
        print(df.take(10))
        # NOTE(review): `df` was produced by `sqlContext.read` (not
        # `readStream`); this `writeStream` call is the one the error quoted
        # after this snippet refers to.
        df.writeStream.outputMode('append').format('org.elasticsearch.spark.sql').option('es.nodes','localhost').option('es.port',9200)\
            .option('checkpointLocation','/checkpoint').option('es.spark.sql.streaming.sink.log.enabled',False).start('PythonSparkStreamingKafka_RM_01').awaitTermination()
    except Exception as e:
        # Swallows everything, so real failures look the same as 'Empty'.
        print(e)
        pass
# Driver: create the SparkContext and a StreamingContext on top of it
# (second argument 20 is the batch duration passed to StreamingContext).
sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("INFO")
ssc = StreamingContext(sc, 20)
# Direct Kafka stream on topic 'kafkaspark' with the consumer settings below.
kafkaStream = KafkaUtils.createDirectStream(ssc, ['kafkaspark'], {
    'bootstrap.servers':'localhost:9092',
    'group.id':'spark-streaming',
    'fetch.message.max.bytes':'15728640',
    'auto.offset.reset':'largest'})
# Element [1] of each record is parsed as JSON; element [0] is discarded.
parsed = kafkaStream.map(lambda v: json.loads(v[1]))
# Run process(time, rdd) on every batch.
parsed.foreachRDD(process)
ssc.start()
# Bounded wait: returns after the timeout even if the stream is still running.
ssc.awaitTermination(timeout=180)
But I get the error :
'writeStream' can be called only on streaming Dataset/DataFrame;
And it looks like I have to use .readStream, but how do I use it to read from Kafka without createDirectStream?
Could someone please help me with writing this dataframe into Elasticsearch? I am a beginner with Spark Streaming and Elasticsearch and find it quite challenging. I would be happy if someone could guide me through getting this done.
.writeStream is a part of the Spark Structured Streaming API, so you need to use corresponding API to start reading the data - the spark.readStream, and pass options specific for the Kafka source that are described in the separate document, and also use the additional jar that contains the Kafka implementation. The corresponding code would look like that (full code is here):
val streamingInputDF = spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers", "192.168.0.10:9092")
.option("subscribe", "tweets-txt")
.load()

Module not found error when importing Pyspark Delta Lake module

I'm running Pyspark with delta lake but when I try to import the delta module I get a ModuleNotFoundError: No module named 'delta'. This is on a machine without an internet connection so I had to download the delta-core jar manually from Maven and place it into the %SPARK_HOME%/jars folder.
My program works without any issues and I'm able to write and read from delta lake so I'm happy I've got the correct jar. But when I try and import the delta module from delta.tables import * I get the error.
For information my code is:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import TimestampType, FloatType, StructType, StructField
from pyspark.sql.functions import input_file_name
from Constants import Constants
if __name__ == "__main__":
    constants = Constants()

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("Delta Lake Testing")
        .getOrCreate()
    )

    # The delta import is placed after session creation on purpose, see
    # https://docs.delta.io/latest/quick-start.html#python
    from delta.tables import *

    # Keep console output down to errors and pin the session time zone.
    spark.sparkContext.setLogLevel("ERROR")
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    # Ship Constants.py (located next to this script) to the workers.
    base_path = os.path.abspath(os.path.dirname(__file__))
    constants_file = os.path.join(base_path, 'Constants.py')
    spark.sparkContext.addPyFile(constants_file)

    # Input schema: one timestamp followed by three float parameters, all
    # declared non-nullable.
    schema = StructType(
        [StructField("Timestamp", TimestampType(), False)]
        + [
            StructField(field_name, FloatType(), False)
            for field_name in ("ParamOne", "ParamTwo", "ParamThree")
        ]
    )

    # Source: streaming CSV read, tagged with the originating file name.
    df = (
        spark.readStream
        .option("header", "true")
        .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
        .schema(schema)
        .csv(constants.input_path)
        .withColumn("input_file_name", input_file_name())
    )

    # Sink: append to the Delta table at /tmp/bronze.
    (
        df.writeStream
        .format("delta")
        .outputMode("append")
        .option("checkpointLocation", constants.checkpoint_location)
        .start("/tmp/bronze")
    )

    # Block until any active streaming query terminates.
    sqm = spark.streams
    sqm.awaitAnyTermination()
This is using Spark v2.4.4 and Python v3.6.1 and the job is submitted using spark-submit path/to/job.py
%pyspark
sc.addPyFile("**LOCATION_OF_DELTA_LAKE_JAR_FILE**")
from delta.tables import *

Failed to find leader for topics; java.lang.NullPointerException NullPointerException at org.apache.kafka.common.utils.Utils.formatAddress

When we try to stream data from an SSL-enabled Kafka topic, we get the error below. Can you please help us with this issue?
19/11/07 13:26:54 INFO ConsumerFetcherManager: [ConsumerFetcherManager-1573151189884] Added fetcher for partitions ArrayBuffer()
19/11/07 13:26:54 WARN ConsumerFetcherManager$LeaderFinderThread: [spark-streaming-consumer_dvtcbddc101.corp.cox.com-1573151189725-d40a510f-leader-finder-thread], Failed to find leader for Set([inst_monitor_status_test,2], [inst_monitor_status_test,0], [inst_monitor_status_test,1])
java.lang.NullPointerException
at org.apache.kafka.common.utils.Utils.formatAddress(Utils.java:408)
at kafka.cluster.Broker.connectionString(Broker.scala:62)
at kafka.client.ClientUtils$$anonfun$fetchTopicMetadata$5.apply(ClientUtils.scala:89)
at kafka.client.ClientUtils$$anonfun$fetchTopicMetadata$5.apply(ClientUtils.scala:89)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at kafka.client.ClientUtils$.fetchTopicMetadata(ClientUtils.scala:89)
at kafka.consumer.ConsumerFetcherManager$LeaderFinderThread.doWork(ConsumerFetcherManager.scala:66)
at kafka.utils.ShutdownableThread.run(ShutdownableThread.scala:60)
Pyspark code :
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark import SparkConf, SparkContext
from operator import add
import sys
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
from kafka import SimpleProducer, KafkaClient
from kafka import KafkaProducer
def handler(message):
    """Print every record of one batch.

    Args:
        message: Object with a ``collect()`` method (the RDD handed over by
            ``foreachRDD`` in this script); all of its records are fetched
            and printed one per line.
    """
    for item in message.collect():
        print(item)
if __name__ == "__main__":
    # Expect exactly two arguments: the ZooKeeper quorum and the topic.
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist outside interactive sessions.
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 10)

    zkQuorum, topic = sys.argv[1:]
    # Receiver-based stream: consumer group "spark-streaming-consumer",
    # with the value 1 mapped to the topic in the topics dict.
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})

    # Word count over element [1] of each record.
    lines = kvs.map(lambda x: x[1])
    counts = (
        lines.flatMap(lambda line: line.split(" "))
        .map(lambda word: (word, 1))
        .reduceByKey(add)
    )
    counts.pprint()

    # Additionally dump every raw record of each batch.
    kvs.foreachRDD(handler)

    ssc.start()
    ssc.awaitTermination()
Spark submit command :
Spark submit:
/usr/hdp/2.6.1.0-129/spark2/bin/spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0,org.apache.spark:spark-sql-kafka-0-10_2.11:2.1.0,org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0 dsstream2.py host:2181 inst_monitor_status_test
Thanks for your input. I passed the SSL parameters in the following way and it is working as expected.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext
import time
# Spark session (entry point for Structured Streaming). No StreamingContext
# is needed: the original one was never started and was only used to block.
spark = SparkSession.builder.appName('PythonStreamingDirectKafkaWordCount').getOrCreate()
sc = spark.sparkContext
# Kafka Topic Details :
KAFKA_TOPIC_NAME_CONS = "topic_name"
KAFKA_OUTPUT_TOPIC_NAME_CONS = "topic_to_hdfs"
KAFKA_BOOTSTRAP_SERVERS_CONS = 'kafka_server:9093'
# Creating readstream DataFrame :
# Options prefixed with "kafka." carry the security settings for the
# SASL_SSL connection.
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS_CONS) \
    .option("subscribe", KAFKA_TOPIC_NAME_CONS) \
    .option("startingOffsets", "earliest") \
    .option("kafka.security.protocol","SASL_SSL")\
    .option("kafka.client.id" ,"Clinet_id")\
    .option("kafka.sasl.kerberos.service.name","kafka")\
    .option("kafka.ssl.truststore.location", "/home/path/kafka_trust.jks") \
    .option("kafka.ssl.truststore.password", "password_rd") \
    .option("kafka.sasl.kerberos.keytab","/home/path.keytab") \
    .option("kafka.sasl.kerberos.principal","path") \
    .load()
df1 = df.selectExpr( "CAST(value AS STRING)")
# Creating Writestream DataFrame :
query = df1.writeStream \
    .option("path","target_directory") \
    .format("csv") \
    .option("checkpointLocation","chkpint_directory") \
    .outputMode("append") \
    .start()
# Wait on the streaming query itself so that a failure of the query ends the
# application instead of leaving it blocked forever.
query.awaitTermination()

why spark python udf execution time 10x difference on different partition strategy?

I got huge (over 10x~100x) execution time difference between 2 jobs with only difference on partition strategy, wanting to know why :)
Observation:
1. repartition by partition number (with an equalized record count per partition) runs 10~100x slower than 2.
2. repartition by column: phone_country_code
From the Spark history, the only difference is that 1. has a slightly larger (10~20%) shuffle read size.
My environment:
Spark 1.6.1 on EMR 4.7
Python 2.7
submit job using pyspark
Spark Job:
python udf to parse phone number for time zone info
read data from redshift via spark-redshift and write back
code sample:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import DateType, TimestampType, StringType
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
# Driver setup: SparkContext + SQLContext for the "extract_local_time" app.
conf = SparkConf().setAppName("extract_local_time")
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
# Register the zip archive so that the `util` import inside the UDF resolves.
sc.addPyFile("s3://xxx/xxx.zip")
def local_time(phone_number, datetime_org):
    """Convert a datetime to the local time implied by a phone number.

    Args:
        phone_number: Forwarded unchanged to
            ``phonenumber_util.convert_to_local_datetime_by_phone_number``.
        datetime_org: Datetime forwarded unchanged to the same helper.

    Returns:
        The helper's result with its ``tzinfo`` removed (naive datetime).
    """
    # Imported inside the function, as in the original, so the import runs
    # where the function is executed.
    from util import phonenumber_util

    # Named `converted` so the local no longer shadows this function's name.
    converted = phonenumber_util.convert_to_local_datetime_by_phone_number(
        phone_number,
        datetime_org)
    return converted.replace(tzinfo=None)
# Wrap local_time as a UDF returning a timestamp.
local_time_func = udf(local_time, TimestampType())
# Read the query result from Redshift through the spark-redshift source.
df = sql_context.read \
    .format("com.databricks.spark.redshift") \
    .option("url", "jdbc:redshift://xxx") \
    .option("query", "select * from xxx") \
    .option("tempdir", "s3n://xxx") \
    .load()
# The two runs being compared differ only in which of the next two lines is
# active.
# df = df.repartition(12*10) # partition strategy 1
df = df.repartition('phone_country_code') # partition strategy 2
# Add the local-time column computed by the Python UDF.
df2 = df.withColumn("datetime_local", local_time_func(col("phone_number"), col("datetime")))
# Register as a temp table, select everything back and overwrite the target
# Redshift table.
df2.registerTempTable("xxx")
sql_context.sql("SELECT * FROM xxx") \
    .write.format("com.databricks.spark.redshift") \
    .option("url", "jdbc:redshift://xxx") \
    .option("tempdir", "s3n://xxx") \
    .option("dbtable", "xxx") \
    .mode("overwrite") \
    .save()
data sample:
phone_number, phone_country_code
55-82981399971, 55
1-7073492922, 1
90-5395889859, 90
My guess:
Is there some optimization at the JVM-Python level for UDFs that depends on the distribution of records across partitions?
Thanks for any further suggestions :)

Resources