Spark Streaming: Custom Receiver kryo registration with Google Pubsub - apache-spark

I'm using Spark 2.0.2 with Kryo serialization.
I'm attempting to implement a custom receiver for ingesting messages from Google PubSub into Spark Streaming:
class PubSubReceiver(project: String, topic: String, subscription: String)
extends Receiver[Array[Byte]](StorageLevel.MEMORY_AND_DISK_2) with Logging {
val projectFullName = ProjectName.create(project)
val topicName = TopicName.create(project, topic)
val subscriptionName = SubscriptionName.create(project, subscription)
val subscriber = Subscriber.defaultBuilder(subscriptionName, new receiver).build
def onStart() {
new Thread() {
override def run() {
subscriber.startAsync()
//ensure subscriber is running as well as spark receiver
while (subscriber.isRunning && !isStopped()) {
logger.info(s"${subscriber.getSubscriptionName} receiver running")
//sleep 10s
Thread.sleep(10000)
}
logger.info(s"${subscriber.getSubscriptionName} receiver stopping")
}
}.start()
}
def onStop(): Unit = {
// There is nothing much to do as the thread calling receive()
// is designed to stop by itself if isStopped() returns false
}
private class receiver extends MessageReceiver {
override def receiveMessage(message: PubsubMessage, consumer: AckReplyConsumer): Unit = {
store(ArrayBuffer(message.getData.toByteArray), message.getAttributesMap)
}
}
}
However, when running a Spark job that utilizes this receiver, it seems that I have to serialize the job object itself, which doesn't seem correct (the Spark context would then be serialized).
object PubSubStreamingIngestionJob extends App {
//... setup
lazy val ssc = new StreamingContext(spark.sparkContext, batchInterval)
lazy val pubsubUnionStream = ssc.receiverStream(new PubSubReceiver(projectName, topicName, subscriptionName))
pubsubUnionStream.map( messageBytes => ...business logic... )
ssc.start()
ssc.awaitTermination()
}
The following error is thrown:
java.io.IOException: com.esotericsoftware.kryo.KryoException: java.lang.IllegalArgumentException: Class is not registered: com.c2fo.atlas.jobs.streaming.gcp.PubSubStreamingIngestionJob
Note: To register this class use: kryo.register(com.mycompany.package.PubSubStreamingIngestionJob.class);
Serialization trace:
classes (sun.misc.Launcher$AppClassLoader)
contextClassLoader (java.lang.Thread)
threads (java.lang.ThreadGroup)
parent (java.lang.ThreadGroup)
group (java.util.concurrent.Executors$DefaultThreadFactory)
val$backingThreadFactory (com.google.common.util.concurrent.ThreadFactoryBuilder$1)
threadFactory (java.util.concurrent.ScheduledThreadPoolExecutor)
e (java.util.concurrent.Executors$DelegatedScheduledExecutorService)
executor (com.google.cloud.pubsub.spi.v1.Subscriber)
subscriber (com.mycompany.package.PubSubReceiver)
array (scala.collection.mutable.WrappedArray$ofRef)
Is there a better way of implementing this?

The issue was that the Subscriber instance needed to be created inside the receiver thread (rather than held as a field of the receiver), to prevent the entire closure from being serialized.
package org.apache.spark.streaming.gcp
import com.c2fo.atlas.util.LazyLogging
import com.google.cloud.pubsub.spi.v1._
import com.google.iam.v1.ProjectName
import com.google.pubsub.v1._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import scala.collection.mutable.ArrayBuffer
class PubSubReceiver(project: String, topic: String, subscription: String)
extends Receiver[PubsubMessage](StorageLevel.MEMORY_AND_DISK_2) with LazyLogging{
val projectFullName = ProjectName.create(project)
val topicName = TopicName.create(project, topic)
val subscriptionName = SubscriptionName.create(project, subscription)
def onStart() {
new Thread() {
// crucial change below: the Subscriber is created inside the thread, not as a field of the receiver
val subscriber = Subscriber.defaultBuilder(subscriptionName, new receiver).build
override def run() {
subscriber.startAsync()
//ensure subscriber is running as well as spark receiver
while (subscriber.isRunning && !isStopped()) {
logger.info(s"${subscriber.getSubscriptionName} receiver running")
//sleep 10s
Thread.sleep(10000)
}
logger.info(s"${subscriber.getSubscriptionName} receiver stopping")
}
}.start()
}
def onStop(): Unit = {
// There is nothing much to do as the thread calling receive()
// is designed to stop by itself if isStopped() returns false
}
class receiver extends MessageReceiver {
override def receiveMessage(message: PubsubMessage, consumer: AckReplyConsumer): Unit = {
store(ArrayBuffer(message), message.getAttributesMap)
}
}
}
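For completeness, a minimal driver-side sketch (the project/topic/subscription names and batch interval below are placeholders) showing how the fixed receiver is wired into a streaming job, mirroring the job in the question:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.gcp.PubSubReceiver
import org.apache.spark.streaming.{Seconds, StreamingContext}

object PubSubStreamingIngestionJob {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("pubsub-ingestion")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val ssc = new StreamingContext(conf, Seconds(30))

    // Only the receiver object is serialized to the executor; the Subscriber is
    // built inside the receiver thread, so the driver object is no longer dragged in.
    val pubsubStream = ssc.receiverStream(
      new PubSubReceiver("my-project", "my-topic", "my-subscription"))

    // the stream elements are PubsubMessage instances
    pubsubStream.map(message => message.getData.toByteArray).print()

    ssc.start()
    ssc.awaitTermination()
  }
}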

Related

How to start a thread that is defined in a private inner class?

I found this example of how to read from a Bluetooth socket using a separate read thread on the Android developer website. The read thread is defined in the private inner class "ConnectedThread".
class MyBluetoothService(
// handler that gets info from Bluetooth service
private val handler: Handler) {
private inner class ConnectedThread(private val mmSocket: BluetoothSocket) : Thread() {
private val mmInStream: InputStream = mmSocket.inputStream
private val mmOutStream: OutputStream = mmSocket.outputStream
private val mmBuffer: ByteArray = ByteArray(1024) // mmBuffer store for the stream
override fun run() {
var numBytes: Int // bytes returned from read()
// Keep listening to the InputStream until an exception occurs.
while (true) {
// Read from the InputStream.
numBytes = try {
mmInStream.read(mmBuffer)
} catch (e: IOException) {
Log.d(TAG, "Input stream was disconnected", e)
break
}
// Send the obtained bytes to the UI activity.
val readMsg = handler.obtainMessage(
MESSAGE_READ, numBytes, -1,
mmBuffer)
readMsg.sendToTarget()
}
}
//Other functions like write, cancel that I omitted from this example
}
So I added a function in MyBluetoothService to start the read thread:
@JvmStatic
fun read(){
val reader = ConnectedThread(myBluetoothSocket)
reader.start()
}
But this gives an immediate error:
Constructor of inner class ConnectedThread can be called only with
receiver of containing class
How should I start the thread from the example code?
Your ConnectedThread is an inner class of MyBluetoothService so it can't be instantiated outside an instance of MyBluetoothService.
Change it like this (remove private inner):
class ConnectedThread(private val mmSocket: BluetoothSocket) : Thread() {
You'll have to get access to the service some other way, or alternatively create a factory method in your service that instantiates the thread and returns it.

How to enable Kafka Producer Metrics in Spark?

We are using Kafka 0.10 with Spark 2.1, and I found that our producer publishes messages very slowly. I can only reach around 1k messages/sec even after giving 8 cores to the Spark executors, while other posts say they can easily reach millions per second.
I tried tuning linger.ms and batch.size to find out why. However, linger.ms=0 seems optimal for me, and batch.size doesn't have much effect. I am sending 160k events per iteration. It looks like I have to enable the Kafka producer metrics to know what exactly is happening, but that does not seem very easy to do in a Spark executor.
Could anyone shed some light on this?
My codes are like this:
private def publishMessagesAttempt(producer: KafkaProducer[String, String], topic: String, messages: Iterable[(String, String)], producerMaxDelay: Long,
individualMessageMaxDelay: Long, logger: (String, Boolean) => Unit = KafkaClusterUtils.DEFAULT_LOGGER): Iterable[(String, String)] = {
val futureMessages = messages.map(message => (message, producer.send(new ProducerRecord[String, String](topic, message._1, message._2))))
val messageSentTime = System.currentTimeMillis
val awaitedResults = futureMessages.map { case (message, future) =>
val waitFor = Math.max(producerMaxDelay - (System.currentTimeMillis - messageSentTime), individualMessageMaxDelay)
val failed = Try(future.get(waitFor, TimeUnit.MILLISECONDS)) match {
case Success(_) => false
case Failure(f) =>
logger(s"Error happened when publish to Kafka: ${f.getStackTraceString}", true)
true
}
(message, failed)
}
awaitedResults.filter(_._2).map(_._1)
}
I finally found the answer.
1. KafkaProducer has a metrics() method that returns the producer's metrics; simply printing them periodically is enough. Code like this should work:
public class MetricsProducerReporter implements Runnable {
private final Producer<String, StockPrice> producer;
private final Logger logger =
LoggerFactory.getLogger(MetricsProducerReporter.class);
//Used to Filter just the metrics we want
private final Set<String> metricsNameFilter = Sets.set(
"record-queue-time-avg", "record-send-rate", "records-per-request-avg",
"request-size-max", "network-io-rate", "record-queue-time-avg",
"incoming-byte-rate", "batch-size-avg", "response-rate", "requests-in-flight"
);
public MetricsProducerReporter(
final Producer<String, StockPrice> producer) {
this.producer = producer;
}
@Override
public void run() {
while (true) {
final Map<MetricName, ? extends Metric> metrics
= producer.metrics();
displayMetrics(metrics);
try {
Thread.sleep(3_000);
} catch (InterruptedException e) {
logger.warn("metrics interrupted");
Thread.interrupted();
break;
}
}
}
// displayMetrics is not shown in the post; a minimal version that logs
// only the metrics named in metricsNameFilter:
private void displayMetrics(Map<MetricName, ? extends Metric> metrics) {
metrics.entrySet().stream()
.filter(entry -> metricsNameFilter.contains(entry.getKey().name()))
.forEach(entry -> logger.info("{} : {}",
entry.getKey().name(), entry.getValue().value()));
}
}
2. My code was slow because Scala's map is not parallel by default; I have to use messages.par.map() to achieve parallelism.
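A minimal sketch of that change (broker address, topic name, and message payloads below are placeholders), where the only difference from a sequential send loop is the .par before map:

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object ParallelPublishSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "kafka1:9092") // placeholder broker
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)

    val messages: Seq[(String, String)] = (1 to 160000).map(i => (s"key-$i", s"value-$i"))

    // .par fans the send() calls out over a ForkJoin pool instead of issuing
    // them one at a time from a single thread
    val futures = messages.par.map { case (key, value) =>
      producer.send(new ProducerRecord[String, String]("my-topic", key, value))
    }.seq

    futures.foreach(_.get()) // wait for all acks
    producer.close()
  }
}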

Apache Spark can not read Kafka message Content

I am trying to create an Apache Spark job to consume Kafka messages submitted to a topic. I submit messages to the topic using kafka-console-producer as below.
./kafka-console-producer.sh --broker-list kafka1:9092 --topic my-own-topic
To read messages I am using the spark-streaming-kafka-0-10_2.11 library. With the library I manage to read the total count of the messages received on the topic, but I cannot read the ConsumerRecord objects in the stream: when I try to read one, the entire application blocks and nothing is printed to the console. Note that I am running Kafka, Zookeeper and Spark in Docker containers. Help would be greatly appreciated.
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;
public class SparkKafkaStreamingJDBCExample {
public static void main(String[] args) {
// Start a spark instance and get a context
SparkConf conf =
new SparkConf().setAppName("Study Spark").setMaster("spark://spark-master:7077");
// Setup a streaming context.
JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(3));
// Create a map of Kafka params
Map<String, Object> kafkaParams = new HashMap<String, Object>();
// List of Kafka brokers to listen to.
kafkaParams.put("bootstrap.servers", "kafka1:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
// Do you want to start from the earliest record or the latest?
kafkaParams.put("auto.offset.reset", "earliest");
kafkaParams.put("enable.auto.commit", true);
// List of topics to listen to.
Collection<String> topics = Arrays.asList("my-own-topic");
// Create a Spark DStream with the kafka topics.
final JavaInputDStream<ConsumerRecord<String, String>> stream =
KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent(),
ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
System.out.println("Study Spark Example Starting ....");
stream.foreachRDD(rdd -> {
if (rdd.isEmpty()) {
System.out.println("RDD Empty " + rdd.count());
return;
} else {
System.out.println("RDD not empty " + rdd.count());
OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
System.out.println("Partition Id " + TaskContext.getPartitionId());
OffsetRange o = offsetRanges[TaskContext.getPartitionId()];
System.out.println("Topic " + o.topic());
System.out.println("Creating RDD !!!");
JavaRDD<ConsumerRecord<String, String>> r =
KafkaUtils.createRDD(streamingContext.sparkContext(), kafkaParams, offsetRanges,
LocationStrategies.PreferConsistent());
System.out.println("Count " + r.count());
//Application stuck from here onwards ...
ConsumerRecord<String, String> first = r.first();
System.out.println("First taken");
System.out.println("First value " + first.value());
}
});
System.out.println("Stream context starting ...");
// Start streaming.
streamingContext.start();
System.out.println("Stream context started ...");
try {
System.out.println("Stream context await termination ...");
streamingContext.awaitTermination();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
Sample output is given below.
Study Spark Example Starting ....
Stream context starting ...
Stream context started ...
Stream context await termination ...
RDD Empty 0
RDD Empty 0
RDD Empty 0
RDD Empty 0
RDD not empty 3
Partition Id 0
Topic my-own-topic
Creating RDD !!!

Spark Streaming: how not to restart receiver after receiver's failure

We are using a custom spark receiver that reads streamed data from a provided http link. If the provided http link is incorrect, the receiver fails. The problem is that spark will continuously restart the receiver, and the application will never terminate. The question is how to tell Spark to terminate the application if the receiver fails.
This is an extract of our custom receiver:
def onStart() {
// Start the thread that receives data over a connection
new Thread("Receiver") {
override def run() { receive() }
}.start()
}
private def receive(): Unit = {
....
val response: CloseableHttpResponse = httpclient.execute(req)
try {
val sl = response.getStatusLine()
if (sl.getStatusCode != 200){
val errorMsg = "Error: " + sl.getStatusCode
val thrw = new RuntimeException(errorMsg)
stop(errorMsg, thrw)
} else {
...
store(doc)
}
We have a spark streaming application that uses this receiver:
val ssc = new StreamingContext(sparkConf, duration)
val changes = ssc.receiverStream(new CustomReceiver(...
...
ssc.start()
ssc.awaitTermination()
Everything works as expected if the receiver doesn't have errors. If the receiver fails (e.g. with a wrong http link), spark will continuously restart it and the application will never terminate.
16/05/31 17:03:38 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
16/05/31 17:03:38 ERROR ReceiverTracker: Receiver has been stopped. Try to restart it.
We just want to terminate the whole application if a receiver fails.
There is a way to control the life cycle of custom-receiver-based Spark Streaming applications: define a job progress listener for your application and keep track of what is happening.
class CustomReceiverListener extends StreamingJobProgressListener {
private boolean receiverStopped = false;
public CustomReceiverListener(StreamingContext ssc) { super(ssc);}
public boolean isReceiverStopped() {
return receiverStopped;
}
@Override
public void onReceiverStopped(StreamingListenerReceiverStopped receiverStopped) {
LOG.info("Update the flag field");
this.receiverStopped = true;
}
}
And in your driver, initialize a thread to monitor the state of the receiverStopped flag. The driver will stop the streaming application when this thread finishes. (A better approach is to define a callback method in the driver that stops the streaming application.)
CustomReceiverListener listener = new CustomReceiverListener(ssc);
ssc.addStreamingListener(listener);
ssc.start();
Thread thread = new Thread(() -> {
while (!listener.isReceiverStopped()) {
LOG.info("Sleepy head...");
try {
Thread.sleep(2 * 1000); /*check after 2 seconds*/
} catch (InterruptedException e) {
e.printStackTrace();
}
}
});
thread.start();
thread.join();
LOG.info("Listener asked to die! Going to commit suicide :(");
ssc.stop(true, false);
Note: In case of multiple instances of your receivers, change the implementation of CustomReceiverListener to make sure all the receiver instances are stopped.
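For the multiple-receiver case, a minimal Scala sketch (using the public StreamingListener trait and an explicitly supplied receiver count, both assumptions on my part) that counts stop events instead of flipping a single flag:

import java.util.concurrent.atomic.AtomicInteger
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerReceiverStopped}

class MultiReceiverStopListener(expectedReceivers: Int) extends StreamingListener {
  private val stopped = new AtomicInteger(0)

  // polled by the driver's monitoring thread in place of isReceiverStopped()
  def allReceiversStopped: Boolean = stopped.get() >= expectedReceivers

  override def onReceiverStopped(event: StreamingListenerReceiverStopped): Unit = {
    stopped.incrementAndGet()
  }
}

Register it with ssc.addStreamingListener(new MultiReceiverStopListener(numReceivers)) and keep the same monitoring loop as above, checking allReceiversStopped instead.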
It seems that the scheduling in Spark Streaming works in such a way that ReceiverTracker will keep restarting a failed receiver until the ReceiverTracker itself is stopped.
https://github.com/apache/spark/blob/master/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala#L618
To stop the ReceiverTracker, we need to stop the whole application. Thus, it seems there is no way to control this process from the receiver itself.

Websphere MQ as a data source for Apache Spark Streaming

I was digging into the possibility of using WebSphere MQ as a data source for Spark Streaming, because it is needed in one of our use cases.
I got to know that MQTT is the protocol that supports communication with MQ data structures, but since I am a newbie to Spark Streaming I need some working examples.
Has anyone tried to connect MQ with Spark Streaming? Please advise on the best way of doing so.
So, I am posting here the working code for CustomMQReciever, which connects to WebSphere MQ and reads data:
public class CustomMQReciever extends Receiver<String> {
String host = null;
int port = -1;
String qm=null;
String qn=null;
String channel=null;
transient Gson gson=new Gson();
transient MQQueueConnection qCon= null;
Enumeration enumeration =null;
public CustomMQReciever(String host , int port, String qm, String channel, String qn) {
super(StorageLevel.MEMORY_ONLY_2());
this.host = host;
this.port = port;
this.qm=qm;
this.qn=qn;
this.channel=channel;
}
public void onStart() {
// Start the thread that receives data over a connection
new Thread() {
@Override
public void run() {
try {
initConnection();
receive();
}
catch (JMSException ex)
{
ex.printStackTrace();
}
}
}.start();
}
public void onStop() {
// There is nothing much to do as the thread calling receive()
// is designed to stop by itself if isStopped() returns false
}
/** Create a MQ connection and receive data until receiver is stopped */
private void receive() {
System.out.print("Started receiving messages from MQ");
try {
JMSMessage receivedMessage= null;
while (!isStopped() && enumeration.hasMoreElements() )
{
receivedMessage= (JMSMessage) enumeration.nextElement();
String userInput = convertStreamToString(receivedMessage);
//System.out.println("Received data :'" + userInput + "'");
store(userInput);
}
// Restart in an attempt to connect again when server is active again
//restart("Trying to connect again");
stop("No More Messages To read !");
qCon.close();
System.out.println("Queue Connection is Closed");
}
catch(Exception e)
{
e.printStackTrace();
restart("Trying to connect again");
}
catch(Throwable t) {
// restart if there is any other error
restart("Error receiving data", t);
}
}
public void initConnection() throws JMSException
{
MQQueueConnectionFactory conFactory= new MQQueueConnectionFactory();
conFactory.setHostName(host);
conFactory.setPort(port);
conFactory.setTransportType(JMSC.MQJMS_TP_CLIENT_MQ_TCPIP);
conFactory.setQueueManager(qm);
conFactory.setChannel(channel);
qCon= (MQQueueConnection) conFactory.createQueueConnection();
MQQueueSession qSession=(MQQueueSession) qCon.createQueueSession(false, 1);
MQQueue queue=(MQQueue) qSession.createQueue(qn);
MQQueueBrowser browser = (MQQueueBrowser) qSession.createBrowser(queue);
qCon.start();
enumeration= browser.getEnumeration();
}
@Override
public StorageLevel storageLevel() {
return StorageLevel.MEMORY_ONLY_2();
}
}
I believe you can use JMS to connect to WebSphere MQ, and Apache Camel can be used to connect to WebSphere MQ. You can create a custom Receiver like so (note that this pattern could also be used without JMS):
class JMSReceiver(topicName: String, cf: String, jndiProviderURL: String)
extends Receiver[String](StorageLevel.MEMORY_AND_DISK_SER) with Serializable {
//Transient as this will get passed to the Workers from the Driver
@transient
var camelContextOption: Option[DefaultCamelContext] = None
def onStart() = {
camelContextOption = Some(new DefaultCamelContext())
val camelContext = camelContextOption.get
val env = new Properties()
env.setProperty("java.naming.factory.initial", "???")
env.setProperty("java.naming.provider.url", jndiProviderURL)
env.setProperty("com.webmethods.jms.clientIDSharing", "true")
val namingContext = new InitialContext(env); //using the properties file to create context
//Lookup Connection Factory
val connectionFactory = namingContext.lookup(cf).asInstanceOf[javax.jms.ConnectionFactory]
camelContext.addComponent("jms", JmsComponent.jmsComponentAutoAcknowledge(connectionFactory))
val builder = new RouteBuilder() {
def configure() = {
from(s"jms://topic:$topicName?jmsMessageType=Object&clientId=$clientId&durableSubscriptionName=${topicName}_SparkDurable&maxConcurrentConsumers=10")
.process(new Processor() {
def process(exchange: Exchange) = {
exchange.getIn.getBody match {
case s: String => store(s)
}
}
})
}
}
camelContext.addRoutes(builder)
camelContext.start()
}
def onStop() = if(camelContextOption.isDefined) camelContextOption.get.stop()
}
You can then create a DStream of your events like so:
val myDStream = ssc.receiverStream(new JMSReceiver("MyTopic", "MyContextFactory", "MyJNDI"))
