I have a Scala/Spark/Kafka process that runs continuously. When the process first starts I create a single KuduClient object, using a helper function that I share between classes, and reuse it for the life of the job. I've noticed that after several days of running I frequently get exceptions.
I'm not really sure what to do about it. One option might be to create a new Kudu client every day or so, but I'm unsure how to do that cleanly in this case.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.json.JSONObject
import org.apache.kudu.client.KuduClient
import org.apache.log4j.Logger
object Thing extends Serializable {
@transient lazy val client: KuduClient = createKuduClient(config)
@transient lazy val logger: Logger = Logger.getLogger(getClass.getName)
def main(args: Array[String]) {
UtilFunctions.loadConfig(args) // returns a config object
UtilFunctions.loadLogger() // factory method that loads the logger
val props: Map[String, String] = setKafkaProperties()
val topic = Set(config.getString("config.TOPIC_NAME"))
val conf = new SparkConf().setMaster("local[2]").setAppName(config.getString("config.SPARK_APP_NAME"))
val ssc = new StreamingContext(conf, Seconds(10))
ssc.sparkContext.setLogLevel("ERROR")
ssc.checkpoint(config.getString("config.SPARK_CHECKPOINT_NAME"))
// val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, props, topic)
val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topic, props))
val distRecordsStream = kafkaStream.map(record => (record.key(), record.value()))
distRecordsStream.window(Seconds(10), Seconds(10)) // note: the returned windowed stream is never assigned or used
distRecordsStream.foreachRDD(distRecords => {
logger.info(distRecords + " : " + distRecords.count())
distRecords.foreach(record => {
logger.info(record._2)
MyClass.DoSomethingWithThisData(new JSONObject(record._2), client)
})
})
ssc.start()
ssc.awaitTermination()
}
def createKuduClient(config: Config): KuduClient = {
var client: KuduClient = null
try{
client = new KuduClient.KuduClientBuilder(config.getString("config.KUDU_MASTER"))
.defaultAdminOperationTimeoutMs(config.getInt("config.KUDU_ADMIN_TIMEOUT_S") * 1000)
.defaultOperationTimeoutMs(config.getInt("config.KUDU_OPERATION_TIMEOUT_S") * 1000)
.build()
}
catch {
case e: Throwable =>
logger.error(e.getMessage)
logger.error(e.getStackTrace.mkString("\n"))
Thread.sleep(10000) // back off before retrying
client = createKuduClient(config) // retry recursively until a client is created
}
client //return
}
def setKafkaProperties(): Map[String, String] = {
val zookeeper = config.getString("config.ZOOKEEPER")
val offsetReset = config.getString("config.OFFSET_RESET")
val brokers = config.getString("config.BROKERS")
val groupID = config.getString("config.GROUP_ID")
val deserializer = config.getString("config.DESERIALIZER")
val autoCommit = config.getString("config.AUTO_COMMIT")
val maxPollRecords = config.getString("config.MAX_POLL_RECORDS")
val maxPollIntervalms = config.getString("config.MAX_POLL_INTERVAL_MS")
val props = Map(
"bootstrap.servers" -> brokers,
"zookeeper.connect" -> zookeeper,
"group.id" -> groupID,
"key.deserializer" -> deserializer,
"value.deserializer" -> deserializer,
"enable.auto.commit" -> autoCommit,
"auto.offset.reset" -> offsetReset,
"max.poll.records" -> maxPollRecords,
"max.poll.interval.ms" -> maxPollIntervalms)
props
}
}
The exceptions are below. I've replaced the IP addresses with "x".
ERROR client.TabletClient: [Peer
master-ip-xxx-xx-xxx-40.ec2.internal:7051] Unexpected exception from
downstream on [id: 0x42ba3f4d, /xxx.xx.xxx.39:36820 =>
ip-xxx-xxx-xxx-40.ec2.internal/xxx.xx.xxx.40:7051]
java.lang.RuntimeException: Could not deserialize the response,
incompatible RPC? Error is: step
at org.apache.kudu.client.KuduRpc.readProtobuf(KuduRpc.java:383)
at org.apache.kudu.client.Negotiator.parseSaslMsgResponse(Negotiator.java:282)
at org.apache.kudu.client.Negotiator.handleResponse(Negotiator.java:235)
at org.apache.kudu.client.Negotiator.messageReceived(Negotiator.java:229)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.timeout.ReadTimeoutHandler.messageReceived(ReadTimeoutHandler.java:184)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.oneone.OneToOneDecoder.handleUpstream(OneToOneDecoder.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:310)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:303)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:268)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:255)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
at org.apache.kudu.client.shaded.org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
at org.apache.kudu.client.shaded.org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I've also seen exceptions like these after running for a while, which others seem to attribute to hitting the user's open file handle limit.
java.io.IOException: All datanodes
DatanodeInfoWithStorage[xxx.xx.xxx.36:1004,DS-55c403c3-203a-4dac-b383-72fcdb686185,DISK]
are bad. Aborting...
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.setupPipelineForAppendOrRecovery(DFSOutputStream.java:1465)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.processDatanodeError(DFSOutputStream.java:1236)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputSt
Does this have something to do with having too many open files? Is there a way to "purge" these files once they reach a limit?
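One option, sketched below, is the "create a new Kudu client every day or so" idea from the question: hold the KuduClient behind a small wrapper that rebuilds it once it exceeds a maximum age. This is a minimal sketch, assuming the same Config object and config key as the code above; the wrapper name and the 24-hour cutoff are illustrative, not part of the original job.
object KuduClientHolder {
  private var client: KuduClient = _
  private var createdAt: Long = 0L
  private val maxAgeMs: Long = 24L * 60 * 60 * 1000 // rebuild roughly daily; tune as needed

  def get(config: Config): KuduClient = synchronized {
    val now = System.currentTimeMillis()
    if (client == null || now - createdAt > maxAgeMs) {
      if (client != null) client.close() // release the old client's connections
      client = new KuduClient.KuduClientBuilder(config.getString("config.KUDU_MASTER")).build()
      createdAt = now
    }
    client
  }
}
Inside foreachRDD you would then call KuduClientHolder.get(config) instead of holding one client for the life of the process.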
Related
I have hit a wall trying to get around a Task not serializable error when breaking a Spark application out into classes, while also using Try.
The code pulls the schema from S3 and does a streaming read from Kafka (the topic is in Avro format with Schema Registry).
I have tried using the class and not using the class, but in both cases I get a serialization error relating to a closure; I guess something extra is being pulled in when Spark tries to serialize. This error haunts me constantly and is a huge pain to get around. If someone could shed some light on how I can avoid this issue, that would be awesome. These Java classes seem to have more issues than they are worth sometimes.
import java.util.Properties
import com.databricks.spark.avro._
import io.confluent.kafka.schemaregistry.client.rest.RestService
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroDecoder, KafkaAvroDeserializerConfig}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.util.{Failure, Success, Try}
case class DeserializedFromKafkaRecord(value: String)
class sparkS3() extends Serializable {
def readpeopleSchemaDF(spark: SparkSession, topicSchemaLocation: String): scala.util.Try[StructType] = {
val read: scala.util.Try[StructType] = Try(
spark
.read
.option("header", "true")
.format("com.databricks.spark.avro")
.load(topicSchemaLocation)
.schema
)
read
}
def writeTopicDF(peopleDFstream: DataFrame,
peopleDFstreamCheckpoint: String,
peopleDFstreamLocation: String): scala.util.Try[StreamingQuery] = {
val write: scala.util.Try[StreamingQuery] = Try(
peopleDFstream
.writeStream
.option("checkpointLocation", peopleDFstreamCheckpoint)
.format("com.databricks.spark.avro")
.option("path", peopleDFstreamLocation)
.start()
)
write
}
}
class sparkKafka() extends Serializable {
def readpeopleTopicDF(spark: SparkSession, topicSchema: StructType): scala.util.Try[DataFrame] = {
val brokers = "URL:9092"
val schemaRegistryURL = "URL:8081"
val kafkaParams = Map[String, String](
"kafka.bootstrap.servers" -> brokers,
"key.deserializer" -> "KafkaAvroDeserializer",
"value.deserializer" -> "KafkaAvroDeserializer",
"group.id" -> "structured-kafka",
//"auto.offset.reset" -> "latest",
"failOnDataLoss" -> "false",
"schema.registry.url" -> schemaRegistryURL
)
var kafkaTopic = "people"
object avroDeserializerWrapper {
val props = new Properties()
props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryURL)
props.put(KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG, "true")
val vProps = new kafka.utils.VerifiableProperties(props)
val deser = new KafkaAvroDecoder(vProps)
val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(kafkaTopic + "-value")
val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
}
import spark.implicits._
val read: scala.util.Try[DataFrame] = Try(
{
val peopleStringDF = {
spark
.readStream
.format("kafka")
.option("subscribe", kafkaTopic)
.option("kafka.bootstrap.servers", brokers)
.options(kafkaParams)
.load()
.map(x => {
DeserializedFromKafkaRecord(avroDeserializerWrapper.deser.fromBytes(
x
.getAs[Array[Byte]]("value"), avroDeserializerWrapper.messageSchema)
.asInstanceOf[GenericData.Record].toString)
})
}
val peopleJsonDF = {
peopleStringDF
.select(
from_json(col("value")
.cast("string"), topicSchema)
.alias("people"))
}
peopleJsonDF.select("people.*")
})
read
}
}
object peopleDataLakePreprocStage1 {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
.appName("peoplePreProcConsumerStage1")
.getOrCreate()
val topicSchemaLocation = "URL"
val topicDFstreamCheckpoint = "URL"
val topicDFstreamLocation = "URL"
val sparkKafka = new sparkKafka()
val sparkS3 = new sparkS3()
sparkS3.readpeopleSchemaDF(spark, topicSchemaLocation) match {
case Success(topicSchema) => {
sparkKafka.readpeopleTopicDF(spark, topicSchema) match {
case Success(df) => {
sparkS3.writeTopicDF(df, topicDFstreamCheckpoint, topicDFstreamLocation) match {
case Success(query) => {
query.awaitTermination()
}
case Failure(f) => println(f)
}
}
case Failure(f) => println(f)
}
}
case Failure(f) => println(f)
}
}
}
Here is the error
java.lang.IllegalStateException: s3a://... when compacting batch 9 (compactInterval: 10)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:173)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:172)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.NumericRange.foreach(NumericRange.scala:73)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.compact(CompactibleFileStreamLog.scala:172)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.add(CompactibleFileStreamLog.scala:156)
at org.apache.spark.sql.execution.streaming.ManifestFileCommitProtocol.commitJob(ManifestFileCommitProtocol.scala:64)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:213)
at org.apache.spark.sql.execution.streaming.FileStreamSink.addBatch(FileStreamSink.scala:123)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
18/08/10 13:04:07 ERROR MicroBatchExecution: Query [id = 2876ded4-f223-40c4-8634-0c8feec94bf6, runId = 9b9a1347-7a80-4295-bb6e-ff2de18eeaf4] terminated with error
org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:224)
at org.apache.spark.sql.execution.streaming.FileStreamSink.addBatch(FileStreamSink.scala:123)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: java.lang.IllegalStateException: s3a://..../_spark_metadata/0 doesn't exist when compacting batch 9 (compactInterval: 10)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:173)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:172)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.NumericRange.foreach(NumericRange.scala:73)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.compact(CompactibleFileStreamLog.scala:172)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.add(CompactibleFileStreamLog.scala:156)
at org.apache.spark.sql.execution.streaming.ManifestFileCommitProtocol.commitJob(ManifestFileCommitProtocol.scala:64)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:213)
... 17 more
The resolution was one of two things (or both): extending Serializable on the class, and splitting the classes into separate files in the same namespace. I have updated the code above to reflect this.
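For reference, the usual way to keep a non-serializable member (like the schema-registry RestService or the Avro decoder) out of a Spark closure is to mark it @transient lazy, so it is rebuilt on each executor rather than shipped from the driver. A minimal sketch, reusing the RestService from the code above; the class name is illustrative:
import io.confluent.kafka.schemaregistry.client.rest.RestService

class SchemaRegistryHelper(schemaRegistryURL: String) extends Serializable {
  // @transient: not serialized with the object; lazy: re-created on first use
  // in whichever JVM (driver or executor) touches it.
  @transient lazy val restService: RestService = new RestService(schemaRegistryURL)
}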
Just a stab: in class sparkS3 you are using 'var' to define those values; did you mean 'val'?
I have streamed data from Kafka topics using Spark. This is the code I have tried; here I am just displaying the streaming data in the console, but I want to store this data as a text file in HDFS.
import _root_.kafka.serializer.DefaultDecoder
import _root_.kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.storage.StorageLevel
object StreamingDataNew {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("Kafka").setMaster("local[*]")
val ssc = new StreamingContext(sparkConf, Seconds(10))
val kafkaConf = Map(
"metadata.broker.list" -> "localhost:9092",
"zookeeper.connect" -> "localhost:2181",
"group.id" -> "kafka-streaming-example",
"zookeeper.connection.timeout.ms" -> "200000"
)
val lines = KafkaUtils.createStream[Array[Byte], String, DefaultDecoder, StringDecoder](
ssc,
kafkaConf,
Map("topic-one" -> 1), // subscripe to topic and partition 1
StorageLevel.MEMORY_ONLY
)
println("printing" + lines.toString())
val words = lines.flatMap { case (x, y) => y.split(" ") }
words.print()
ssc.start()
ssc.awaitTermination()
}
}
I found that we can write the DStream using 'saveAsTextFiles'. But can someone clearly lay out the steps for connecting to a Hortonworks cluster and storing the data in HDFS using the Scala code above?
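For what it's worth, a minimal sketch of the saveAsTextFiles approach, assuming the words DStream from the code above; the NameNode host and path are placeholders for your cluster:
// Each batch is written as a directory of part files under the prefix,
// suffixed with the batch timestamp (prefix-TIME.txt).
words.saveAsTextFiles("hdfs://<namenode-host>:8020/user/<me>/kafka-words", "txt")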
I found the answer. This code worked for me.
package com.spark.streaming
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
object MessageStreaming {
def main(args: Array[String]): Unit = {
println("Message streaming")
val conf = new org.apache.spark.SparkConf().setMaster("local[*]").setAppName("kafka-streaming")
val context = new SparkContext(conf)
val ssc = new StreamingContext(context, org.apache.spark.streaming.Seconds(10))
val kafkaParams = Map(
"bootstrap.servers" -> "kafka.kafka-cluster.com:9092",
"group.id" -> "kafka-streaming-example",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"auto.offset.reset" -> "latest",
"zookeeper.connection.timeout.ms" -> "200000"
)
val topics = Array("cdc-classic")
val stream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams))
val content = stream.filter(x => x.value() != null)
val sqlContext = new org.apache.spark.sql.SQLContext(context)
import sqlContext.implicits._
stream.map(_.value).foreachRDD(rdd => {
rdd.foreach(println)
if (!rdd.isEmpty()) {
rdd.toDF("value").coalesce(1).write.mode(SaveMode.Append).json("hdfs://dev1a/user/hg5tv0/hadoop/MessagesFromKafka")
}
})
ssc.start()
ssc.awaitTermination()
}
}
I am trying to connect Kafka to Spark. I am using kafka_2.11-0.11.0.1 and Spark 2.2.0, and I included the following jar files:
kafka_2.11-0.11.0.1
kafka-clients-0.11.0.1
spark-streaming_2.11-2.2.0
spark-streaming-kafka_2.11-2.2.0
and here is my code:
import org.apache.spark._
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka
import org.apache.spark.streaming.kafka.KafkaUtils
object KafkaExample {
def main(args: Array[String]) {
val sparkConf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]")
val ssc = new StreamingContext(sparkConf, Seconds(20))
val kafkaParams = Map("metadata.broker.list" -> "kafkaIP:9092")
val topics = Set("logstash_log")
val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc,kafkaParams,topics)
stream.print()
ssc.checkpoint("C:/checkpoint/")
ssc.start()
ssc.awaitTermination()
}
}
I got this response and couldn't find the solution anywhere:
Exception in thread "main" java.lang.NoSuchMethodError: kafka.api.TopicMetadata.errorCode()S
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1$$anonfun$4.apply(KafkaCluster.scala:127)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1$$anonfun$4.apply(KafkaCluster.scala:127)
at scala.collection.TraversableLike$$anonfun$filterImpl$1.apply(TraversableLike.scala:248)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1.apply(KafkaCluster.scala:127)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1.apply(KafkaCluster.scala:125)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:346)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:342)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at org.apache.spark.streaming.kafka.KafkaCluster.org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers(KafkaCluster.scala:342)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitionMetadata(KafkaCluster.scala:125)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitions(KafkaCluster.scala:112)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:211)
at org.apache.spark.streaming.kafka.KafkaUtils$.createDirectStream(KafkaUtils.scala:484)
at com.defne.KafkaExample$.main(KafkaExample.scala:27)
at com.defne.KafkaExample.main(KafkaExample.scala)
Why does this occur? How can I handle this? Any help will be greatly appreciated!
Thanks.
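A likely cause is mixing the old 0.8-style connector with a newer Kafka broker/client jar on the classpath. A hedged build.sbt sketch that switches to the 0-10 integration (the API the answer below uses) and lets sbt pull a compatible kafka-clients, rather than adding the Kafka jars by hand:
// build.sbt sketch (assumes Spark 2.2.0 on Scala 2.11)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"            % "2.2.0" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0"
)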
This may be helpful. You can modify it based on your dataset and IP addresses.
// Imports assumed by this snippet:
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
def StreamingFromKafkaMain(): Unit =
{
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "192.168.34.216:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "use_a_separate_group_id_for_each_stream",
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("myTopicName")
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("KafkaTest")
val streamingContext = new StreamingContext(sparkConf, Seconds(1))
// Create a input direct stream
val kafkaStream = KafkaUtils.createDirectStream[String, String](
streamingContext,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
val spark = SparkSession.builder().master("local[*]").appName("KafkaTest").getOrCreate()
val items = kafkaStream.map(record => (record.key, record.value.split("\n")))
val itemStatus = items.map(status => status.toString())
items.foreachRDD(
rddm => if (!rddm.isEmpty()) {
//val my_dataset=StreamingFromKafkaOracleMain();
println("Test")
//my_dataset.show
//val df1 = rddm.map(_.mkString(",")).map(x=> schema(x(0).toString,x(1).toInt,x(2).toString)).toDF()
val splittedRdd =rddm.map(line=>line.toString().split(","))
splittedRdd.take(10).foreach(arr => println(arr.mkString(","))) // print the fields, not array references
}
)
streamingContext.start()
streamingContext.awaitTermination()
}
Hi, I have a Spark Streaming program which reads events from Event Hub and pushes them to topics. Processing each batch takes almost 10 times the batch interval.
When I try to implement multithreading I don't see much difference in processing time.
Is there any way I can improve performance with parallel processing, for example by starting some 1000 threads at a time and just keeping the messages flowing?
class ThreadExample(msg:String) extends Thread{
override def run {
var test = new PushToTopicDriver(msg)
test.push()
// println(msg)
}
}
object HiveEventsDirectStream {
def b2s(a: Array[Byte]): String = new String(a)
def main(args: Array[String]): Unit = {
val eventhubnamespace = "namespace"
val progressdir = "/Event/DirectStream/"
val eventhubname_d = "namespacestream"
val ehParams = Map[String, String](
"eventhubs.policyname" -> "PolicyKeyName",
"eventhubs.policykey" -> "key",
"eventhubs.namespace" -> "namespace",
"eventhubs.name" -> "namespacestream",
"eventhubs.partition.count" -> "30",
"eventhubs.consumergroup" -> "$default",
"eventhubs.checkpoint.dir" -> "/EventCheckpoint_0.1",
"eventhubs.checkpoint.interval" -> "2"
)
println("testing spark")
val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[4]").setAppName("Eventhubs_Test")
conf.registerKryoClasses(Array(classOf[PublishToTopic]))
conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sc= new SparkContext(conf)
val hiveContext = new HiveContext(sc)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val pool:ExecutorService=Executors.newFixedThreadPool(30)
val ssc = new StreamingContext(sc, Seconds(2))
var dataString :RDD[String] =sc.emptyRDD
val stream=EventHubsUtils.createDirectStreams(ssc,eventhubnamespace,progressdir,Map(eventhubname_d -> ehParams))
val kv1 = stream.map(receivedRecord => (new String(receivedRecord.getBody))).persist()
kv1.foreachRDD(rdd_1 => rdd_1.foreachPartition(line => line.foreach(msg => { val t1 = new ThreadExample(msg); t1.start() })))
ssc.start()
ssc.awaitTermination()
}
}
Thanks,
Ankush Reddy.
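One hedged alternative to starting a thread per message: let Spark's own partition-level parallelism do the fan-out, and push from inside foreachPartition. A minimal sketch reusing the stream and the PushToTopicDriver class from the code above; the partition count of 30 mirrors the Event Hub partition setting and is an assumption:
val kv1 = stream.map(receivedRecord => new String(receivedRecord.getBody))
kv1.foreachRDD { rdd =>
  rdd.repartition(30).foreachPartition { msgs =>
    // Each partition is processed by one executor task; no hand-rolled threads.
    msgs.foreach(msg => new PushToTopicDriver(msg).push())
  }
}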
I followed the given process (https://azure.microsoft.com/en-in/documentation/articles/hdinsight-apache-spark-eventhub-streaming/) step by step, modifying only the Spark receiver code for my requirements. When I spark-submit the streaming consumer, it fetches the data from Event Hub as a DStream[Array[Byte]], on which I do a foreachRDD and convert it into an RDD[String]. The issue I am facing is that the statements below the streaming line do not get executed until I stop the program by pressing Ctrl+C.
package com.onerm.spark
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark._
import org.apache.spark.sql.hive.HiveContext
import java.util.concurrent.{Executors, ExecutorService}
object HiveEvents {
def b2s(a: Array[Byte]): String = new String(a)
def main(args: Array[String]): Unit = {
val ehParams = Map[String, String](
"eventhubs.policyname" -> "myreceivepolicy",
"eventhubs.policykey" -> "jgrH/5yjdMjajQ1WUAQsKAVGTu34=",
"eventhubs.namespace" -> "SparkeventHubTest-ns",
"eventhubs.name" -> "SparkeventHubTest",
"eventhubs.partition.count" -> "4",
"eventhubs.consumergroup" -> "$default",
"eventhubs.checkpoint.dir" -> "/EventCheckpoint_0.1",
"eventhubs.checkpoint.interval" -> "10"
)
val conf = new SparkConf().setAppName("Eventhubs Onerm")
val sc= new SparkContext(conf)
val hiveContext = new HiveContext(sc)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val pool:ExecutorService=Executors.newFixedThreadPool(5)
val ssc = new StreamingContext(sc, Seconds(120))
var dataString :RDD[String] =sc.emptyRDD
val stream=EventHubsUtils.createUnionStream(ssc, ehParams)
// lines below are not getting executed until I stop the execution
stream.print()
stream.foreachRDD {
rdd =>
if(rdd.isEmpty())
{
println("RDD IS EMPTY ")
}
else
{
dataString=rdd.map(line=>b2s(line))
println("COUNT" +dataString.count())
sqlContext.read.json(dataString).registerTempTable("jsoneventdata")
val filterData=sqlContext.sql("SELECT id,ClientProperties.PID,ClientProperties.Program,ClientProperties.Platform,ClientProperties.Version,ClientProperties.HWType,ClientProperties.OffVer,ContentID,Data,Locale,MappedSources,MarketingMessageContext.ActivityInstanceID,MarketingMessageContext.CampaignID,MarketingMessageContext.SegmentName,MarketingMessageContext.OneRMInstanceID,MarketingMessageContext.DateTimeSegmented,Source,Timestamp.Date,Timestamp.Epoch,TransactionID,UserAction,EventProcessedUtcTime,PartitionId,EventEnqueuedUtcTime from jsoneventdata")
filterData.show(10)
filterData.saveAsParquetFile("EventCheckpoint_0.1/ParquetEvent")
} }
ssc.start()
ssc.awaitTermination()
}
}
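For context, output operations registered on a DStream (print, foreachRDD) only run once ssc.start() is called, and then once per batch interval (every 120 seconds here); nothing streams at definition time. A minimal illustration, assuming a stream like the one above:
val ssc = new StreamingContext(sc, Seconds(120))
stream.foreachRDD { rdd =>
  // Runs once per 120-second batch after ssc.start(), not when this line is defined.
  println("batch size = " + rdd.count())
}
ssc.start()            // output operations begin executing from here
ssc.awaitTermination() // blocks until the streaming job is stopped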