Spark Kafka Task not serializable - apache-spark

I have hit a wall trying to get around a Task not serializable error while breaking a Spark application out into classes and also using Try.
The code pulls the schema from S3 and does a streaming read from Kafka (the topic is in Avro format, with a schema registry).
I have tried using the class and not using the class, but in both cases I get a serialization error relating to a closure, which I guess means something non-serializable is being pulled in when Spark tries to serialize it. This error haunts me; it is a huge pain to get around. If someone could shed some light on how I can avoid this issue, that would be awesome. Java serialization sometimes seems to cause more issues than it is worth.
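For background, the usual mechanism behind this error (a minimal illustration of the closure-capture rule, not code from this question): referencing a class field inside a lambda compiles to a reference to this, so Spark has to serialize the whole enclosing instance with the closure. Copying the field to a local val first keeps the closure small and serializable.

import org.apache.spark.sql.{Dataset, SparkSession}

// Fails: _ + suffix really means _ + this.suffix, so the closure drags in
// the whole BadHolder instance, which does not implement Serializable.
class BadHolder {
  val suffix = "!"
  def run(spark: SparkSession, ds: Dataset[String]): Dataset[String] = {
    import spark.implicits._
    ds.map(_ + suffix)
  }
}

// Works: only the local String is captured by the closure.
class GoodHolder {
  val suffix = "!"
  def run(spark: SparkSession, ds: Dataset[String]): Dataset[String] = {
    import spark.implicits._
    val s = suffix
    ds.map(_ + s)
  }
}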
import java.util.Properties
import com.databricks.spark.avro._
import io.confluent.kafka.schemaregistry.client.rest.RestService
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroDecoder, KafkaAvroDeserializerConfig}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.util.{Failure, Success, Try}
case class DeserializedFromKafkaRecord(value: String)

class sparkS3() extends Serializable {

  def readpeopleSchemaDF(spark: SparkSession, topicSchemaLocation: String): scala.util.Try[StructType] = {
    val read: scala.util.Try[StructType] = Try(
      spark
        .read
        .option("header", "true")
        .format("com.databricks.spark.avro")
        .load(topicSchemaLocation)
        .schema
    )
    read
  }

  def writeTopicDF(peopleDFstream: DataFrame,
                   peopleDFstreamCheckpoint: String,
                   peopleDFstreamLocation: String): scala.util.Try[StreamingQuery] = {
    val write: scala.util.Try[StreamingQuery] = Try(
      peopleDFstream
        .writeStream
        .option("checkpointLocation", peopleDFstreamCheckpoint)
        .format("com.databricks.spark.avro")
        .option("path", peopleDFstreamLocation)
        .start()
    )
    write
  }
}
class sparkKafka() extends Serializable {

  def readpeopleTopicDF(spark: SparkSession, topicSchema: StructType): scala.util.Try[DataFrame] = {
    val brokers = "URL:9092"
    val schemaRegistryURL = "URL:8081"
    val kafkaParams = Map[String, String](
      "kafka.bootstrap.servers" -> brokers,
      "key.deserializer" -> "KafkaAvroDeserializer",
      "value.deserializer" -> "KafkaAvroDeserializer",
      "group.id" -> "structured-kafka",
      //"auto.offset.reset" -> "latest",
      "failOnDataLoss" -> "false",
      "schema.registry.url" -> schemaRegistryURL
    )
    var kafkaTopic = "people"

    object avroDeserializerWrapper {
      val props = new Properties()
      props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryURL)
      props.put(KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG, "true")
      val vProps = new kafka.utils.VerifiableProperties(props)
      val deser = new KafkaAvroDecoder(vProps)
      val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(kafkaTopic + "-value")
      val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
    }

    import spark.implicits._

    val read: scala.util.Try[DataFrame] = Try {
      val peopleStringDF = spark
        .readStream
        .format("kafka")
        .option("subscribe", kafkaTopic)
        .option("kafka.bootstrap.servers", brokers)
        .options(kafkaParams)
        .load()
        .map(x =>
          DeserializedFromKafkaRecord(
            avroDeserializerWrapper.deser
              .fromBytes(x.getAs[Array[Byte]]("value"), avroDeserializerWrapper.messageSchema)
              .asInstanceOf[GenericData.Record]
              .toString)
        )

      val peopleJsonDF = peopleStringDF
        .select(from_json(col("value").cast("string"), topicSchema).alias("people"))

      peopleJsonDF.select("people.*")
    }
    read
  }
}
object peopleDataLakePreprocStage1 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("peoplePreProcConsumerStage1")
      .getOrCreate()
    val topicSchemaLocation = "URL"
    val topicDFstreamCheckpoint = "URL"
    val topicDFstreamLocation = "URL"
    val sparkKafka = new sparkKafka()
    val sparkS3 = new sparkS3()

    sparkS3.readpeopleSchemaDF(spark, topicSchemaLocation) match {
      case Success(topicSchema) =>
        sparkKafka.readpeopleTopicDF(spark, topicSchema) match {
          case Success(df) =>
            sparkS3.writeTopicDF(df, topicDFstreamCheckpoint, topicDFstreamLocation) match {
              case Success(query) => query.awaitTermination()
              case Failure(f) => println(f)
            }
          case Failure(f) => println(f)
        }
      case Failure(f) => println(f)
    }
  }
}
Here is the error
java.lang.IllegalStateException: s3a://... doesn't exist when compacting batch 9 (compactInterval: 10)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:173)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:172)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.NumericRange.foreach(NumericRange.scala:73)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.compact(CompactibleFileStreamLog.scala:172)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.add(CompactibleFileStreamLog.scala:156)
at org.apache.spark.sql.execution.streaming.ManifestFileCommitProtocol.commitJob(ManifestFileCommitProtocol.scala:64)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:213)
at org.apache.spark.sql.execution.streaming.FileStreamSink.addBatch(FileStreamSink.scala:123)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
18/08/10 13:04:07 ERROR MicroBatchExecution: Query [id = 2876ded4-f223-40c4-8634-0c8feec94bf6, runId = 9b9a1347-7a80-4295-bb6e-ff2de18eeaf4] terminated with error
org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:224)
at org.apache.spark.sql.execution.streaming.FileStreamSink.addBatch(FileStreamSink.scala:123)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: java.lang.IllegalStateException: s3a://..../_spark_metadata/0 doesn't exist when compacting batch 9 (compactInterval: 10)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:173)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:172)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.NumericRange.foreach(NumericRange.scala:73)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.compact(CompactibleFileStreamLog.scala:172)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.add(CompactibleFileStreamLog.scala:156)
at org.apache.spark.sql.execution.streaming.ManifestFileCommitProtocol.commitJob(ManifestFileCommitProtocol.scala:64)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:213)
... 17 more

The resolution was one of two things (or both): extending Serializable on the classes, and moving them into separate files in the same namespace. I have updated the code above to reflect this.
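A related pattern that often helps (a sketch, not taken from the code above; the holder name and registry URL are illustrative): keep non-serializable helpers such as the Confluent decoder behind a @transient lazy val in an object, so the instance is never shipped with a closure and each executor JVM builds its own copy on first use.

import java.util.Properties
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroDecoder}

object DeserializerHolder extends Serializable {
  // @transient: excluded from serialization; lazy: built on first use,
  // once per JVM, instead of on the driver and then shipped to executors.
  @transient lazy val deser: KafkaAvroDecoder = {
    val props = new Properties()
    props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, "http://URL:8081")
    new KafkaAvroDecoder(new kafka.utils.VerifiableProperties(props))
  }
}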

Just a stab: in class sparkKafka you are using 'var' to define kafkaTopic - did you mean 'val'?

Related

why does spark return "task not serializable" if i run this code?

I have written some simple code in Spark.
It takes a file location from a dataframe column and returns a string indicating whether the file exists or not.
But once I run it, it throws a "task not serializable" error.
Can someone please help me get past this error?
import java.time.LocalDate
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object filetospark {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("app1")
      .master("local")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    val path: String => String = (Path: String) => {
      val exists = fs.exists(new Path(Path))
      var result = " "
      if (exists) {
        result = "Y"
      } else {
        result = "N"
      }
      result
    }
    val PATH = udf(path)
    val config_df = spark.read.
      option("header", "true").
      option("inferSchema", "true").
      csv("pathlocation")
    val current_date = LocalDate.now()
    val instance_table_df = instance_df.withColumn("is_available", PATH(col("file_name")))
  }
}
When I run it I get an error like this:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:850)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:849)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:849)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:613)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3384)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:255)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:292)
at org.apache.spark.sql.Dataset.show(Dataset.scala:746)
at org.apache.spark.sql.Dataset.show(Dataset.scala:705)
at org.apache.spark.sql.Dataset.show(Dataset.scala:714)
at filetospark$.main(filetospark.scala:40)
at filetospark.main(filetospark.scala)
Caused by: java.io.NotSerializableException: org.apache.hadoop.fs.LocalFileSystem
Serialization stack:
- object not serializable (class: org.apache.hadoop.fs.LocalFileSystem, value: org.apache.hadoop.fs.LocalFileSystem@7fd3fd06)
- field (class: filetospark$$anonfun$1, name: fs$1, type: class org.apache.hadoop.fs.FileSystem)
- object (class filetospark$$anonfun$1, <function1>)
- element of array (index: 4)
- array (class [Ljava.lang.Object;, size 5)
- field (class: org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11, name: references$1, type: class [Ljava.lang.Object;)
- object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11, <function2>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)
... 36 more
It shows this error; could someone please explain the problem? (The Serialization stack above already names the captured field fs$1, the LocalFileSystem.) With the version below, the error goes away:
import java.time.LocalDate
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object filetospark {
  val spark = SparkSession
    .builder()
    .appName("app1")
    .master("local")
    .getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  val path: String => String = (Path: String) => {
    val exists = fs.exists(new Path(Path))
    var result = " "
    if (exists) {
      result = "Y"
    } else {
      print("N")
      result = "N"
    }
    result
  }

  def main(args: Array[String]): Unit = {
    val PATH = udf(path)
    // val newfu = udf(newfun) // newfun is not defined in this snippet
    val config_df = spark.read.
      option("header", "true").
      option("inferSchema", "true").
      csv("filepath")
    val current_date = LocalDate.now()
    val instance_table_df = instance_df.withColumn("is_available", PATH(col("file_name")))
    instance_table_df.show()
  }
}
I don't know what is happening here. That error is now cleared, but my doubt remains.
I just created the Spark session (and the FileSystem) outside the main function and it works fine, but I don't know why. If anyone knows, please post here.
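A likely explanation, since the question was left open here: a function value defined inside main captures the local fs in its closure, and Spark must serialize that closure to ship it to executors; LocalFileSystem is not serializable, hence the error. A field on a top-level object is instead looked up on the singleton in whichever JVM runs the code, so nothing non-serializable is shipped. One way to make this explicit is a holder object with a @transient lazy val (a sketch; note that new Configuration() built on an executor is an assumption and may not carry the full cluster configuration):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.functions.udf

object FsHolder extends Serializable {
  // Initialized once per JVM on first use; never serialized into a task.
  @transient lazy val fs: FileSystem = FileSystem.get(new Configuration())
}

val pathExists = udf { (p: String) =>
  if (FsHolder.fs.exists(new Path(p))) "Y" else "N"
}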

How pass Basic Authentication to Confluent Schema Registry?

I want to read data from a Confluent Cloud topic and then write to another topic.
On localhost I haven't had any major problems, but the Confluent Cloud schema registry requires some authentication settings that I don't know how to pass:
basic.auth.credentials.source=USER_INFO
schema.registry.basic.auth.user.info=:
schema.registry.url=https://xxxxxxxxxx.confluent.cloud
Below is the current code:
import com.databricks.spark.avro.SchemaConverters
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.sql.SparkSession
object AvroConsumer {
  private val topic = "transactions"
  private val kafkaUrl = "http://localhost:9092"
  private val schemaRegistryUrl = "http://localhost:8081"
  private val schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
  private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
  private val avroSchema = schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
  private var sparkSchema = SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ConfluentConsumer")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    spark.udf.register("deserialize", (bytes: Array[Byte]) =>
      DeserializerWrapper.deserializer.deserialize(bytes)
    )

    val kafkaDataFrame = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("subscribe", topic)
      .load()

    val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")

    import org.apache.spark.sql.functions._
    val formattedDataFrame = valueDataFrame.select(
      from_json(col("message"), sparkSchema.dataType).alias("parsed_value"))
      .select("parsed_value.*")

    formattedDataFrame
      .writeStream
      .format("console")
      .option("truncate", false)
      .start()
      .awaitTermination()
  }

  object DeserializerWrapper {
    val deserializer = kafkaAvroDeserializer
  }

  class AvroDeserializer extends AbstractKafkaAvroDeserializer {
    def this(client: SchemaRegistryClient) {
      this()
      this.schemaRegistry = client
    }
    override def deserialize(bytes: Array[Byte]): String = {
      val genericRecord = super.deserialize(bytes).asInstanceOf[GenericRecord]
      genericRecord.toString
    }
  }
}
I think I have to pass this authentication data to CachedSchemaRegistryClient but I'm not sure if so and how.
I've finally been able to pass the properties. These are the lines that gave the solution:
import scala.collection.JavaConverters._

val restService = new RestService(schemaRegistryURL)
val props = Map(
  "basic.auth.credentials.source" -> "USER_INFO",
  "schema.registry.basic.auth.user.info" -> "secret:secret"
).asJava
val schemaRegistryClient = new CachedSchemaRegistryClient(restService, 100, props)
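For completeness, a sketch of how this plugs back into the code above (assuming the same AvroDeserializer class from the question):

// The authenticated client simply replaces the unauthenticated one;
// the rest of the pipeline (UDF registration, from_json) is unchanged.
private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)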

Kudu Client fails with exceptions after running for few days

I have a Scala/Spark/Kafka process that I run. When I first start the process I create a KuduClient object using a function I made, which I share between classes. For this job I create the KuduClient only once and let the process run continuously. I've noticed that after several days I frequently get exceptions.
I'm not really sure what to do. One option might be to create a new Kudu client every day or so, but I'm unsure of how to do that in this case; a sketch of one approach appears after the stack traces below.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.json.JSONObject
import org.apache.kudu.client.KuduClient
import org.apache.log4j.Logger
object Thing extends Serializable {
  @transient lazy val client: KuduClient = createKuduClient(config)
  @transient lazy val logger: Logger = Logger.getLogger(getClass.getName)

  def main(args: Array[String]) {
    UtilFunctions.loadConfig(args) //I send back a config object.
    UtilFunctions.loadLogger() //factory method to load logger
    val props: Map[String, String] = setKafkaProperties()
    val topic = Set(config.getString("config.TOPIC_NAME"))

    val conf = new SparkConf().setMaster("local[2]").setAppName(config.getString("config.SPARK_APP_NAME"))
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint(config.getString("config.SPARK_CHECKPOINT_NAME"))

    // val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, props, topic)
    val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topic, props))
    val distRecordsStream = kafkaStream.map(record => (record.key(), record.value()))
    distRecordsStream.window(Seconds(10), Seconds(10))

    distRecordsStream.foreachRDD(distRecords => {
      logger.info(distRecords + " : " + distRecords.count())
      distRecords.foreach(record => {
        logger.info(record._2)
        MyClass.DoSomethingWithThisData(new JSONObject(record._2), client)
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }

  def createKuduClient(config: Config): KuduClient = {
    var client: KuduClient = null
    try {
      client = new KuduClient.KuduClientBuilder(config.getString("config.KUDU_MASTER"))
        .defaultAdminOperationTimeoutMs(config.getInt("config.KUDU_ADMIN_TIMEOUT_S") * 1000)
        .defaultOperationTimeoutMs(config.getInt("config.KUDU_OPERATION_TIMEOUT_S") * 1000)
        .build()
    }
    catch {
      case e: Throwable =>
        logger.error(e.getMessage)
        logger.error(e.getStackTrace.toString)
        Thread.sleep(10000) //try to create a new kudu client
        client = createKuduClient(config)
    }
    client //return
  }

  def setKafkaProperties(): Map[String, String] = {
    val zookeeper = config.getString("config.ZOOKEEPER")
    val offsetReset = config.getString("config.OFFSET_RESET")
    val brokers = config.getString("config.BROKERS")
    val groupID = config.getString("config.GROUP_ID")
    val deserializer = config.getString("config.DESERIALIZER")
    val autoCommit = config.getString("config.AUTO_COMMIT")
    val maxPollRecords = config.getString("config.MAX_POLL_RECORDS")
    val maxPollIntervalms = config.getString("config.MAX_POLL_INTERVAL_MS")

    val props = Map(
      "bootstrap.servers" -> brokers,
      "zookeeper.connect" -> zookeeper,
      "group.id" -> groupID,
      "key.deserializer" -> deserializer,
      "value.deserializer" -> deserializer,
      "enable.auto.commit" -> autoCommit,
      "auto.offset.reset" -> offsetReset,
      "max.poll.records" -> maxPollRecords,
      "max.poll.interval.ms" -> maxPollIntervalms)
    props
  }
}
Exceptions below. I've replaced the IP addresses with "x".
ERROR client.TabletClient: [Peer master-ip-xxx-xx-xxx-40.ec2.internal:7051] Unexpected exception from downstream on [id: 0x42ba3f4d, /xxx.xx.xxx.39:36820 => ip-xxx-xxx-xxx-40.ec2.internal/xxx.xx.xxx.40:7051]
java.lang.RuntimeException: Could not deserialize the response, incompatible RPC? Error is: step
at org.apache.kudu.client.KuduRpc.readProtobuf(KuduRpc.java:383)
at org.apache.kudu.client.Negotiator.parseSaslMsgResponse(Negotiator.java:282)
at org.apache.kudu.client.Negotiator.handleResponse(Negotiator.java:235)
at org.apache.kudu.client.Negotiator.messageReceived(Negotiator.java:229)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.timeout.ReadTimeoutHandler.messageReceived(ReadTimeoutHandler.java:184)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.oneone.OneToOneDecoder.handleUpstream(OneToOneDecoder.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:310)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:303)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:268)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:255)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
at org.apache.kudu.client.shaded.org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
at org.apache.kudu.client.shaded.org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I've also seen exceptions like the following after running for a while, which others seem to attribute to hitting the user's open file handle limit.
java.io.IOException: All datanodes DatanodeInfoWithStorage[xxx.xx.xxx.36:1004,DS-55c403c3-203a-4dac-b383-72fcdb686185,DISK] are bad. Aborting...
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.setupPipelineForAppendOrRecovery(DFSOutputStream.java:1465)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.processDatanodeError(DFSOutputStream.java:1236)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputSt
Does this have something to do with having too many open files? Is there a way to "purge" these handles once they reach a limit?
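One hedged way to do the periodic recreation mentioned in the question (a sketch: it reuses createKuduClient from the post above, and the daily threshold is an arbitrary choice): route all access through a holder that closes and rebuilds the client once it exceeds a maximum age, so leaked sockets and file handles get released.

import com.typesafe.config.Config // assumption: the question's getString calls suggest Typesafe Config
import org.apache.kudu.client.KuduClient

object KuduClientHolder {
  private val maxAgeMs = 24L * 60 * 60 * 1000 // rebuild roughly daily
  private var createdAt = 0L
  private var client: KuduClient = _

  // Callers ask the holder for a client instead of caching one themselves.
  def get(config: Config): KuduClient = synchronized {
    val now = System.currentTimeMillis()
    if (client == null || now - createdAt > maxAgeMs) {
      if (client != null) client.close() // releases sockets and file handles
      client = Thing.createKuduClient(config) // the builder function from the question
      createdAt = now
    }
    client
  }
}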

Error while connecting Kafka to Spark

I am trying to connect Kafka to Spark. I use kafka_2.11-0.11.0.1 and Spark 2.2.0, and I included these jar files:
kafka_2.11-0.11.0.1
kafka-clients-0.11.0.1
spark-streaming_2.11-2.2.0
spark-streaming-kafka_2.11-2.2.0
and here is my code:
import org.apache.spark._
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka
import org.apache.spark.streaming.kafka.KafkaUtils
object KafkaExample {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(20))

    val kafkaParams = Map("metadata.broker.list" -> "kafkaIP:9092")
    val topics = Set("logstash_log")
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    stream.print()

    ssc.checkpoint("C:/checkpoint/")
    ssc.start()
    ssc.awaitTermination()
  }
}
I get this error and couldn't find the solution anywhere:
Exception in thread "main" java.lang.NoSuchMethodError: kafka.api.TopicMetadata.errorCode()S
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1$$anonfun$4.apply(KafkaCluster.scala:127)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1$$anonfun$4.apply(KafkaCluster.scala:127)
at scala.collection.TraversableLike$$anonfun$filterImpl$1.apply(TraversableLike.scala:248)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1.apply(KafkaCluster.scala:127)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1.apply(KafkaCluster.scala:125)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:346)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:342)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at org.apache.spark.streaming.kafka.KafkaCluster.org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers(KafkaCluster.scala:342)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitionMetadata(KafkaCluster.scala:125)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitions(KafkaCluster.scala:112)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:211)
at org.apache.spark.streaming.kafka.KafkaUtils$.createDirectStream(KafkaUtils.scala:484)
at com.defne.KafkaExample$.main(KafkaExample.scala:27)
at com.defne.KafkaExample.main(KafkaExample.scala)
Why does this occur? How can I handle this? Any help will be greatly appreciated!
Thanks.
This may help; you can modify it for your dataset and IP address. The NoSuchMethodError above typically indicates a version mismatch: the old spark-streaming-kafka integration (built against the Kafka 0.8 API) does not work with 0.11 client jars, whereas the example below uses the kafka-0-10 direct stream API.
def StreamingFromKafkaMain(): Unit = {
  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "192.168.34.216:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "use_a_separate_group_id_for_each_stream",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )
  val topics = Array("myTopicName")
  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("KafkaTest")
  val streamingContext = new StreamingContext(sparkConf, Seconds(1))

  // Create an input direct stream
  val kafkaStream = KafkaUtils.createDirectStream[String, String](
    streamingContext,
    PreferConsistent,
    Subscribe[String, String](topics, kafkaParams)
  )

  val spark = SparkSession.builder().master("local[*]").appName("KafkaTest").getOrCreate()
  val items = kafkaStream.map(record => (record.key, record.value.split("\n")))
  val itemStatus = items.map(status => status.toString())

  items.foreachRDD(
    rddm => if (!rddm.isEmpty()) {
      //val my_dataset=StreamingFromKafkaOracleMain();
      println("Test")
      //my_dataset.show
      //val df1 = rddm.map(_.mkString(",")).map(x=> schema(x(0).toString,x(1).toInt,x(2).toString)).toDF()
      val splittedRdd = rddm.map(line => line.toString().split(","))
      println(splittedRdd.take(10))
    }
  )

  streamingContext.start()
  streamingContext.awaitTermination()
}
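Note that this example uses the newer spark-streaming-kafka-0-10 integration rather than the 0.8-based one from the question. Assuming an sbt build (the versions mirror the question's Spark 2.2.0), the dependencies would look something like:

// build.sbt (sketch): the 0-10 integration replaces spark-streaming-kafka (0.8-based)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.2.0",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0"
)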

UnaryTransformer instance throwing ClassCastException

I have a requirement to create my own UnaryTransformer instance that accepts a DataFrame column of type Array[String] and outputs the same type. In trying to do so, I encountered a ClassCastException on Spark 2.1.0.
I've put together a sample test that shows my case.
import org.apache.spark.SparkConf
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
class MyTransformer(override val uid: String) extends UnaryTransformer[Array[String], Array[String], MyTransformer] {

  override protected def createTransformFunc: (Array[String]) => Array[String] = {
    param1 => {
      param1.foreach(println(_))
      param1
    }
  }

  override protected def outputDataType: DataType = ArrayType(StringType)

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType), s"Data type mismatch between Array[String] and provided type $inputType.")
  }

  def this() = this(Identifiable.randomUID("tester"))
}

object Tester {
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("Tester")
    implicit val sparkSession = SparkSession.builder().config(config).getOrCreate()
    import sparkSession.implicits._

    val dataframe = Seq(Array("Firstly", "F1"), Array("Driving", "S1"), Array("Ran", "T3"), Array("Fourth", "F4"),
      Array("Running", "F5"), Array("Gone", "S6")).toDF("input")

    val transformer = new MyTransformer().setInputCol("input").setOutputCol("output")
    val transformed = transformer.transform(dataframe)
    transformed.select("output").show()

    println("Complete....")
    sparkSession.close()
  }
}
Attaching the stack trace for reference
Exception in thread "main" org.apache.spark.SparkException: Failed to
execute user defined function($anonfun$createTransformFunc$1:
(array) => array) at
org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1072)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:144)
at
org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:48)
at
org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:30)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:392) at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:296) at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$21.applyOrElse(Optimizer.scala:1078)
at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$21.applyOrElse(Optimizer.scala:1073)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
at
org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:287)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:277)
at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$.apply(Optimizer.scala:1073)
at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$.apply(Optimizer.scala:1072)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
at
scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at
scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at
scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
at scala.collection.immutable.List.foreach(List.scala:392) at
org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
at
org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:73)
at
org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:73)
at
org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:79)
at
org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:75)
at
org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:84)
at
org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:84)
at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2791)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2112) at
org.apache.spark.sql.Dataset.take(Dataset.scala:2327) at
org.apache.spark.sql.Dataset.showString(Dataset.scala:248) at
org.apache.spark.sql.Dataset.show(Dataset.scala:636) at
org.apache.spark.sql.Dataset.show(Dataset.scala:595) at
org.apache.spark.sql.Dataset.show(Dataset.scala:604) at
Tester$.main(Tester.scala:45) at Tester.main(Tester.scala)
Caused by: java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to
[Ljava.lang.String; at
MyTransformer$$anonfun$createTransformFunc$1.apply(Tester.scala:9)
at
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:89)
at
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:88)
at
org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1069)
... 53 more
ArrayType is represented as Seq not Array:
override protected def createTransformFunc: (Seq[String]) => Seq[String] = {
  param1 => {
    param1.foreach(println(_))
    param1
  }
}
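Presumably the class declaration's type parameters have to change to match; a sketch based on the question's class:

class MyTransformer(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], MyTransformer] {
  // createTransformFunc as above; outputDataType and validateInputType stay
  // unchanged, since the SQL-side type is still ArrayType(StringType).
}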
