Below is part of my Spark job:
def parse(evt: Event): String = {
  try {
    val config = new java.util.HashMap[java.lang.String, AnyRef] // Line1
    config.put("key", "value") // Line2
    val decoder = new DeserializerHelper(config, classOf[GenericRecord]) // Line3
    val payload = decoder.deserializeData(evt.getId, evt.toBytes)
    val record = payload.get("data")
    record.toString
  } catch {
    case e: Exception => "exception:" + e.toString
  }
}
try {
  val inputStream = KafkaUtils.createDirectStream(
    ssc,
    PreferConsistent,
    Subscribe[String, String](Array(inputTopic), kafkaParams)
  )
  val processedStream = inputStream.map(record => parse(record.value()))
  processedStream.print()
} finally {
}
If I move Line1-Line3 in the code above outside the parse() function, I get:
Caused by: java.io.NotSerializableException: SchemaDeserializerHelper
Serialization stack:
- object not serializable (class: SchemaDeserializerHelper, value: SchemaDeserializerHelper@2e23c180)
- field (class: App$$anonfun$1, name: decoder$1, type: class SchemaDeserializerHelper)
- object (class App$$anonfun$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:342)
... 22 more
Why? I'd prefer not to create Line1-Line3 inside the parse() function on every call; how can I optimize this?
Thanks
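The decoder fails to serialize because, once Line1-Line3 live outside parse(), the map closure captures the decoder and Spark must ship it to the executors, and SchemaDeserializerHelper is not Serializable. A common workaround (a minimal sketch, reusing the class names from the snippet above) is to hold the decoder in an object behind a lazy val, so each executor JVM builds its own instance on first use and nothing non-serializable is captured by the closure:

object DecoderHolder {
  // Built lazily on each executor; the closure references the object, never a captured instance.
  lazy val decoder: DeserializerHelper = {
    val config = new java.util.HashMap[java.lang.String, AnyRef]
    config.put("key", "value")
    new DeserializerHelper(config, classOf[GenericRecord])
  }
}

def parse(evt: Event): String = {
  try {
    val payload = DecoderHolder.decoder.deserializeData(evt.getId, evt.toBytes)
    payload.get("data").toString
  } catch {
    case e: Exception => "exception:" + e.toString
  }
}

Alternatively, mapPartitions lets you create one decoder per partition inside the closure, which achieves the same effect without a singleton.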
Related
I have written a simple piece of code in Spark.
It takes a file location from a dataframe column and returns a string indicating whether the file exists or not.
But when I run it, it throws a "Task not serializable" error.
Can someone please help me resolve this error?
object filetospark {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("app1")
      .master("local")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    val path: String => String = (Path: String) => {
      val exists = fs.exists(new Path(Path))
      var result = " "
      if (exists) {
        result = "Y"
      } else {
        result = "N"
      }
      result
    }
    val PATH = udf(path)
    val config_df = spark.read.
      option("header", "true").
      option("inferSchema", "true").
      csv("pathlocation")
    val current_date = LocalDate.now()
    val instance_table_df = instance_df.withColumn("is_available", PATH(col("file_name")))
    instance_table_df.show() // action that triggers the error (filetospark.scala:40 in the stack trace)
  }
}
The error looks like this:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:850)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:849)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:849)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:613)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3384)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:255)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:292)
at org.apache.spark.sql.Dataset.show(Dataset.scala:746)
at org.apache.spark.sql.Dataset.show(Dataset.scala:705)
at org.apache.spark.sql.Dataset.show(Dataset.scala:714)
at filetospark$.main(filetospark.scala:40)
at filetospark.main(filetospark.scala)
Caused by: java.io.NotSerializableException: org.apache.hadoop.fs.LocalFileSystem
Serialization stack:
- object not serializable (class: org.apache.hadoop.fs.LocalFileSystem, value: org.apache.hadoop.fs.LocalFileSystem@7fd3fd06)
- field (class: filetospark$$anonfun$1, name: fs$1, type: class org.apache.hadoop.fs.FileSystem)
- object (class filetospark$$anonfun$1, <function1>)
- element of array (index: 4)
- array (class [Ljava.lang.Object;, size 5)
- field (class: org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11, name: references$1, type: class [Ljava.lang.Object;)
- object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11, <function2>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)
... 36 more
It shows this error. Could someone please help me solve this problem?
object filetospark {
  val spark = SparkSession
    .builder()
    .appName("app1")
    .master("local")
    .getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  val path: String => String = (Path: String) => {
    val exists = fs.exists(new Path(Path))
    var result = " "
    if (exists) {
      result = "Y"
    } else {
      print("N")
      result = "N"
    }
    result
  }

  def main(args: Array[String]): Unit = {
    val PATH = udf(path)
    val newfu = udf(newfun)
    val config_df = spark.read.
      option("header", "true").
      option("inferSchema", "true").
      csv("filepath")
    val current_date = LocalDate.now()
    val instance_table_df = instance_df.withColumn("is_available", PATH(col("file_name")))
    instance_table_df.show()
  }
}
I don't know what is happening here. The error is now cleared, but my doubt remains:
I just created the Spark session (and fs) outside the main function and it works fine, but I don't understand why. If anyone knows, please post here.
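As for why it works: when fs is a local val inside main, the UDF lambda captures it as a field, and Spark then has to serialize a LocalFileSystem, which fails (exactly what the serialization stack shows). When fs is a member of the filetospark object, the lambda can reach it through the object instead of capturing it, so nothing non-serializable ends up in the closure; each JVM resolves the object reference locally. Note that on a real cluster this would initialize the object, including the SparkSession, on the executors as well, so a more explicit pattern (a sketch, assuming the default Hadoop configuration can resolve your paths) is to build the FileSystem inside the UDF:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.functions.udf

val pathExists = udf((location: String) => {
  // Created on the executor for each call; FileSystem.get caches instances per
  // scheme/authority, so this does not open a new connection for every row.
  val fs = FileSystem.get(new Configuration())
  if (fs.exists(new Path(location))) "Y" else "N"
})

Nothing outside the lambda is referenced, so there is nothing for Spark to serialize beyond the function itself.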
I want to put Avro messages from Kafka topics into Elasticsearch using a Spark job (and Schema Registry with many defined schemas). I was able to read and deserialize records into String (JSON) format successfully with these two methods:
// Deserialize Avro to String
def avroToJsonString(record: GenericRecord): String = try {
  val baos = new ByteArrayOutputStream
  try {
    val schema = record.getSchema
    val jsonEncoder = EncoderFactory.get.jsonEncoder(schema, baos, false)
    val avroWriter = new SpecificDatumWriter[GenericRecord](schema)
    avroWriter.write(record, jsonEncoder)
    jsonEncoder.flush()
    baos.flush()
    new String(baos.toByteArray)
  } catch {
    case ex: IOException =>
      throw new IllegalStateException(ex)
  } finally if (baos != null) baos.close()
}
// Parse JSON String
val parseJsonStream = (inStream: String) => {
  try {
    val parsed = Json.parse(inStream)
    Option(parsed)
  } catch {
    case e: Exception =>
      System.err.println("Exception while parsing JSON: " + inStream)
      e.printStackTrace()
      None
  }
}
I'm reading record by record and I see the deserialized JSON strings in the debugger, so everything looks fine, but for some reason I couldn't save them into Elasticsearch, because I guess an RDD is needed to call the saveToEs method. This is how I read Avro records from Kafka:
val kafkaStream: InputDStream[ConsumerRecord[String, GenericRecord]] =
  KafkaUtils.createDirectStream[String, GenericRecord](ssc, PreferBrokers, Subscribe[String, GenericRecord](KAFKA_AVRO_TOPICS, kafkaParams))

val kafkaStreamParsed = kafkaStream.foreachRDD(rdd => {
  rdd.foreach(x => {
    val jsonString: String = avroToJsonString(x.value())
    parseJsonStream(jsonString)
  })
})
When I was reading JSON (not Avro) records, I was able to do it with:
EsSparkStreaming.saveToEs(kafkaStreamParsed, ELASTICSEARCH_EVENTS_INDEX + "/" + ELASTICSEARCH_TYPE)
I get an error on the saveToEs method saying:
Cannot resolve overloaded method 'saveToEs'
I tried to make an RDD with sc.makeRDD(), but had no luck either. How should I put all these records from the batch job into an RDD and then into Elasticsearch, or am I doing it all wrong?
UPDATE
Tried this solution:
val messages: DStream[Unit] = kafkaStream
  .map(record => record.value)
  .flatMap(record => {
    val record1 = avroToJsonString(record)
    JSON.parseFull(record1).map(rawMap => {
      val map = rawMap.asInstanceOf[Map[String, String]]
    })
  })
Again the same error (cannot resolve overloaded method).
UPDATE2
val kafkaStreamParsed: DStream[Any] = kafkaStream.map(rdd => {
  val eventJSON = avroToJsonString(rdd.value())
  parseJsonStream(eventJSON)
})

try {
  EsSparkStreaming.saveToEs(kafkaStreamParsed, ELASTICSEARCH_EVENTS_INDEX + "/" + ELASTICSEARCH_TYPE)
} catch {
  case e: Exception =>
    EsSparkStreaming.saveToEs(kafkaStreamParsed, ELASTICSEARCH_FAILED_EVENTS)
    e.printStackTrace()
}
Now I get the records in ES.
Using Spark 2.3.0 and Scala 2.11.8
I've managed to do it:
val kafkaStream: InputDStream[ConsumerRecord[String, GenericRecord]] =
  KafkaUtils.createDirectStream[String, GenericRecord](ssc, PreferBrokers, Subscribe[String, GenericRecord](KAFKA_AVRO_EVENT_TOPICS, kafkaParams))

val kafkaStreamParsed: DStream[Any] = kafkaStream.map(rdd => {
  val eventJSON = avroToJsonString(rdd.value())
  parseJsonStream(eventJSON)
})

try {
  EsSparkStreaming.saveToEs(kafkaStreamParsed, ELASTICSEARCH_EVENTS_INDEX + "/" + ELASTICSEARCH_TYPE)
} catch {
  case e: Exception =>
    EsSparkStreaming.saveToEs(kafkaStreamParsed, ELASTICSEARCH_FAILED_EVENTS)
    e.printStackTrace()
}
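A side note on the design (a sketch, not tested against your cluster): since avroToJsonString already produces JSON text, you could keep a DStream[String] and let ES-Hadoop index the raw JSON directly, assuming your elasticsearch-hadoop version exposes EsSparkStreaming.saveJsonToEs (mirroring the batch EsSpark.saveJsonToEs):

import org.apache.spark.streaming.dstream.DStream
import org.elasticsearch.spark.streaming.EsSparkStreaming

// Reuses kafkaStream, avroToJsonString and the index constants from above.
val jsonStream: DStream[String] = kafkaStream.map(record => avroToJsonString(record.value()))
EsSparkStreaming.saveJsonToEs(jsonStream, ELASTICSEARCH_EVENTS_INDEX + "/" + ELASTICSEARCH_TYPE)

Also note that the try/catch around saveToEs only guards registering the output operation; failures while the stream is running happen later, inside each batch, and will not be caught there.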
I have a dataframe, let's say:
val someDF = Seq(
  (8, "bat"),
  (64, "mouse"),
  (-27, "horse")
).toDF("number", "word")
I want to send that dataframe to a Kafka topic using Avro serialization and the Schema Registry. I believe I'm almost there, but I can't seem to get past the Task not serializable error. I understand there is a Kafka sink, but it doesn't communicate with the Schema Registry, which is a requirement.
object Holder extends Serializable {
  def prop(): java.util.Properties = {
    val props = new Properties()
    props.put("schema.registry.url", schemaRegistryURL)
    props.put("key.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
    props.put("value.serializer", classOf[KafkaAvroSerializer].getCanonicalName)
    props.put("schema.registry.url", schemaRegistryURL)
    props.put("bootstrap.servers", brokers)
    props
  }

  def vProps(props: java.util.Properties): kafka.utils.VerifiableProperties = {
    val vProps = new kafka.utils.VerifiableProperties(props)
    vProps
  }

  def messageSchema(vProps: kafka.utils.VerifiableProperties): org.apache.avro.Schema = {
    val ser = new KafkaAvroEncoder(vProps)
    val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(subjectValueName)
    val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
    messageSchema
  }

  def avroRecord(messageSchema: org.apache.avro.Schema): org.apache.avro.generic.GenericData.Record = {
    val avroRecord = new GenericData.Record(messageSchema)
    avroRecord
  }

  def ProducerRecord(avroRecord: org.apache.avro.generic.GenericData.Record): org.apache.kafka.clients.producer.ProducerRecord[org.apache.avro.generic.GenericRecord, org.apache.avro.generic.GenericRecord] = {
    val record = new ProducerRecord[GenericRecord, GenericRecord](topicWrite, avroRecord)
    record
  }

  def producer(props: java.util.Properties): KafkaProducer[GenericRecord, GenericRecord] = {
    val producer = new KafkaProducer[GenericRecord, GenericRecord](props)
    producer
  }
}
val prod: (String, String) => String = (number: String, word: String) => {
  val prop = Holder.prop()
  val vProps = Holder.vProps(prop)
  val mSchema = Holder.messageSchema(vProps)
  val aRecord = Holder.avroRecord(mSchema)
  aRecord.put("number", number)
  aRecord.put("word", word)
  val record = Holder.ProducerRecord(aRecord)
  val producer = Holder.producer(prop)
  producer.send(record)
  "sent"
}

val prodUDF: org.apache.spark.sql.expressions.UserDefinedFunction =
  udf((number: String, word: String) => prod(number, word))
val testDF = firstDF.withColumn("sent", prodUDF(col("number"), col("word")))
KafkaProducer is not serializable.
Create the KafkaProducer inside prod() instead of creating it outside.
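To avoid constructing a producer for every row, a common variation (a sketch, reusing the Holder helpers from the question) is to keep the producer in a lazy val inside an object, so each executor creates exactly one instance on first use and nothing non-serializable is captured by the UDF:

object ProducerHolder {
  // Initialized independently on each executor JVM; never serialized with the closure.
  lazy val producer: KafkaProducer[GenericRecord, GenericRecord] =
    new KafkaProducer[GenericRecord, GenericRecord](Holder.prop())
}

val prod: (String, String) => String = (number: String, word: String) => {
  val vProps = Holder.vProps(Holder.prop())
  val mSchema = Holder.messageSchema(vProps) // could be cached the same way to avoid a registry call per row
  val aRecord = Holder.avroRecord(mSchema)
  aRecord.put("number", number)
  aRecord.put("word", word)
  ProducerHolder.producer.send(Holder.ProducerRecord(aRecord))
  "sent"
}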
I am executing a Spark Streaming job and I want to publish my result_dstream, which is of type DStream[GenericData.Record], so I used the code below for that purpose, but I am getting a Task not serializable exception:
val prod_props: Properties = new Properties()
prod_props.put("bootstrap.servers", "localhost:9092")
prod_props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
prod_props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer")

val _producer: KafkaProducer[String, Array[Byte]] = new KafkaProducer(prod_props)

result_DStream.foreachRDD(r => {
  r.foreachPartition(it => {
    while (it.hasNext) {
      val schema = new Schema.Parser().parse(schema_string)
      val recordInjection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
      val record: GenericData.Record = it.next()
      val byte: Array[Byte] = recordInjection.apply(record)
      val prod_record: ProducerRecord[String, Array[Byte]] = new ProducerRecord("sample_topic_name_9", byte)
      _producer.send(prod_record)
    }
  })
})
What can I do to solve this problem? I have tried suggestions like using the non-serializable class inside the lambda function and using foreachPartition instead of foreach; as far as I can tell, the problem is in schema or in recordInjection.
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2062)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at HadoopMetrics_Online$$anonfun$main$3.apply(HadoopMetrics_Online.scala:187)
at HadoopMetrics_Online$$anonfun$main$3.apply(HadoopMetrics_Online.scala:186)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.NotSerializableException: org.apache.kafka.clients.producer.KafkaProducer
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.producer.KafkaProducer, value: org.apache.kafka.clients.producer.KafkaProducer@252f5489)
- field (class: HadoopMetrics_Online$$anonfun$main$3, name: _producer$1, type: class org.apache.kafka.clients.producer.KafkaProducer)
- object (class HadoopMetrics_Online$$anonfun$main$3, <function1>)
- field (class: HadoopMetrics_Online$$anonfun$main$3$$anonfun$apply$1, name: $outer, type: class HadoopMetrics_Online$$anonfun$main$3)
- object (class HadoopMetrics_Online$$anonfun$main$3$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more
KafkaProducer isn't serializable, and you're closing over it in your foreachPartition method. You'll need to declare it internally:
resultDStream.foreachRDD(r => {
  r.foreachPartition(it => {
    val producer: KafkaProducer[String, Array[Byte]] = new KafkaProducer(prod_props)
    while (it.hasNext) {
      val schema = new Schema.Parser().parse(schema_string)
      val recordInjection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
      val record: GenericData.Record = it.next()
      val byte: Array[Byte] = recordInjection.apply(record)
      val prod_record: ProducerRecord[String, Array[Byte]] = new ProducerRecord("sample_topic_name_9", byte)
      producer.send(prod_record)
    }
  })
})
Side note - Scala naming conventions are camelCase for variable names, not snake_case.
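An optional refinement of the snippet above (behaviour unchanged; renamed to camelCase per the side note, so prod_props and schema_string become prodProps and schemaString): the schema and the Injection do not vary per record, so they can be built once per partition outside the while loop, and the producer can be closed when the partition is done so buffered sends are flushed:

resultDStream.foreachRDD(rdd => {
  rdd.foreachPartition(it => {
    val producer = new KafkaProducer[String, Array[Byte]](prodProps)
    // Per-partition setup: parse the schema and build the injection once, not per record.
    val schema = new Schema.Parser().parse(schemaString)
    val recordInjection: Injection[GenericRecord, Array[Byte]] = GenericAvroCodecs.toBinary(schema)
    it.foreach { record =>
      val bytes = recordInjection.apply(record)
      producer.send(new ProducerRecord[String, Array[Byte]]("sample_topic_name_9", bytes))
    }
    producer.close() // flushes pending sends before the partition task finishes
  })
})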
code below:
def main(args: Array[String]) {
  val sc = new SparkContext
  val sec = Seconds(3)
  val ssc = new StreamingContext(sc, sec)
  ssc.checkpoint("./checkpoint")

  val rdd = ssc.sparkContext.parallelize(Seq("a", "b", "c"))
  val inputDStream = new ConstantInputDStream(ssc, rdd)

  inputDStream.transform(rdd => {
    val buf = ListBuffer[String]()
    buf += "1"
    buf += "2"
    buf += "3"
    val other_rdd = ssc.sparkContext.parallelize(buf) // create a new rdd
    rdd.union(other_rdd)
  }).print()

  ssc.start()
  ssc.awaitTermination()
}
and it throws this exception:
java.io.NotSerializableException: DStream checkpointing has been enabled but the DStreams with their functions are not serializable
org.apache.spark.streaming.StreamingContext
Serialization stack:
- object not serializable (class: org.apache.spark.streaming.StreamingContext, value: org.apache.spark.streaming.StreamingContext@5626e185)
- field (class: com.mirrtalk.Test$$anonfun$main$1, name: ssc$1, type: class org.apache.spark.streaming.StreamingContext)
- object (class com.mirrtalk.Test$$anonfun$main$1, <function1>)
- field (class: org.apache.spark.streaming.dstream.DStream$$anonfun$transform$1$$anonfun$apply$21, name: cleanedF$2, type: interface scala.Function1)
- object (class org.apache.spark.streaming.dstream.DStream$$anonfun$transform$1$$anonfun$apply$21, <function2>)
- field (class: org.apache.spark.streaming.dstream.DStream$$anonfun$transform$2$$anonfun$5, name: cleanedF$3, type: interface scala.Function2)
- object (class org.apache.spark.streaming.dstream.DStream$$anonfun$transform$2$$anonfun$5, <function2>)
- field (class: org.apache.spark.streaming.dstream.TransformedDStream, name: transformFunc, type: interface scala.Function2)
When I remove ssc.checkpoint("./checkpoint"), the application works fine, but I need checkpointing enabled.
How can I fix this issue with checkpointing enabled?
You can move context initialization and configuration tasks outside main:
object App {
  val sc = new SparkContext(new SparkConf().setAppName("foo").setMaster("local"))
  val sec = Seconds(3)
  val ssc = new StreamingContext(sc, sec)
  ssc.checkpoint("./checkpoint") // enable checkpoint

  def main(args: Array[String]) {
    val rdd = ssc.sparkContext.parallelize(Seq("a", "b", "c"))
    val inputDStream = new ConstantInputDStream(ssc, rdd)

    inputDStream.transform(rdd => {
      val buf = ListBuffer[String]()
      buf += "1"
      buf += "2"
      buf += "3"
      val other_rdd = ssc.sparkContext.parallelize(buf)
      rdd.union(other_rdd) // I want to union other RDD
    }).print()

    ssc.start()
    ssc.awaitTermination()
  }
}
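A related alternative (a sketch, keeping the context setup inside main): inside transform you already have an RDD in scope, so you can reach the SparkContext through it instead of closing over ssc; the StreamingContext then never enters the closure and checkpointing can stay enabled:

inputDStream.transform(rdd => {
  val buf = ListBuffer("1", "2", "3")
  val otherRdd = rdd.sparkContext.parallelize(buf) // no reference to ssc inside the closure
  rdd.union(otherRdd)
}).print()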