Spark Kafka Task not serializable - apache-spark

I have hit a wall trying to get around a Task not serializable error while breaking a Spark application out into classes and also using Try.
The code pulls the schema from S3 and does a streaming read from Kafka (the topic is in Avro format, with a schema registry).
I have tried using the class and not using the class, but in both cases I get a serialization error relating to a closure, which I guess means something non-serializable is being pulled in when Spark tries to serialize it. This error haunts me; it is a huge pain to get around. If someone could shed some light on how I can avoid this issue, that would be awesome. Java serialization sometimes seems to cause more issues than it is worth.
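For background, the usual mechanism behind this error (a minimal illustration of the closure-capture rule, not code from this question): referencing a class field inside a lambda compiles to a reference to this, so Spark has to serialize the whole enclosing instance with the closure. Copying the field to a local val first keeps the closure small and serializable.

import org.apache.spark.sql.{Dataset, SparkSession}

// Fails: _ + suffix really means _ + this.suffix, so the closure drags in
// the whole BadHolder instance, which does not implement Serializable.
class BadHolder {
  val suffix = "!"
  def run(spark: SparkSession, ds: Dataset[String]): Dataset[String] = {
    import spark.implicits._
    ds.map(_ + suffix)
  }
}

// Works: only the local String is captured by the closure.
class GoodHolder {
  val suffix = "!"
  def run(spark: SparkSession, ds: Dataset[String]): Dataset[String] = {
    import spark.implicits._
    val s = suffix
    ds.map(_ + s)
  }
}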
import java.util.Properties
import com.databricks.spark.avro._
import io.confluent.kafka.schemaregistry.client.rest.RestService
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroDecoder, KafkaAvroDeserializerConfig}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.spark.sql.functions.{col, from_json}
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.util.{Failure, Success, Try}
case class DeserializedFromKafkaRecord(value: String)

class sparkS3() extends Serializable {

  def readpeopleSchemaDF(spark: SparkSession, topicSchemaLocation: String): scala.util.Try[StructType] = {
    val read: scala.util.Try[StructType] = Try(
      spark
        .read
        .option("header", "true")
        .format("com.databricks.spark.avro")
        .load(topicSchemaLocation)
        .schema
    )
    read
  }

  def writeTopicDF(peopleDFstream: DataFrame,
                   peopleDFstreamCheckpoint: String,
                   peopleDFstreamLocation: String): scala.util.Try[StreamingQuery] = {
    val write: scala.util.Try[StreamingQuery] = Try(
      peopleDFstream
        .writeStream
        .option("checkpointLocation", peopleDFstreamCheckpoint)
        .format("com.databricks.spark.avro")
        .option("path", peopleDFstreamLocation)
        .start()
    )
    write
  }
}
class sparkKafka() extends Serializable {

  def readpeopleTopicDF(spark: SparkSession, topicSchema: StructType): scala.util.Try[DataFrame] = {
    val brokers = "URL:9092"
    val schemaRegistryURL = "URL:8081"
    val kafkaParams = Map[String, String](
      "kafka.bootstrap.servers" -> brokers,
      "key.deserializer" -> "KafkaAvroDeserializer",
      "value.deserializer" -> "KafkaAvroDeserializer",
      "group.id" -> "structured-kafka",
      //"auto.offset.reset" -> "latest",
      "failOnDataLoss" -> "false",
      "schema.registry.url" -> schemaRegistryURL
    )
    var kafkaTopic = "people"

    object avroDeserializerWrapper {
      val props = new Properties()
      props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryURL)
      props.put(KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG, "true")
      val vProps = new kafka.utils.VerifiableProperties(props)
      val deser = new KafkaAvroDecoder(vProps)
      val avro_schema = new RestService(schemaRegistryURL).getLatestVersion(kafkaTopic + "-value")
      val messageSchema = new Schema.Parser().parse(avro_schema.getSchema)
    }

    import spark.implicits._

    val read: scala.util.Try[DataFrame] = Try {
      val peopleStringDF = spark
        .readStream
        .format("kafka")
        .option("subscribe", kafkaTopic)
        .option("kafka.bootstrap.servers", brokers)
        .options(kafkaParams)
        .load()
        .map(x =>
          DeserializedFromKafkaRecord(
            avroDeserializerWrapper.deser
              .fromBytes(x.getAs[Array[Byte]]("value"), avroDeserializerWrapper.messageSchema)
              .asInstanceOf[GenericData.Record]
              .toString)
        )

      val peopleJsonDF = peopleStringDF
        .select(from_json(col("value").cast("string"), topicSchema).alias("people"))

      peopleJsonDF.select("people.*")
    }
    read
  }
}
object peopleDataLakePreprocStage1 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("peoplePreProcConsumerStage1")
      .getOrCreate()
    val topicSchemaLocation = "URL"
    val topicDFstreamCheckpoint = "URL"
    val topicDFstreamLocation = "URL"
    val sparkKafka = new sparkKafka()
    val sparkS3 = new sparkS3()

    sparkS3.readpeopleSchemaDF(spark, topicSchemaLocation) match {
      case Success(topicSchema) =>
        sparkKafka.readpeopleTopicDF(spark, topicSchema) match {
          case Success(df) =>
            sparkS3.writeTopicDF(df, topicDFstreamCheckpoint, topicDFstreamLocation) match {
              case Success(query) => query.awaitTermination()
              case Failure(f) => println(f)
            }
          case Failure(f) => println(f)
        }
      case Failure(f) => println(f)
    }
  }
}
Here is the error
java.lang.IllegalStateException: s3a://... doesn't exist when compacting batch 9 (compactInterval: 10)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:173)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:172)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.NumericRange.foreach(NumericRange.scala:73)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.compact(CompactibleFileStreamLog.scala:172)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.add(CompactibleFileStreamLog.scala:156)
at org.apache.spark.sql.execution.streaming.ManifestFileCommitProtocol.commitJob(ManifestFileCommitProtocol.scala:64)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:213)
at org.apache.spark.sql.execution.streaming.FileStreamSink.addBatch(FileStreamSink.scala:123)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
18/08/10 13:04:07 ERROR MicroBatchExecution: Query [id = 2876ded4-f223-40c4-8634-0c8feec94bf6, runId = 9b9a1347-7a80-4295-bb6e-ff2de18eeaf4] terminated with error
org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:224)
at org.apache.spark.sql.execution.streaming.FileStreamSink.addBatch(FileStreamSink.scala:123)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3$$anonfun$apply$16.apply(MicroBatchExecution.scala:477)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$3.apply(MicroBatchExecution.scala:475)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:474)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:133)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:271)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:121)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:117)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: java.lang.IllegalStateException: s3a://..../_spark_metadata/0 doesn't exist when compacting batch 9 (compactInterval: 10)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4$$anonfun$apply$1.apply(CompactibleFileStreamLog.scala:174)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:173)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog$$anonfun$4.apply(CompactibleFileStreamLog.scala:172)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.NumericRange.foreach(NumericRange.scala:73)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.compact(CompactibleFileStreamLog.scala:172)
at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.add(CompactibleFileStreamLog.scala:156)
at org.apache.spark.sql.execution.streaming.ManifestFileCommitProtocol.commitJob(ManifestFileCommitProtocol.scala:64)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:213)
... 17 more

The resolution was one of two things (or both): extending Serializable on the classes, and moving them into separate files in the same namespace. I have updated the code above to reflect this.
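A related pattern that often helps (a sketch, not taken from the code above; the holder name and registry URL are illustrative): keep non-serializable helpers such as the Confluent decoder behind a @transient lazy val in an object, so the instance is never shipped with a closure and each executor JVM builds its own copy on first use.

import java.util.Properties
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroDecoder}

object DeserializerHolder extends Serializable {
  // @transient: excluded from serialization; lazy: built on first use,
  // once per JVM, instead of on the driver and then shipped to executors.
  @transient lazy val deser: KafkaAvroDecoder = {
    val props = new Properties()
    props.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, "http://URL:8081")
    new KafkaAvroDecoder(new kafka.utils.VerifiableProperties(props))
  }
}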

Just a stab: in class sparkKafka you are using 'var' to define kafkaTopic - did you mean 'val'?

Related

why does spark return "task not serializable" if i run this code?

I have written some simple code in Spark.
It takes a file location from a dataframe column and returns a string indicating whether the file exists or not.
But once I run it, it throws a "task not serializable" error.
Can someone please help me get past this error?
import java.time.LocalDate
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object filetospark {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("app1")
      .master("local")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    val path: String => String = (Path: String) => {
      val exists = fs.exists(new Path(Path))
      var result = " "
      if (exists) {
        result = "Y"
      } else {
        result = "N"
      }
      result
    }
    val PATH = udf(path)
    val config_df = spark.read.
      option("header", "true").
      option("inferSchema", "true").
      csv("pathlocation")
    val current_date = LocalDate.now()
    val instance_table_df = instance_df.withColumn("is_available", PATH(col("file_name")))
  }
}
When I run it I get an error like this:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:403)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:393)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:162)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:850)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:849)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:849)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:613)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:247)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:339)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3384)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2545)
at org.apache.spark.sql.Dataset$$anonfun$53.apply(Dataset.scala:3365)
at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3364)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2545)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2759)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:255)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:292)
at org.apache.spark.sql.Dataset.show(Dataset.scala:746)
at org.apache.spark.sql.Dataset.show(Dataset.scala:705)
at org.apache.spark.sql.Dataset.show(Dataset.scala:714)
at filetospark$.main(filetospark.scala:40)
at filetospark.main(filetospark.scala)
Caused by: java.io.NotSerializableException: org.apache.hadoop.fs.LocalFileSystem
Serialization stack:
- object not serializable (class: org.apache.hadoop.fs.LocalFileSystem, value: org.apache.hadoop.fs.LocalFileSystem@7fd3fd06)
- field (class: filetospark$$anonfun$1, name: fs$1, type: class org.apache.hadoop.fs.FileSystem)
- object (class filetospark$$anonfun$1, <function1>)
- element of array (index: 4)
- array (class [Ljava.lang.Object;, size 5)
- field (class: org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11, name: references$1, type: class [Ljava.lang.Object;)
- object (class org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$11, <function2>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:400)
... 36 more
It shows this error; could someone please explain the problem? (The Serialization stack above already names the captured field fs$1, the LocalFileSystem.) With the version below, the error goes away:
import java.time.LocalDate
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object filetospark {
  val spark = SparkSession
    .builder()
    .appName("app1")
    .master("local")
    .getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  val path: String => String = (Path: String) => {
    val exists = fs.exists(new Path(Path))
    var result = " "
    if (exists) {
      result = "Y"
    } else {
      print("N")
      result = "N"
    }
    result
  }

  def main(args: Array[String]): Unit = {
    val PATH = udf(path)
    // val newfu = udf(newfun) // newfun is not defined in this snippet
    val config_df = spark.read.
      option("header", "true").
      option("inferSchema", "true").
      csv("filepath")
    val current_date = LocalDate.now()
    val instance_table_df = instance_df.withColumn("is_available", PATH(col("file_name")))
    instance_table_df.show()
  }
}
I don't know what is happening here. That error is now cleared, but my doubt remains.
I just created the Spark session (and the FileSystem) outside the main function and it works fine, but I don't know why. If anyone knows, please post here.
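A likely explanation, since the question was left open here: a function value defined inside main captures the local fs in its closure, and Spark must serialize that closure to ship it to executors; LocalFileSystem is not serializable, hence the error. A field on a top-level object is instead looked up on the singleton in whichever JVM runs the code, so nothing non-serializable is shipped. One way to make this explicit is a holder object with a @transient lazy val (a sketch; note that new Configuration() built on an executor is an assumption and may not carry the full cluster configuration):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.functions.udf

object FsHolder extends Serializable {
  // Initialized once per JVM on first use; never serialized into a task.
  @transient lazy val fs: FileSystem = FileSystem.get(new Configuration())
}

val pathExists = udf { (p: String) =>
  if (FsHolder.fs.exists(new Path(p))) "Y" else "N"
}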

How pass Basic Authentication to Confluent Schema Registry?

I want to read data from a Confluent Cloud topic and then write to another topic.
On localhost I haven't had any major problems, but the Confluent Cloud schema registry requires some authentication settings that I don't know how to pass:
basic.auth.credentials.source=USER_INFO
schema.registry.basic.auth.user.info=:
schema.registry.url=https://xxxxxxxxxx.confluent.cloud
Below is the current code:
import com.databricks.spark.avro.SchemaConverters
import io.confluent.kafka.schemaregistry.client.{CachedSchemaRegistryClient, SchemaRegistryClient}
import io.confluent.kafka.serializers.AbstractKafkaAvroDeserializer
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.spark.sql.SparkSession
object AvroConsumer {
  private val topic = "transactions"
  private val kafkaUrl = "http://localhost:9092"
  private val schemaRegistryUrl = "http://localhost:8081"
  private val schemaRegistryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 128)
  private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)
  private val avroSchema = schemaRegistryClient.getLatestSchemaMetadata(topic + "-value").getSchema
  private var sparkSchema = SchemaConverters.toSqlType(new Schema.Parser().parse(avroSchema))

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ConfluentConsumer")
      .master("local[*]")
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    spark.udf.register("deserialize", (bytes: Array[Byte]) =>
      DeserializerWrapper.deserializer.deserialize(bytes)
    )

    val kafkaDataFrame = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaUrl)
      .option("subscribe", topic)
      .load()

    val valueDataFrame = kafkaDataFrame.selectExpr("""deserialize(value) AS message""")

    import org.apache.spark.sql.functions._
    val formattedDataFrame = valueDataFrame.select(
      from_json(col("message"), sparkSchema.dataType).alias("parsed_value"))
      .select("parsed_value.*")

    formattedDataFrame
      .writeStream
      .format("console")
      .option("truncate", false)
      .start()
      .awaitTermination()
  }

  object DeserializerWrapper {
    val deserializer = kafkaAvroDeserializer
  }

  class AvroDeserializer extends AbstractKafkaAvroDeserializer {
    def this(client: SchemaRegistryClient) {
      this()
      this.schemaRegistry = client
    }
    override def deserialize(bytes: Array[Byte]): String = {
      val genericRecord = super.deserialize(bytes).asInstanceOf[GenericRecord]
      genericRecord.toString
    }
  }
}
I think I have to pass this authentication data to CachedSchemaRegistryClient but I'm not sure if so and how.
I've finally been able to pass the properties. These are the lines that gave the solution:
import scala.collection.JavaConverters._

val restService = new RestService(schemaRegistryURL)
val props = Map(
  "basic.auth.credentials.source" -> "USER_INFO",
  "schema.registry.basic.auth.user.info" -> "secret:secret"
).asJava
val schemaRegistryClient = new CachedSchemaRegistryClient(restService, 100, props)
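For completeness, a sketch of how this plugs back into the code above (assuming the same AvroDeserializer class from the question):

// The authenticated client simply replaces the unauthenticated one;
// the rest of the pipeline (UDF registration, from_json) is unchanged.
private val kafkaAvroDeserializer = new AvroDeserializer(schemaRegistryClient)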

Kudu Client fails with exceptions after running for few days

I have a Scala/Spark/Kafka process that I run. When I first start the process I create a KuduClient object using a function I made, which I share between classes. For this job I create the KuduClient only once and let the process run continuously. I've noticed that after several days I frequently get exceptions.
I'm not really sure what to do. One option might be to create a new Kudu client every day or so, but I'm unsure of how to do that in this case; a sketch of one approach appears after the stack traces below.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.json.JSONObject
import org.apache.kudu.client.KuduClient
import org.apache.log4j.Logger
object Thing extends Serializable {
  @transient lazy val client: KuduClient = createKuduClient(config)
  @transient lazy val logger: Logger = Logger.getLogger(getClass.getName)

  def main(args: Array[String]) {
    UtilFunctions.loadConfig(args) //I send back a config object.
    UtilFunctions.loadLogger() //factory method to load logger
    val props: Map[String, String] = setKafkaProperties()
    val topic = Set(config.getString("config.TOPIC_NAME"))

    val conf = new SparkConf().setMaster("local[2]").setAppName(config.getString("config.SPARK_APP_NAME"))
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.sparkContext.setLogLevel("ERROR")
    ssc.checkpoint(config.getString("config.SPARK_CHECKPOINT_NAME"))

    // val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, props, topic)
    val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topic, props))
    val distRecordsStream = kafkaStream.map(record => (record.key(), record.value()))
    distRecordsStream.window(Seconds(10), Seconds(10))

    distRecordsStream.foreachRDD(distRecords => {
      logger.info(distRecords + " : " + distRecords.count())
      distRecords.foreach(record => {
        logger.info(record._2)
        MyClass.DoSomethingWithThisData(new JSONObject(record._2), client)
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }

  def createKuduClient(config: Config): KuduClient = {
    var client: KuduClient = null
    try {
      client = new KuduClient.KuduClientBuilder(config.getString("config.KUDU_MASTER"))
        .defaultAdminOperationTimeoutMs(config.getInt("config.KUDU_ADMIN_TIMEOUT_S") * 1000)
        .defaultOperationTimeoutMs(config.getInt("config.KUDU_OPERATION_TIMEOUT_S") * 1000)
        .build()
    }
    catch {
      case e: Throwable =>
        logger.error(e.getMessage)
        logger.error(e.getStackTrace.toString)
        Thread.sleep(10000) //try to create a new kudu client
        client = createKuduClient(config)
    }
    client //return
  }

  def setKafkaProperties(): Map[String, String] = {
    val zookeeper = config.getString("config.ZOOKEEPER")
    val offsetReset = config.getString("config.OFFSET_RESET")
    val brokers = config.getString("config.BROKERS")
    val groupID = config.getString("config.GROUP_ID")
    val deserializer = config.getString("config.DESERIALIZER")
    val autoCommit = config.getString("config.AUTO_COMMIT")
    val maxPollRecords = config.getString("config.MAX_POLL_RECORDS")
    val maxPollIntervalms = config.getString("config.MAX_POLL_INTERVAL_MS")

    val props = Map(
      "bootstrap.servers" -> brokers,
      "zookeeper.connect" -> zookeeper,
      "group.id" -> groupID,
      "key.deserializer" -> deserializer,
      "value.deserializer" -> deserializer,
      "enable.auto.commit" -> autoCommit,
      "auto.offset.reset" -> offsetReset,
      "max.poll.records" -> maxPollRecords,
      "max.poll.interval.ms" -> maxPollIntervalms)
    props
  }
}
Exceptions below. I've replaced the IP addresses with "x".
ERROR client.TabletClient: [Peer master-ip-xxx-xx-xxx-40.ec2.internal:7051] Unexpected exception from downstream on [id: 0x42ba3f4d, /xxx.xx.xxx.39:36820 => ip-xxx-xxx-xxx-40.ec2.internal/xxx.xx.xxx.40:7051]
java.lang.RuntimeException: Could not deserialize the response, incompatible RPC? Error is: step
at org.apache.kudu.client.KuduRpc.readProtobuf(KuduRpc.java:383)
at org.apache.kudu.client.Negotiator.parseSaslMsgResponse(Negotiator.java:282)
at org.apache.kudu.client.Negotiator.handleResponse(Negotiator.java:235)
at org.apache.kudu.client.Negotiator.messageReceived(Negotiator.java:229)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.timeout.ReadTimeoutHandler.messageReceived(ReadTimeoutHandler.java:184)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.oneone.OneToOneDecoder.handleUpstream(OneToOneDecoder.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:310)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline$DefaultChannelHandlerContext.sendUpstream(DefaultChannelPipeline.java:791)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:296)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.unfoldAndFireMessageReceived(FrameDecoder.java:462)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.callDecode(FrameDecoder.java:443)
at org.apache.kudu.client.shaded.org.jboss.netty.handler.codec.frame.FrameDecoder.messageReceived(FrameDecoder.java:303)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.SimpleChannelUpstreamHandler.handleUpstream(SimpleChannelUpstreamHandler.java:70)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:564)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.DefaultChannelPipeline.sendUpstream(DefaultChannelPipeline.java:559)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:268)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.Channels.fireMessageReceived(Channels.java:255)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.NioWorker.read(NioWorker.java:88)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.process(AbstractNioWorker.java:108)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:337)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:89)
at org.apache.kudu.client.shaded.org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
at org.apache.kudu.client.shaded.org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
at org.apache.kudu.client.shaded.org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I've also seen exceptions like the following after running for a while, which others seem to attribute to hitting the user's open file handle limit.
java.io.IOException: All datanodes DatanodeInfoWithStorage[xxx.xx.xxx.36:1004,DS-55c403c3-203a-4dac-b383-72fcdb686185,DISK] are bad. Aborting...
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.setupPipelineForAppendOrRecovery(DFSOutputStream.java:1465)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.processDatanodeError(DFSOutputStream.java:1236)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputSt
Does this have something to do with having too many open files? Is there a way to "purge" these handles once they reach a limit?
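One hedged way to do the periodic recreation mentioned in the question (a sketch: it reuses createKuduClient from the post above, and the daily threshold is an arbitrary choice): route all access through a holder that closes and rebuilds the client once it exceeds a maximum age, so leaked sockets and file handles get released.

import com.typesafe.config.Config // assumption: the question's getString calls suggest Typesafe Config
import org.apache.kudu.client.KuduClient

object KuduClientHolder {
  private val maxAgeMs = 24L * 60 * 60 * 1000 // rebuild roughly daily
  private var createdAt = 0L
  private var client: KuduClient = _

  // Callers ask the holder for a client instead of caching one themselves.
  def get(config: Config): KuduClient = synchronized {
    val now = System.currentTimeMillis()
    if (client == null || now - createdAt > maxAgeMs) {
      if (client != null) client.close() // releases sockets and file handles
      client = Thing.createKuduClient(config) // the builder function from the question
      createdAt = now
    }
    client
  }
}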

Error while connecting Kafka to Spark

I am trying to connect Kafka to Spark. I use kafka_2.11-0.11.0.1 and Spark 2.2.0, and I included these jar files:
kafka_2.11-0.11.0.1
kafka-clients-0.11.0.1
spark-streaming_2.11-2.2.0
spark-streaming-kafka_2.11-2.2.0
and here is my code:
import org.apache.spark._
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka
import org.apache.spark.streaming.kafka.KafkaUtils
object KafkaExample {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("KafkaExample").setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(20))

    val kafkaParams = Map("metadata.broker.list" -> "kafkaIP:9092")
    val topics = Set("logstash_log")
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    stream.print()

    ssc.checkpoint("C:/checkpoint/")
    ssc.start()
    ssc.awaitTermination()
  }
}
I get this error and couldn't find the solution anywhere:
Exception in thread "main" java.lang.NoSuchMethodError: kafka.api.TopicMetadata.errorCode()S
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1$$anonfun$4.apply(KafkaCluster.scala:127)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1$$anonfun$4.apply(KafkaCluster.scala:127)
at scala.collection.TraversableLike$$anonfun$filterImpl$1.apply(TraversableLike.scala:248)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1.apply(KafkaCluster.scala:127)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$getPartitionMetadata$1.apply(KafkaCluster.scala:125)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:346)
at org.apache.spark.streaming.kafka.KafkaCluster$$anonfun$org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers$1.apply(KafkaCluster.scala:342)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
at org.apache.spark.streaming.kafka.KafkaCluster.org$apache$spark$streaming$kafka$KafkaCluster$$withBrokers(KafkaCluster.scala:342)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitionMetadata(KafkaCluster.scala:125)
at org.apache.spark.streaming.kafka.KafkaCluster.getPartitions(KafkaCluster.scala:112)
at org.apache.spark.streaming.kafka.KafkaUtils$.getFromOffsets(KafkaUtils.scala:211)
at org.apache.spark.streaming.kafka.KafkaUtils$.createDirectStream(KafkaUtils.scala:484)
at com.defne.KafkaExample$.main(KafkaExample.scala:27)
at com.defne.KafkaExample.main(KafkaExample.scala)
Why does this occur? How can I handle this? Any help will be greatly appreciated!
Thanks.
This may help; you can modify it for your dataset and IP address. The NoSuchMethodError above typically indicates a version mismatch: the old spark-streaming-kafka integration (built against the Kafka 0.8 API) does not work with 0.11 client jars, whereas the example below uses the kafka-0-10 direct stream API.
def StreamingFromKafkaMain(): Unit = {
  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "192.168.34.216:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "use_a_separate_group_id_for_each_stream",
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )
  val topics = Array("myTopicName")
  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("KafkaTest")
  val streamingContext = new StreamingContext(sparkConf, Seconds(1))

  // Create an input direct stream
  val kafkaStream = KafkaUtils.createDirectStream[String, String](
    streamingContext,
    PreferConsistent,
    Subscribe[String, String](topics, kafkaParams)
  )

  val spark = SparkSession.builder().master("local[*]").appName("KafkaTest").getOrCreate()
  val items = kafkaStream.map(record => (record.key, record.value.split("\n")))
  val itemStatus = items.map(status => status.toString())

  items.foreachRDD(
    rddm => if (!rddm.isEmpty()) {
      //val my_dataset=StreamingFromKafkaOracleMain();
      println("Test")
      //my_dataset.show
      //val df1 = rddm.map(_.mkString(",")).map(x=> schema(x(0).toString,x(1).toInt,x(2).toString)).toDF()
      val splittedRdd = rddm.map(line => line.toString().split(","))
      println(splittedRdd.take(10))
    }
  )

  streamingContext.start()
  streamingContext.awaitTermination()
}
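Note that this example uses the newer spark-streaming-kafka-0-10 integration rather than the 0.8-based one from the question. Assuming an sbt build (the versions mirror the question's Spark 2.2.0), the dependencies would look something like:

// build.sbt (sketch): the 0-10 integration replaces spark-streaming-kafka (0.8-based)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "2.2.0",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.2.0"
)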

UnaryTransformer instance throwing ClassCastException

I have a requirement to create my own UnaryTransformer instance that accepts a DataFrame column of type Array[String] and outputs the same type. In trying to do so, I encountered a ClassCastException on Spark 2.1.0.
I've put together a sample test that shows my case.
import org.apache.spark.SparkConf
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
class MyTransformer(override val uid: String) extends UnaryTransformer[Array[String], Array[String], MyTransformer] {

  override protected def createTransformFunc: (Array[String]) => Array[String] = {
    param1 => {
      param1.foreach(println(_))
      param1
    }
  }

  override protected def outputDataType: DataType = ArrayType(StringType)

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType), s"Data type mismatch between Array[String] and provided type $inputType.")
  }

  def this() = this(Identifiable.randomUID("tester"))
}

object Tester {
  def main(args: Array[String]): Unit = {
    val config = new SparkConf().setAppName("Tester")
    implicit val sparkSession = SparkSession.builder().config(config).getOrCreate()
    import sparkSession.implicits._

    val dataframe = Seq(Array("Firstly", "F1"), Array("Driving", "S1"), Array("Ran", "T3"), Array("Fourth", "F4"),
      Array("Running", "F5"), Array("Gone", "S6")).toDF("input")

    val transformer = new MyTransformer().setInputCol("input").setOutputCol("output")
    val transformed = transformer.transform(dataframe)
    transformed.select("output").show()

    println("Complete....")
    sparkSession.close()
  }
}
Attaching the stack trace for reference
Exception in thread "main" org.apache.spark.SparkException: Failed to
execute user defined function($anonfun$createTransformFunc$1:
(array) => array) at
org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1072)
at
org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:144)
at
org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:48)
at
org.apache.spark.sql.catalyst.expressions.InterpretedProjection.apply(Projection.scala:30)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.immutable.List.foreach(List.scala:392) at
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.immutable.List.map(List.scala:296) at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$21.applyOrElse(Optimizer.scala:1078)
at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$$anonfun$apply$21.applyOrElse(Optimizer.scala:1073)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
at
org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:287)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
at
org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
at
org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:277)
at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$.apply(Optimizer.scala:1073)
at
org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation$.apply(Optimizer.scala:1072)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
at
scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
at
scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
at
scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
at
org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
at scala.collection.immutable.List.foreach(List.scala:392) at
org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
at
org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:73)
at
org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:73)
at
org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:79)
at
org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:75)
at
org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:84)
at
org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:84)
at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2791)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2112) at
org.apache.spark.sql.Dataset.take(Dataset.scala:2327) at
org.apache.spark.sql.Dataset.showString(Dataset.scala:248) at
org.apache.spark.sql.Dataset.show(Dataset.scala:636) at
org.apache.spark.sql.Dataset.show(Dataset.scala:595) at
org.apache.spark.sql.Dataset.show(Dataset.scala:604) at
Tester$.main(Tester.scala:45) at Tester.main(Tester.scala)
Caused by: java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to
[Ljava.lang.String; at
MyTransformer$$anonfun$createTransformFunc$1.apply(Tester.scala:9)
at
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:89)
at
org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:88)
at
org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1069)
... 53 more
ArrayType is represented as Seq not Array:
override protected def createTransformFunc: (Seq[String]) => Seq[String] = {
  param1 => {
    param1.foreach(println(_))
    param1
  }
}
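Presumably the class declaration's type parameters have to change to match; a sketch based on the question's class:

class MyTransformer(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], MyTransformer] {
  // createTransformFunc as above; outputDataType and validateInputType stay
  // unchanged, since the SQL-side type is still ArrayType(StringType).
}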
