Spark3.0 failed to write stream from cosmos changefeed to snowflake - apache-spark

I use:
<dependency>
<groupId>com.azure.cosmos.spark</groupId>
<artifactId>azure-cosmos-spark_3-1_2-12</artifactId>
<version>4.4.0</version>
</dependency>
to read stream from cosmos changefeed, but I run faild because of
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.execution.streaming.MetadataVersionUtil$
at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
at java.lang.ClassLoader.loadClass(ClassLoader.java:419)
at java.lang.ClassLoader.loadClass(ClassLoader.java:352)
I found that MetadataVersionUtil is a new class in spark3.1, and I can't find a jar suitable for spark3.0, such as azure-cosmos-spark_3-0_2-12.
So I clone https://github.com/Azure/azure-cosmosdb-spark.git and checkout 3.0 branch, and use mvn install to release the jar. But when I use this new jar, it failed because of:
Exception in thread "main" java.lang.ClassNotFoundException: Failed to find data source: cosmos.oltp.changeFeed. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:689)
at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:209)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:195)
at com.fti.cosmos.StreamToSnowflake$.main(StreamToSnowflake.scala:79)
at com.fti.cosmos.StreamToSnowflake.main(StreamToSnowflake.scala)
Caused by: java.lang.ClassNotFoundException: cosmos.oltp.changeFeed.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:663)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:663)
at scala.util.Failure.orElse(Try.scala:224)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:663)
... 4 more
22/02/25 17:15:56 INFO SparkContext: Invoking stop() from shutdown hook
the scala code is:
val readConfig = Map(
"spark.cosmos.accountEndpoint" -> s"${cosmosHost}:${cosmosPort}",
"spark.cosmos.accountKey" -> cosmosPassword,
"spark.cosmos.database" -> cosmosSourceDB,
"spark.cosmos.container" -> cosmosSourceCollection,
"spark.cosmos.read.partitioning.strategy" -> "Default",
"spark.cosmos.read.inferSchema.enabled" -> "false",
"spark.cosmos.changeFeed.startFrom" -> cosmosCfStartFrom,
"spark.cosmos.changeFeed.mode" -> "Incremental"
)
// init spark
val ss = SparkSession.builder
.appName(s"${this.getClass.getName}-${cosmosSourceDB}-${cosmosSourceCollection}")
.master("local")
.getOrCreate()
val df = ss.readStream.format("cosmos.oltp.changeFeed").options(readConfig).load()
val sfOptions = Map(
"sfURL" -> "",
"sfUser" -> "",
"sfRole" -> s"",
"pem_private_key" -> "",
"sfDatabase" -> s"",
"sfSchema" -> sfSchema,
"sfWarehouse" -> sfWarehouse
)
df.writeStream.trigger(Trigger.ProcessingTime("30 seconds"))
.foreachBatch((ds: DataFrame, batchId: Long) => {
ds.toDF().show(false)
ds.toDF().write
.format(SNOWFLAKE_SOURCE_NAME)
.options(sfOptions)
.option("dbtable", targetTable)
.mode(SaveMode.Append)
.save()
})
.option("checkpointLocation", f"wasbs://....")
.outputMode("append")
.start()
.awaitTermination()
Has anyone successfully synced cosmos's changefeed with spark3.0, I don't want to upgrade HdInsight to 3.1 anymore.

Related

NoSuchMethodError trying to ingest HDFS data into Elasticsearch

I'm using Spark 3.12, Scala 2.12, Hadoop 3.1.1.3.1.2-50, Elasticsearch 7.10.1 (due to license issues), Centos 7
to try an ingest json data in gzip files located on HDFS into Elasticsearch using spark streaming.
I get a
Logical Plan:
FileStreamSource[hdfs://pct/user/papago-mlops-datalake/raw/mt-log/engine=n2mt/year=2022/date=0430/hour=00]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:356)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:244)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(Lorg/apache/spark/sql/SparkSession;Lorg/apache/spark/sql/execution/QueryExecution;Lscala/Function0;)Ljava/lang/Object;
at org.elasticsearch.spark.sql.streaming.EsSparkSqlStreamingSink.addBatch(EsSparkSqlStreamingSink.scala:62)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:586)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:584)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:584)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:226)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:194)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:188)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:334)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:317)
... 1 more
ApplicationMaster host: ac3m8x2183.bdp.bdata.ai
ApplicationMaster RPC port: 39673
queue: batch
start time: 1654588583366
final status: FAILED
tracking URL: https://gemini-rm2.bdp.bdata.ai:9090/proxy/application_1654575947385_29572/
user: papago-mlops-datalake
Exception in thread "main" org.apache.spark.SparkException: Application application_1654575947385_29572 finished with failed status
at org.apache.spark.deploy.yarn.Client.run(Client.scala:1269)
at org.apache.spark.deploy.yarn.YarnClusterApplication.start(Client.scala:1627)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:904)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:198)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:228)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:137)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
using
implementation("org.elasticsearch:elasticsearch-hadoop:8.2.2")
implementation("com.typesafe:config:1.4.2")
implementation("org.apache.spark:spark-sql_2.12:3.1.2")
testImplementation("org.scalatest:scalatest_2.12:3.2.12")
testRuntimeOnly("com.vladsch.flexmark:flexmark-all:0.61.0")
compileOnly("org.apache.spark:spark-sql_2.12:3.1.2")
compileOnly("org.apache.spark:spark-core_2.12:3.1.2")
compileOnly("org.apache.spark:spark-launcher_2.12:3.1.2")
compileOnly("org.apache.spark:spark-streaming_2.12:3.1.2")
compileOnly("org.elasticsearch:elasticsearch-spark-30_2.12:8.2.2")
libraries. I tried using ES-Hadoop version 7.10.1, but ES-Spark only supports down to 7.12.0 for Spark 3.0 and I still get the same error.
My code is pretty simple
def main(args: Array[String]): Unit = {
// Set the log level to only print errors
Logger.getLogger("org").setLevel(Level.ERROR)
val spark = SparkSession
.builder()
.config(ConfigurationOptions.ES_NET_HTTP_AUTH_USER, elasticsearchUser)
.config(ConfigurationOptions.ES_NET_HTTP_AUTH_PASS, elasticsearchPass)
.config(ConfigurationOptions.ES_NODES, elasticsearchHost)
.config(ConfigurationOptions.ES_PORT, elasticsearchPort)
.appName(appName)
.master(master)
.getOrCreate()
val streamingDF: DataFrame = spark.readStream
.schema(jsonSchema)
.format("org.apache.spark.sql.execution.datasources.json.JsonFileFormat")
.load(pathToJSONResource)
streamingDF.writeStream
.outputMode(outputMode)
.format(destination)
.option("checkpointLocation", checkpointLocation)
.start(indexAndDocType)
.awaitTermination()
// Stop the session
spark.stop()
}
}
If I can't use the ES-Hadoop libraries is there another way I can go about ingesting JSON into ES from HDFS?

Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/sql/SparkSession$ : Redis Client

I am using redis client for storing my constants for a project but when I am reading executing the following command and using this into sparksession. Can any body help me out in this. its a maven project
and I am using the following maven dependency
val sparkSession=SparkSession.builder.appName(getAppName).enableHiveSupport().getOrCreate()
sparkSession
val r = new RedisClient("localhost", 6379)
def getAppName:String={
val a=r.get("xyz")
val v=new String(a.get)
logger.info("******** xyz ************"+ v)
v
}
I am getting following error:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/spark/sql/SparkSession$
at com.hpe.cdp.common.SparkCommon$.createSparkSession(abc.scala:14)
at com.hpe.cdp.common.SparkCommon$.readHiveDataDelta(abc.scala:20)
at com.hpe.cdp.common.CdpEapObjectCall$.saveDataCsvHiveDelta(def.scala:16)
at com.hpe.cdp.CdpEapObject$.main(def.scala:16)
at com.hpe.cdp.CdpEapObject.main(def.scala)
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.SparkSession$
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:355)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 5 more
<!-- https://mvnrepository.com/artifact/org.redis/scala-redis -->
<dependency>
<groupId>org.redis</groupId>
<artifactId>scala-redis_2.11</artifactId>
<version>0.0.13</version>
</dependency>

I save a DataFrame in Hbase and I get: java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/client/TableDescriptor

I created a project on Apache Spark.
Version:
scala 2.11.8
apache spark 2.3.0
apache hbase 1.2.0
hortonworks shc 1.1.0.3.1.2.0-4 (the hortonworks connector)
I need to save a simple DataFrame in an HBase table. For this I started HBase 1.2.0 in Docker container (https://github.com/zhao-y/docker-hbase-pseudo) and created the following table:
$ hbase(main):002:0> create "table1", "cf1", "cf2", "cf3", "cf4", "cf5", "cf6", "cf7", "cf8"
$ 0 row (s) in 1.4440 seconds
To save a DataFrame in Hbase I use: https://github.com/hortonworks-spark/shc
I declared the catalog exactly as in the example
I created a catalog-based dataframe
I tried to save dataframe in hbase as in example:
dataFrame.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()
Code:
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.junit.Test
class SparkTest {
case class HBaseRecord(
col0: String,
col1: Boolean,
col2: Double,
col3: Float,
col4: Int,
col5: Long,
col6: Short,
col7: String,
col8: Byte)
object HBaseRecord {
def apply(i: Int, t: String): HBaseRecord = {
val s = s"""row${"%03d".format(i)}"""
HBaseRecord(s,
i % 2 == 0,
i.toDouble,
i.toFloat,
i,
i.toLong,
i.toShort,
s"String$i: $t",
i.toByte)
}
}
#Test
def bar(): Unit = {
val sparkSession = SparkSession.builder
.appName("SparkTest")
.master("local[*]")
.config("spark.testing.memory", 2147480000)
.getOrCreate()
val data = (0 to 255).map { i => HBaseRecord(i, "extra") }
val dataFrame = sparkSession.createDataFrame(data)
dataFrame.show
dataFrame.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()
}
}
Error:
java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/client/TableDescriptor
at org.apache.spark.sql.execution.datasources.hbase.DefaultSource.createRelation(HBaseRelation.scala:63)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
at SparkTest.bar(SparkTest.scala:56)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:59)
at org.junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java:98)
at org.junit.internal.runners.MethodRoadie$2.run(MethodRoadie.java:79)
at org.junit.internal.runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java:87)
at org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:42)
at org.junit.internal.runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
at org.junit.internal.runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
at org.junit.internal.runners.JUnit4ClassRunner$1.run(JUnit4ClassRunner.java:44)
at org.junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java:27)
at org.junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
at org.junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java:42)
at org.junit.runner.JUnitCore.run(JUnitCore.java:130)
at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
at com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47)
at com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242)
at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.hbase.client.TableDescriptor
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:349)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 41 more
val sparkSession = SparkSession.builder
.appName("SparkTest")
.master("local[*]")
.config("spark.testing.memory", 2147480000)
.getOrCreate()
means you are running that in local and your hbase client jar is missing. (if its there in classpath then you can change the scope to runtime rather than compile)
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-client -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.1.4</version>
</dependency>
if you are using intellij to run locally, you can see hbase client jar is present in the .iml file.
normal way of runnning in cluster or client modes(not local) would be hbase claasspath add it to
export HBASE_CLASSPATH=$HBASE_CLASSPATH:`hbase classpath`
which will add all the hbase jars in to the classpath
to see/print all the jars in classpath below will be helpful to understand which jars in your classpath.
def urlsinclasspath(cl: ClassLoader): Array[java.net.URL] = cl match {
case null => Array()
case u: java.net.URLClassLoader => u.getURLs() ++ urlsinclasspath(cl.getParent)
case _ => urlsinclasspath(cl.getParent)
}
Caller would be...
val urls = urlsinclasspath(getClass.getClassLoader).foreach(println)

Spark streaming and kafka Missing required configuration "partition.assignment.strategy" which has no default value

I am trying to run the spark streaming application with Kafka using yarn. I am getting the following Stack trace error-
Caused by: org.apache.kafka.common.config.ConfigException: Missing required configuration "partition.assignment.strategy" which has no default value.
at org.apache.kafka.common.config.ConfigDef.parse(ConfigDef.java:124)
at org.apache.kafka.common.config.AbstractConfig.(AbstractConfig.java:48)
at org.apache.kafka.clients.consumer.ConsumerConfig.(ConsumerConfig.java:194)
at org.apache.kafka.clients.consumer.KafkaConsumer.(KafkaConsumer.java:380)
at org.apache.kafka.clients.consumer.KafkaConsumer.(KafkaConsumer.java:363)
at org.apache.kafka.clients.consumer.KafkaConsumer.(KafkaConsumer.java:350)
at org.apache.spark.streaming.kafka010.CachedKafkaConsumer.(CachedKafkaConsumer.scala:45)
at org.apache.spark.streaming.kafka010.CachedKafkaConsumer$.get(CachedKafkaConsumer.scala:194)
at org.apache.spark.streaming.kafka010.KafkaRDDIterator.(KafkaRDD.scala:252)
at org.apache.spark.streaming.kafka010.KafkaRDD.compute(KafkaRDD.scala:212)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:49)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
Here is snippet of my code of how I am creating my KafkaStream with spark stream-
val ssc = new StreamingContext(sc, Seconds(60))
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "*boorstrap_url:port*",
"security.protocol" -> "SASL_PLAINTEXT",
"sasl.kerberos.service.name" -> "kafka",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "annotation-test",
//Tried commenting and uncommenting this property
//"partition.assignment.strategy"->"org.apache.kafka.clients.consumer.RangeAssignor",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean))
val topics = Array("*topic-name*")
val kafkaStream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams))
val valueKafka = kafkaStream.map(record => record.value())
I have gone through the following post -
https://issues.apache.org/jira/browse/KAFKA-4547
Pyspark Structured Streaming Kafka configuration error
According to this I have updated my kafka util jar in my fat jar to 0.10.2.0 version from 0.10.1.0 packaged by default from spark-stream-kafka-jar as transient dependency. Also my job is working fine when I am running it on single node by setting master as local. I am running spark 2.3.1 version.
Add kafka-clients-*.jar to your spark jar folder. kafka-clients-*.jar is in kafka-*/lib directory.

Spark Streaming works in Local mode but "stages fail" with "could not initialize class" in Client/Cluster mode

I have a Spark + Kafka streaming app that runs fine in Local mode, however when I try to launch it in yarn + local/cluster mode I get several errors like below
The first error I always see is
WARN TaskSetManager: Lost task 1.1 in stage 3.0 (TID 9, ip-xxx-24-129-36.ec2.internal, executor 2): java.lang.NoClassDefFoundError: Could not initialize class TestStreaming$
at TestStreaming$$anonfun$main$1$$anonfun$apply$1.apply(TestStreaming.scala:60)
at TestStreaming$$anonfun$main$1$$anonfun$apply$1.apply(TestStreaming.scala:59)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:917)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1944)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Next error I get is
ERROR JobScheduler: Error running job streaming job 1541786030000 ms.0
followed by
java.lang.NoClassDefFoundError: Could not initialize class
Spark version 2.1.0
Scala 2.11
Kafka version 10
Part of my code when I launch it loads the config in main. I pass this config file at runtime with -conf AFTER the jar (see below). I'm not quite sure but must I pass this config to the executors as well?
I launch my streaming app with the command below. One shows Local mode, the other shows client mode.
runJar = myProgram.jar
loggerPath=/path/to/log4j.properties
mainClass=TestStreaming
logger=-DPHDTKafkaConsumer.app.log4j=$loggerPath
confFile=application.conf
-----------Local Mode----------
SPARK_KAFKA_VERSION=0.10 nohup spark2-submit --driver-java-options
"$logger" --conf "spark.executor.extraJavaOptions=$logger" --class
$mainClass --master local[4] $runJar -conf $confFile &
-----------Client Mode----------
SPARK_KAFKA_VERSION=0.10 nohup spark2-submit --master yarn --conf >"spark.executor.extraJavaOptions=$logger" --conf >"spark.driver.extraJavaOptions=$logger" --class $mainClass $runJar -conf >$confFile &
Here is my code below. Been battling this for over a week now.
import Util.UtilFunctions
import UtilFunctions.config
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.log4j.Logger
object TestStreaming extends Serializable {
#transient lazy val logger: Logger = Logger.getLogger(getClass.getName)
def main(args: Array[String]) {
logger.info("Starting app")
UtilFunctions.loadConfig(args)
UtilFunctions.loadLogger()
val props: Map[String, String] = setKafkaProperties()
val topic = Set(config.getString("config.TOPIC_NAME"))
val conf = new SparkConf()
.setAppName(config.getString("config.SPARK_APP_NAME"))
.set("spark.streaming.backpressure.enabled", "true")
val spark = SparkSession.builder()
.config(conf)
.getOrCreate()
val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
ssc.sparkContext.setLogLevel("INFO")
ssc.checkpoint(config.getString("config.SPARK_CHECKPOINT_NAME"))
val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topic, props))
val distRecordsStream = kafkaStream.map(record => (record.key(), record.value()))
distRecordsStream.window(Seconds(10), Seconds(10))
distRecordsStream.foreachRDD(rdd => {
if(!rdd.isEmpty()) {
rdd.foreach(record => {
println(record._2) //value from kafka
})
}
})
ssc.start()
ssc.awaitTermination()
ssc.stop()
}
def setKafkaProperties(): Map[String, String] = {
val deserializer = "org.apache.kafka.common.serialization.StringDeserializer"
val zookeeper = config.getString("config.ZOOKEEPER")
val offsetReset = config.getString("config.OFFSET_RESET")
val brokers = config.getString("config.BROKERS")
val groupID = config.getString("config.GROUP_ID")
val autoCommit = config.getString("config.AUTO_COMMIT")
val maxPollRecords = config.getString("config.MAX_POLL_RECORDS")
val maxPollIntervalms = config.getString("config.MAX_POLL_INTERVAL_MS")
val props = Map(
"bootstrap.servers" -> brokers,
"zookeeper.connect" -> zookeeper,
"group.id" -> groupID,
"key.deserializer" -> deserializer,
"value.deserializer" -> deserializer,
"enable.auto.commit" -> autoCommit,
"auto.offset.reset" -> offsetReset,
"max.poll.records" -> maxPollRecords,
"max.poll.interval.ms" -> maxPollIntervalms)
props
}
}

Resources