Cassandra Connector fails when run under Spark 2.3 on Kubernetes - apache-spark

I'm trying to use the connector, which I've used a bunch of times in the past super successfully, with the new Spark 2.3 native Kubernetes support and am running into a lot of trouble.
I have a super simple job that looks like this:
package io.rhom
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.cassandra._
import com.datastax.spark.connector.cql.CassandraConnectorConf
import com.datastax.spark.connector.rdd.ReadConf
/** Computes an approximation to pi */
object BackupLocations {
def main(args: Array[String]) {
val spark = SparkSession
.builder
.appName("BackupLocations")
.getOrCreate()
spark.sparkContext.hadoopConfiguration.set(
"fs.defaultFS",
"wasb://<snip>"
)
spark.sparkContext.hadoopConfiguration.set(
"fs.azure.account.key.rhomlocations.blob.core.windows.net",
"<snip>"
)
val df = spark
.read
.format("org.apache.spark.sql.cassandra")
.options(Map( "table" -> "locations", "keyspace" -> "test"))
.load()
df.write
.mode("overwrite")
.format("com.databricks.spark.avro")
.save("wasb://<snip>")
spark.stop()
}
}
which I'm building under SBT with Scala 2.11 and packaging with a Dockerfile that looks like this:
FROM timfpark/spark:20180305
COPY core-site.xml /opt/spark/conf
RUN mkdir -p /opt/spark/jars
COPY target/scala-2.11/rhom-backup-locations_2.11-0.1.0-SNAPSHOT.jar /opt/spark/jars
and then executing with:
bin/spark-submit --master k8s://blue-rhom-io.eastus2.cloudapp.azure.com:443 \
--deploy-mode cluster \
--name backupLocations \
--class io.rhom.BackupLocations \
--conf spark.executor.instances=2 \
--conf spark.cassandra.connection.host=10.1.0.10 \
--conf spark.kubernetes.container.image=timfpark/rhom-backup-locations:20180306v12 \
--jars https://dl.bintray.com/spark-packages/maven/datastax/spark-cassandra-connector/2.0.3-s_2.11/spark-cassandra-connector-2.0.3-s_2.11.jar,http://central.maven.org/maven2/org/apache/hadoop/hadoop-azure/2.7.2/hadoop-azure-2.7.2.jar,http://central.maven.org/maven2/com/microsoft/azure/azure-storage/3.1.0/azure-storage-3.1.0.jar,http://central.maven.org/maven2/com/databricks/spark-avro_2.11/4.0.0/spark-avro_2.11-4.0.0.jar \
local:///opt/spark/jars/rhom-backup-locations_2.11-0.1.0-SNAPSHOT.jar
all of this works except for the Cassandra connection piece, which eventually fails with:
2018-03-07 01:19:38 WARN TaskSetManager:66 - Lost task 0.0 in stage 0.0 (TID 0, 10.4.0.46, executor 1): org.apache.spark.SparkException: Task failed while writing rows.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:285)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:197)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:196)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Exception during preparation of SELECT "user_id", "timestamp", "accuracy", "altitude", "altitude_accuracy", "course", "features", "latitude", "longitude", "source", "speed" FROM "rhom"."locations" WHERE token("user_id") > ? AND token("user_id") <= ? ALLOW FILTERING: org/apache/spark/sql/catalyst/package$ScalaReflectionLock$
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.createStatement(CassandraTableScanRDD.scala:323)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.com$datastax$spark$connector$rdd$CassandraTableScanRDD$$fetchTokenRange(CassandraTableScanRDD.scala:339)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$17.apply(CassandraTableScanRDD.scala:367)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$17.apply(CassandraTableScanRDD.scala:367)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at com.datastax.spark.connector.util.CountingIterator.hasNext(CountingIterator.scala:12)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:380)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:269)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:267)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:272)
... 8 more
Caused by: java.lang.NoClassDefFoundError: org/apache/spark/sql/catalyst/package$ScalaReflectionLock$
at org.apache.spark.sql.catalyst.ReflectionLock$.<init>(ReflectionLock.scala:5)
at org.apache.spark.sql.catalyst.ReflectionLock$.<clinit>(ReflectionLock.scala)
at com.datastax.spark.connector.types.TypeConverter$.<init>(TypeConverter.scala:73)
at com.datastax.spark.connector.types.TypeConverter$.<clinit>(TypeConverter.scala)
at com.datastax.spark.connector.types.BigIntType$.converterToCassandra(PrimitiveColumnType.scala:50)
at com.datastax.spark.connector.types.BigIntType$.converterToCassandra(PrimitiveColumnType.scala:46)
at com.datastax.spark.connector.types.ColumnType$.converterToCassandra(ColumnType.scala:231)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$11.apply(CassandraTableScanRDD.scala:312)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD$$anonfun$11.apply(CassandraTableScanRDD.scala:312)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at com.datastax.spark.connector.rdd.CassandraTableScanRDD.createStatement(CassandraTableScanRDD.scala:312)
... 23 more
Caused by: java.lang.ClassNotFoundException: org.apache.spark.sql.catalyst.package$ScalaReflectionLock$
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 41 more
2018-03-07 01:19:38 INFO TaskSetManager:54 - Starting task 0.1 in stage 0.0 (TID 3, 10.4.0.46, executor 1, partition 0, ANY, 9486 bytes)
I've tried every thing I can possibly think of to resolve this - anyone have any ideas? Is this possibly caused by another unrelated issue?

It turns out that version 2.0.7 of the Datastax Cassandra Connector does not support Spark 2.3 currently. I opened a JIRA ticket on Datastax's site for this and hopefully it will be addressed soon.

Related

Apache Spark Word2Vec throws org.apache.spark.shuffle.FetchFailedException: requirement failed

I run spark word2vec on a 2G of data with the following config using pyspark 3.0.0.
spark = SparkSession \
.builder \
.appName("word2vec") \
.master("local[*]") \
.config("spark.driver.memory", "32g") \
.config("spark.sql.execution.arrow.pyspark.enabled", "true") \
.getOrCreate()
And with running code simply like following,
sentence = sample_corpus.withColumn('sentences',f.split(sample_corpus.sentences, ',')).select('sentences')
word2vec = Word2Vec(vectorSize=300, inputCol="sentences", outputCol="result", minCount=10)
model = word2vec.fit(sentence)
However, it will throw following error during the computation which does not give many useful information for debugging.
FetchFailed(BlockManagerId(driver, fedora, 34105, None), shuffleId=2, mapIndex=0, mapId=73, reduceId=0, message=
org.apache.spark.shuffle.FetchFailedException: requirement failed
at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:748)
at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:663)
at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:70)
at org.apache.spark.util.CompletionIterator.next(CompletionIterator.scala:29)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:484)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:490)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.util.CompletionIterator.hasNext(CompletionIterator.scala:31)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at org.apache.spark.util.collection.ExternalAppendOnlyMap.insertAll(ExternalAppendOnlyMap.scala:155)
at org.apache.spark.Aggregator.combineCombinersByKey(Aggregator.scala:50)
at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:110)
at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:313)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:127)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.IllegalArgumentException: requirement failed
at scala.Predef$.require(Predef.scala:268)
at org.apache.spark.storage.ShuffleBlockFetcherIterator$SuccessFetchResult.(ShuffleBlockFetcherIterator.scala:981)
at org.apache.spark.storage.ShuffleBlockFetcherIterator.fetchLocalBlocks(ShuffleBlockFetcherIterator.scala:422)
at org.apache.spark.storage.ShuffleBlockFetcherIterator.initialize(ShuffleBlockFetcherIterator.scala:536)
at org.apache.spark.storage.ShuffleBlockFetcherIterator.(ShuffleBlockFetcherIterator.scala:171)
at org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:83)
... 14 more
)
My machine has 64g memory. I wonder what is the cause here and how can I fix it. Thank you.

Kafka-pyspark Streaming: KafkaException: Failed to construct kafka consumer

I am trying to subscribe to a Kafka topic through pyspark with the following code:
spark = SparkSession.builder.appName("Spark Structured Streaming from Kafka").getOrCreate()
lines = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("kafka.partition.assignment.strategy","range").option("subscribe", "test-events").load()
words = lines.select(explode(split(lines.value, " ")).alias("word"))
wordCounts = words.groupBy("word").count()
query = wordCounts.writeStream.outputMode("complete").format("console").start()
query.awaitTermination()
and using the following command:
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 test_events.py
and versions for spark, kafka, java and scala:
spark=2.4.0
kafka=2.12-2.3.0
scala=2.11.12
openJDK=1.8.0_221
I keep getting the following errors:
Current State: ACTIVE
Thread State: RUNNABLE
Logical Plan:
Aggregate [word#26], [word#26, count(1) AS count#30L]
+- Project [word#26]
+- Generate explode(split(cast(value#8 as string), )), false, [word#26]
+- StreamingExecutionRelation KafkaV2[Subscribe[test-events]], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:295)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:189)
Caused by: org.apache.kafka.common.KafkaException: Failed to construct kafka consumer
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:827)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:629)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:610)
at org.apache.spark.sql.kafka010.SubscribeStrategy.createConsumer(ConsumerStrategy.scala:62)
at org.apache.spark.sql.kafka010.KafkaOffsetReader.consumer(KafkaOffsetReader.scala:85)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$fetchLatestOffsets$1$$anonfun$apply$9.apply(KafkaOffsetReader.scala:199)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$fetchLatestOffsets$1$$anonfun$apply$9.apply(KafkaOffsetReader.scala:197)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$org$apache$spark$sql$kafka010$KafkaOffsetReader$$withRetriesWithoutInterrupt$1.apply$mcV$sp(KafkaOffsetReader.scala:288)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$org$apache$spark$sql$kafka010$KafkaOffsetReader$$withRetriesWithoutInterrupt$1.apply(KafkaOffsetReader.scala:287)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$org$apache$spark$sql$kafka010$KafkaOffsetReader$$withRetriesWithoutInterrupt$1.apply(KafkaOffsetReader.scala:287)
at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
at org.apache.spark.sql.kafka010.KafkaOffsetReader.org$apache$spark$sql$kafka010$KafkaOffsetReader$$withRetriesWithoutInterrupt(KafkaOffsetReader.scala:286)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$fetchLatestOffsets$1.apply(KafkaOffsetReader.scala:197)
at org.apache.spark.sql.kafka010.KafkaOffsetReader$$anonfun$fetchLatestOffsets$1.apply(KafkaOffsetReader.scala:197)
at org.apache.spark.sql.kafka010.KafkaOffsetReader.runUninterruptibly(KafkaOffsetReader.scala:255)
at org.apache.spark.sql.kafka010.KafkaOffsetReader.fetchLatestOffsets(KafkaOffsetReader.scala:196)
at org.apache.spark.sql.kafka010.KafkaMicroBatchReader$$anonfun$getOrCreateInitialPartitionOffsets$1.apply(KafkaMicroBatchReader.scala:195)
at org.apache.spark.sql.kafka010.KafkaMicroBatchReader$$anonfun$getOrCreateInitialPartitionOffsets$1.apply(KafkaMicroBatchReader.scala:190)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.kafka010.KafkaMicroBatchReader.getOrCreateInitialPartitionOffsets(KafkaMicroBatchReader.scala:190)
at org.apache.spark.sql.kafka010.KafkaMicroBatchReader.org$apache$spark$sql$kafka010$KafkaMicroBatchReader$$initialPartitionOffsets$lzycompute(KafkaMicroBatchReader.scala:83)
at org.apache.spark.sql.kafka010.KafkaMicroBatchReader.org$apache$spark$sql$kafka010$KafkaMicroBatchReader$$initialPartitionOffsets(KafkaMicroBatchReader.scala:83)
at org.apache.spark.sql.kafka010.KafkaMicroBatchReader.setOffsetRange(KafkaMicroBatchReader.scala:87)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1$$anonfun$5$$anonfun$apply$2.apply$mcV$sp(MicroBatchExecution.scala:353)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1$$anonfun$5$$anonfun$apply$2.apply(MicroBatchExecution.scala:353)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1$$anonfun$5$$anonfun$apply$2.apply(MicroBatchExecution.scala:353)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1$$anonfun$5.apply(MicroBatchExecution.scala:349)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1$$anonfun$5.apply(MicroBatchExecution.scala:341)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1.apply$mcZ$sp(MicroBatchExecution.scala:341)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1.apply(MicroBatchExecution.scala:337)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch$1.apply(MicroBatchExecution.scala:337)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.withProgressLocked(MicroBatchExecution.scala:554)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$constructNextBatch(MicroBatchExecution.scala:337)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:183)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:166)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:166)
at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:351)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:58)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:166)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:160)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:279)
... 1 more
Caused by: org.apache.kafka.common.KafkaException: range ClassNotFoundException exception occurred
at org.apache.kafka.common.config.AbstractConfig.getConfiguredInstances(AbstractConfig.java:425)
at org.apache.kafka.common.config.AbstractConfig.getConfiguredInstances(AbstractConfig.java:400)
at org.apache.kafka.common.config.AbstractConfig.getConfiguredInstances(AbstractConfig.java:387)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:772)
... 50 more
Caused by: java.lang.ClassNotFoundException: range
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.kafka.common.utils.Utils.loadClass(Utils.java:348)
at org.apache.kafka.common.utils.Utils.newInstance(Utils.java:337)
at org.apache.kafka.common.config.AbstractConfig.getConfiguredInstances(AbstractConfig.java:423)
... 53 more
During handling of the above exception, another exception occurred:
pyspark.sql.utils.StreamingQueryException: 'Failed to construct kafka consumer\n=== Streaming Query ===\nIdentifier: [id = 671c0c25-2f29-49f9-8698-c59a89626da7, runId = 37b4d397-4338-4416-a521-384c8853e99b]\nCurrent Committed Offsets: {}\nCurrent Available Offsets: {}\n\nCurrent State: ACTIVE\nThread State: RUNNABLE\n\nLogical Plan:\nAggregate [word#26], [word#26, count(1) AS count#30L]\n+- Project [word#26]\n +- Generate explode(split(cast(value#8 as string), )), false, [word#26]\n +- StreamingExecutionRelation KafkaV2[Subscribe[test-events]], [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13]\n'
2020-02-07 10:03:38 INFO SparkContext:54 - Invoking stop() from shutdown hoo
There are multiple similar questions online but no answer has worked for me so far.
I have also tried the above with spark 2.4.4 with the following:
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 test_events.py
but I keep getting the same errors.
Try changing the kafka.partition.assignment.strategy to roundrobin from range and see if it works.
lines = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("kafka.partition.assignment.strategy","roundrobin").option("subscribe", "test-events").load()
If it doesnt works even after that then try adding kafka-clients-0.10.0.1.jar while submitting the spark job.
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 --jars local:///root/sources/jars/kafka-clients-0.10.0.1.jar --driver-class-path local:///root/sources/jars/kafka-clients-0.10.0.1.jar test_events.py
Solved with the following:
kafka version 2.12-2.2.0
spark 2.4.0-bin-hadoop2.7
scala 2.11.12
java.lang.ClassNotFoundException: range
Unless you explicitly need the assignment strategy, then remove the option.
Otherwise, it must be the fully qualified Java class name
This error can also be drawn when you provide a faulty value for kafka.bootstrap.servers. This could be a non-existent broker/port, or even a broker list in list form, as opposed to string form. Meaning, ["broker1:9092", "broker2:9092"] instead of "broker1:9092,broker2:9092".
Depending on where you are running the code, the true cause of the error can be hidden, as well.
Here's the error in Jupyter
StreamingQueryException: Failed to construct kafka consumer
=== Streaming Query ===
Identifier: [id = 39eb0e9d-9487-4838-9d15-241645a04cb6, runId = 763acdcb-bc05-4428-87e1-7b56ae736423]
Current Committed Offsets: {KafkaV2[Subscribe[fd]]: {"fd":{"2":4088,"1":4219,"0":4225}}}
Current Available Offsets: {KafkaV2[Subscribe[fd]]: {"fd":{"2":4088,"1":4219,"0":4225}}}
Current State: ACTIVE
Thread State: RUNNABLE
Logical Plan:
WriteToMicroBatchDataSource org.apache.spark.sql.kafka010.KafkaStreamingWrite#457e8cfa
+- StreamingDataSourceV2Relation [key#7, value#8, topic#9, partition#10, offset#11L, timestamp#12, timestampType#13], org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaScan#2b34a4 79, KafkaV2[Subscribe[fd]]
No mention of any problems with the broker list... Now here's the same error via spark-submit:
2021-08-13 20:30:44,377 WARN kafka010.KafkaOffsetReaderConsumer: Error in attempt 3 getting Kafka offsets:
org.apache.kafka.common.KafkaException: Failed to construct kafka consumer
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:823)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:632)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:613)
at org.apache.spark.sql.kafka010.SubscribeStrategy.createConsumer(ConsumerStrategy.scala:107)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.consumer(KafkaOffsetReaderConsumer.scala:82)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.$anonfun$partitionsAssignedToConsumer$2(KafkaOffsetReaderConsumer.scala:533)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.$anonfun$withRetriesWithoutInterrupt$1(KafkaOffsetReaderConsumer.scala:578)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.UninterruptibleThread.runUninterruptibly(UninterruptibleThread.scala:77)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.withRetriesWithoutInterrupt(KafkaOffsetReaderConsumer.scala:577)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.$anonfun$partitionsAssignedToConsumer$1(KafkaOffsetReaderConsumer.scala:531)
at org.apache.spark.util.UninterruptibleThreadRunner.runUninterruptibly(UninterruptibleThreadRunner.scala:48)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.partitionsAssignedToConsumer(KafkaOffsetReaderConsumer.scala:531)
at org.apache.spark.sql.kafka010.KafkaOffsetReaderConsumer.fetchLatestOffsets(KafkaOffsetReaderConsumer.scala:311)
at org.apache.spark.sql.kafka010.KafkaMicroBatchStream.latestOffset(KafkaMicroBatchStream.scala:87)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$constructNextBatch$3(MicroBatchExecution.scala:394)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$constructNextBatch$2(MicroBatchExecution.scala:385)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238)
at scala.collection.immutable.Map$Map1.foreach(Map.scala:128)
at scala.collection.TraversableLike.map(TraversableLike.scala:238)
at scala.collection.TraversableLike.map$(TraversableLike.scala:231)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$constructNextBatch$1(MicroBatchExecution.scala:382)
at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.withProgressLocked(MicroBatchExecution.scala:613)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.constructNextBatch(MicroBatchExecution.scala:378)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:211)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:357)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:355)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:68)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:194)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:188)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:334)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:317)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:244)
Important part!
Caused by: org.apache.kafka.common.config.ConfigException: Invalid url in bootstrap.servers: ['192.168.1.162:9092'
at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:59)
at org.apache.kafka.clients.ClientUtils.parseAndValidateAddresses(ClientUtils.java:48)
at org.apache.kafka.clients.consumer.KafkaConsumer.<init>(KafkaConsumer.java:734)
... 41 more
Change kafka.bootstrap.servers from ["192.168.1.162:9092","192.168.1.161:9092","192.168.1.160:9092"] to "192.168.1.162:9092,192.168.1.161:9092,192.168.1.160:9092" and all is well.
Confirm by using kafkacat to ensure that your broker is where you are saying it is.
e.g. kafkacat -C -b 192.168.1.162:9092,192.168.1.161:9092 -t fd
Version Info:
Spark 3.1.2
PySpark 3.1.1
Key .jars:
sparkSesh = SparkSession.builder.config("spark.driver.extraClassPath", "/home/username/jars/spark-sql-kafka-0-10_2.12-3.1.2.jar,/home/username/jars/commons-pool2-2.11.0.jar")\ .appName("Kafka to Stream") \ .master("local[*]").getOrCreate()

Apache Ignite Spark Integration persists data to Ignite error

I want to persist the DataFrame data to the ignite table, and native debugging is normal. When running on spark on yarn, it always reports an error.
df.write
.format(IgniteDataFrameSettings.FORMAT_IGNITE)
.option(IgniteDataFrameSettings.OPTION_CONFIG_FILE, "META-INF/default-config.xml")
.option(IgniteDataFrameSettings.OPTION_STREAMER_ALLOW_OVERWRITE, true)
.option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS, "primary_key")
.option(IgniteDataFrameSettings.OPTION_STREAMER_FLUSH_FREQUENCY, 10000)
.option(IgniteDataFrameSettings.OPTION_TABLE, "tb_name")
.option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PARAMETERS, "template=partitioned,backups=0,affinityKey=groupcode")
.mode(SaveMode.Overwrite)
.save()
This is spark config
/data/spark-2.3.0-bin-hadoop2.7/bin/spark-submit\
--queue default\
--class cn.com.DataProcess\
--name DataProcess\
--master yarn\
--deploy-mode client \
--conf spark.default.parallelism=20\
--conf spark.executor.extraClassPath=/data/ignite/*\
--conf spark.driver.extraClassPath=/data/ignite/*\
--conf spark.driver.extraJavaOptions=-Dfile.encoding=utf8\
--conf spark.executor.extraJavaOptions=-Dfile.encoding=utf8\
--num-executors 6\
--executor-memory 1G\
--total-executor-cores 6\
--jars /data/ignite/service-2.0-SNAPSHOT-jar-with-dependencies.jar /data/ignite/service-2.0-SNAPSHOT.jar
Error stack:
Job aborted due to stage failure: Task 6 in stage 3.0 failed 4 times, most recent failure: Lost task 6.3 in stage 3.0 (TID 451, hadoop-slave2, executor 4): java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:347)
at scala.None$.get(Option.scala:345)
at org.apache.ignite.spark.impl.QueryHelper$.org$apache$ignite$spark$impl$QueryHelper$$savePartition(QueryHelper.scala:155)
at org.apache.ignite.spark.impl.QueryHelper$$anonfun$saveTable$1.apply(QueryHelper.scala:117)
at org.apache.ignite.spark.impl.QueryHelper$$anonfun$saveTable$1.apply(QueryHelper.scala:116)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1$$anonfun$apply$29.apply(RDD.scala:929)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Which friend has encountered this problem, please help me!
I found the source code based on the error message and judged that it should be the error suggested here。
private def savePartition(iterator: Iterator[Row],
insertQry: String,
tblName: String,
ctx: IgniteContext,
streamerAllowOverwrite: Option[Boolean],
streamerFlushFrequency: Option[Long],
streamerPerNodeBufferSize: Option[Int],
streamerPerNodeParallelOperations: Option[Int]
): Unit = {
**val tblInfo = sqlTableInfo[Any, Any](ctx.ignite(), tblName).get**
val streamer = ctx.ignite().dataStreamer(tblInfo._1.getName)
}
But I don't understand why this happens. Both cache name and table name exist. Is my code used incorrectly?

shc-core: NoSuchMethodError org.apache.hadoop.hbase.client.Put.addColumn

I try to use shc-core to save spark dataframe into hbase via spark.
My versions:
hbase: 1.1.2.2.6.4.0-91
spark: 1.6
scala: 2.10
shc: 1.1.1-1.6-s_2.10
hdp: 2.6.4.0-91
Configuration looks like that:
val schema_array = s"""{"type": "array", "items": ["string","null"]}""".stripMargin
def catalog: String = s"""{
|"table":{"namespace":"default", "name":"tblename"},
|"rowkey":"id",
|"columns":{
|"id":{"cf":"rowkey", "col":"id", "type":"string"},
|"col1":{"cf":"data", "col":"col1", "avro":"schema_array"}
|}
|}""".stripMargin
df
.write
.options(Map(
"schema_array"-> schema_array,
HBaseTableCatalog.tableCatalog -> catalog,
HBaseTableCatalog.newTable -> "5"
))
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()
Sometimes it works fine as expected and creates table and saves all the data into hbase. But sometimes just fail with following error:
Lost task 35.0 in stage 9.0 (TID 301, host): java.lang.NoSuchMethodError: org.apache.hadoop.hbase.client.Put.addColumn([B[B[B)Lorg/apache/hadoop/hbase/client/Put;
at org.apache.spark.sql.execution.datasources.hbase.HBaseRelation$$anonfun$org$apache$spark$sql$execution$datasources$hbase$HBaseRelation$$convertToPut$1$1.apply(HBaseRelation.scala:211)
at org.apache.spark.sql.execution.datasources.hbase.HBaseRelation$$anonfun$org$apache$spark$sql$execution$datasources$hbase$HBaseRelation$$convertToPut$1$1.apply(HBaseRelation.scala:210)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at org.apache.spark.sql.execution.datasources.hbase.HBaseRelation.org$apache$spark$sql$execution$datasources$hbase$HBaseRelation$$convertToPut$1(HBaseRelation.scala:210)
at org.apache.spark.sql.execution.datasources.hbase.HBaseRelation$$anonfun$insert$1.apply(HBaseRelation.scala:219)
at org.apache.spark.sql.execution.datasources.hbase.HBaseRelation$$anonfun$insert$1.apply(HBaseRelation.scala:219)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply$mcV$sp(PairRDDFunctions.scala:1112)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12$$anonfun$apply$4.apply(PairRDDFunctions.scala:1111)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1277)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1119)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1$$anonfun$12.apply(PairRDDFunctions.scala:1091)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:247)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Any ideas?
That was actually a class path issue - I've got two different versions of hbase client.

Spark 2.0 + Kryo Serializer + Avro -> NullPointerException?

I have the simple pyspark program:
from pyspark import SQLContext
from pyspark import SparkConf
from pyspark import SparkContext
if __name__ == "__main__":
spark_settings = {
"spark.serializer": 'org.apache.spark.serializer.KryoSerializer'
}
conf = SparkConf()
conf.setAll(spark_settings.items())
spark_context = SparkContext(appName="test app", conf=conf)
spark_sql_context = SQLContext(spark_context)
source_path = "s3n://my_bucket/data.avro"
data_frame = spark_sql_context.read.load(source_path, format="com.databricks.spark.avro")
# The schema comes back correctly.
data_frame.printSchema()
# This count() call fails. A call to head() triggers the same error.
data_frame.count()
I run with
$SPARK_HOME/bin/spark-submit --master yarn \
--packages com.databricks:spark-avro_2.11:3.0.0 \
bug_isolation.py
It fails with the following exception and stack trace.
If I switch to --master local it works. If I disable the KryoSerializer option, it works. Or if I use a Parquet source rather than an Avro source it works.
The combination of using --master yarn and the KryoSerializer and an Avro source triggers the exception and stack trace listed below.
I suspect I may need to manually register some Avro plugin classes with the KryoSerializer for it to work? Which classes would I need to register.
File "/usr/lib/spark/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py", line 312, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o58.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 0.0 failed 4 times, most recent failure: Lost task 3.3 in stage 0.0 (TID 9, ip-172-31-97-24.us-west-2.compute.internal): java.lang.NullPointerException
at com.databricks.spark.avro.DefaultSource$$anonfun$buildReader$1.apply(DefaultSource.scala:151)
at com.databricks.spark.avro.DefaultSource$$anonfun$buildReader$1.apply(DefaultSource.scala:143)
at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(fileSourceInterfaces.scala:279)
at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(fileSourceInterfaces.scala:263)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:116)
at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:91)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.agg_doAggregateWithoutKey$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)

Resources