SPARK HBASE issue - apache-spark

SPARK HBASE issue - apache-spark

Spark job is stopped due to following error and starts only after multiple attempts:
Caused by: java.io.InvalidClassException: org.apache.hadoop.hbase.spark.HBaseContext; local class incompatible: stream classdesc serialVersionUID = -5686505108908438419, local class serialVersionUID = -6879194698097628128
at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:616)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1843)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1713)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2000)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2245)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2169)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2027)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1535)
Somebody please help.

Related

problem with mapgroupwithstate spark structured streaming

I have json records in kafka and I want to aggregate them with spark structured streaming.
sample record:
{"MSISDN":"09363650193","CALL_PARTNER":"1000","DURATION":20,"IMSI":"42311","CDR_TYPE":"1","SEQ_NO":10001,"RECORD_DATE":"2022-08-09 11:30:12","RECEIVED_AT":"2022-08-10 11:30:12"}
I want to have sum of "DURATION" for the last 10 elements of every "MSISDN", The output is like this:
"MSISDN":"09363650193","Total_Duration":60
here is my code:
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType,TimestampType}
import org.apache.spark.sql.functions.{col,from_json}
import spark.sqlContext.implicits._
import scala.collection.immutable.Queue
import java.sql.Timestamp
import org.apache.spark.sql.streaming.GroupState
import org.apache.spark.sql.streaming.GroupStateTimeout
val readStream = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "kafka1:9092").option("subscribe", "cs_cdr").option("startingOffsets", "earliest").load()
val df = readStream.selectExpr("CAST(value AS STRING)")
val schema = new StructType()
.add("MSISDN", StringType)
.add("CALL_PARTNER", StringType)
.add("DURATION", IntegerType)
.add("IMSI", StringType)
.add("CDR_TYPE", StringType)
.add("SEQ_NO", IntegerType)
.add("RECORD_DATE", TimestampType)
.add("RECEIVED_AT",StringType)
val dfJSON = df.withColumn("jsonData",from_json(col("value"),schema)).select("jsonData.*")
case class CallEvent(MSISDN: String,
CALL_PARTNER: String,
DURATION: Int,
IMSI: String,
CDR_TYPE: String,
SEQ_NO: String,
RECORD_DATE:String,
RECEIVED_AT:String)
val CallEvents=dfJSON.as[CallEvent]
case class FIFOBuffer[T](
capacity: Int, data: Queue[T] = Queue.empty
) extends Serializable {
def add(element: T): FIFOBuffer[T] =
this.copy(data = data.enqueue(element).take(capacity))
def get: List[T] = data.toList
def size: Int = data.size
}
case class EventSumDuration(MSISDN: String,
Total_Duration: Double)
def mappingFunction(
key: String,
values: Iterator[CallEvent],
state: GroupState[FIFOBuffer[CallEvent]]
): EventSumDuration = {
// the size of the window
val ElementCountWindowSize = 10
// get current state or create a new one if there's no previous state
val currentState = state.getOption
.getOrElse(
new FIFOBuffer[CallEvent](ElementCountWindowSize)
)
// enrich the state with the new events
val updatedState = values.foldLeft(currentState) {
case (st, ev) => st.add(ev)
}
// update the state with the enriched state
state.update(updatedState)
// otherwise, make a zeroed record
val data = updatedState.get
if (data.size > 2) {
val Total_Duration = data
.map(event => event.DURATION)
.sum
EventSumDuration(
key,
Total_Duration
)
} else {
EventSumDuration(
key,
0.0
)
}
}
val callEventsMovingSum = CallEvents
.groupByKey(record => record.MSISDN)
.mapGroupsWithState(GroupStateTimeout.ProcessingTimeTimeout)(mappingFunction)
val query = callEventsMovingSum.writeStream.outputMode("update").format("console").option("checkpointLocation","hdfs://172.17.135.31:9000/Checkpoints/map-gp-with-state").start().awaitTermination()
but I got error and I can not fix that.here is spark error:
22/08/10 12:12:19 ERROR TaskSetManager: Task 0 in stage 0.0 failed 4 times; aborting job
22/08/10 12:12:19 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite#6095feef is aborting.
22/08/10 12:12:19 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite#6095feef aborted.
22/08/10 12:12:19 ERROR MicroBatchExecution: Query [id = 0d252e5f-f0cb-4384-847f-9cc695db1a81, runId = 08324012-0a37-4e25-aae9-76d617932825] terminated with error
org.apache.spark.SparkException: Writing job aborted
at org.apache.spark.sql.errors.QueryExecutionErrors$.writingJobAbortedError(QueryExecutionErrors.scala:613)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:386)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:330)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:279)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.run(WriteToDataSourceV2Exec.scala:290)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2971)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2971)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:603)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:598)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:598)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:228)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:193)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:187)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:303)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:286)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:209)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 10) (dn3 executor 6): java.lang.NoClassDefFoundError: Could not initialize class $line14.$read$
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.util.Utils$.classForName(Utils.scala:216)
at org.apache.spark.sql.catalyst.encoders.OuterScopes$.$anonfun$getOuterScope$2(OuterScopes.scala:69)
at org.apache.spark.sql.catalyst.expressions.objects.NewInstance.$anonfun$doGenCode$1(objects.scala:561)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.catalyst.expressions.objects.NewInstance.doGenCode(objects.scala:561)
at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:151)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:146)
at org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection$.$anonfun$create$1(GenerateSafeProjection.scala:156)
at scala.collection.immutable.List.map(List.scala:293)
at org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection$.create(GenerateSafeProjection.scala:153)
at org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection$.create(GenerateSafeProjection.scala:39)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:1362)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:1359)
at org.apache.spark.sql.execution.ObjectOperator$.deserializeRowToObject(objects.scala:153)
at org.apache.spark.sql.execution.AppendColumnsExec.$anonfun$doExecute$12(objects.scala:341)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:354)
... 40 more
Caused by: java.lang.NoClassDefFoundError: Could not initialize class $line14.$read$
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.util.Utils$.classForName(Utils.scala:216)
at org.apache.spark.sql.catalyst.encoders.OuterScopes$.$anonfun$getOuterScope$2(OuterScopes.scala:69)
at org.apache.spark.sql.catalyst.expressions.objects.NewInstance.$anonfun$doGenCode$1(objects.scala:561)
at scala.Option.map(Option.scala:230)
at org.apache.spark.sql.catalyst.expressions.objects.NewInstance.doGenCode(objects.scala:561)
at org.apache.spark.sql.catalyst.expressions.Expression.$anonfun$genCode$3(Expression.scala:151)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:146)
at org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection$.$anonfun$create$1(GenerateSafeProjection.scala:156)
at scala.collection.immutable.List.map(List.scala:293)
at org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection$.create(GenerateSafeProjection.scala:153)
at org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection$.create(GenerateSafeProjection.scala:39)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:1362)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:1359)
at org.apache.spark.sql.execution.ObjectOperator$.deserializeRowToObject(objects.scala:153)
at org.apache.spark.sql.execution.AppendColumnsExec.$anonfun$doExecute$12(objects.scala:341)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
org.apache.spark.sql.streaming.StreamingQueryException: Query [id = 0d252e5f-f0cb-4384-847f-9cc695db1a81, runId = 08324012-0a37-4e25-aae9-76d617932825] terminated with exception: Writing job aborted
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:325)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:209)
Caused by: org.apache.spark.SparkException: Writing job aborted
at org.apache.spark.sql.errors.QueryExecutionErrors$.writingJobAbortedError(QueryExecutionErrors.scala:613)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:386)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:330)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:279)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.run(WriteToDataSourceV2Exec.scala:290)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2971)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2971)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$17(MicroBatchExecution.scala:603)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:598)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:598)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:228)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:375)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:373)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:69)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:193)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:187)
at org.apache.spark.sql.execution.streaming.StreamExecution.$anonfun$runStream$1(StreamExecution.scala:303)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:286)
... 1 more
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 10) (dn3 executor 6): java.lang.NoClassDefFoundError: Could not initialize class

NullPointerException when using PubsubIO with Spark Runner in Apache Beam Pipeline

I have a very small illustrative Apache Beam pipeline that I'm trying to run with SparkRunner.
Below is the pipeline code
public class SparkMain {
public static void main(String[] args) {
PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
Pipeline pipeline = Pipeline.create(options);
final String projectId = "<my-project-id>";
final String dataset = "test_dataset";
Duration durations = DurationUtils.parseDuration("10s");
pipeline.apply("Read from PubSub",
PubsubIO.readMessagesWithAttributes().fromSubscription("my-subscription"))
.apply("Window",Window.<PubsubMessage>into(new GlobalWindows()).triggering(AfterWatermark.pastEndOfWindow()
.withEarlyFirings(AfterFirst.of(AfterPane.elementCountAtLeast(10),
AfterProcessingTime.pastFirstElementInPane().plusDelayOf(durations))))
.discardingFiredPanes())
.apply("Convert to String", ParDo.of(new DoFn<PubsubMessage, String>() {
#ProcessElement
public void processElement(ProcessContext context){
PubsubMessage msg = context.element();
String msgStr = new String(msg.getPayload());
context.output(msgStr);
}
}))
.apply("Write to File", TextIO
.write()
.withWindowedWrites()
.withNumShards(1)
.to("/Users/my-user/Documents/spark-beam-local/windowed-output"));
pipeline.run();
}
}
I'm using Apache Beam 2.16.0 and Spark 2.4.4 in local mode.
When I try to run this pipeline with DirectRunner or DataflowRunner everything works fine but when I switch the runner to SparkRunner the tasks start failing with following exception.
19/12/17 12:15:45 INFO MicrobatchSource: No cached reader found for split: [org.apache.beam.sdk.io.gcp.pubsub.PubsubUnboundedSource$PubsubSource#46d6c879]. Creating new reader at checkpoint mark null
19/12/17 12:15:46 WARN BlockManager: Putting block rdd_7_9 failed due to exception java.lang.NullPointerException.
19/12/17 12:15:46 WARN BlockManager: Block rdd_7_9 could not be removed as it was not found on disk or in memory
19/12/17 12:15:46 ERROR Executor: Exception in task 9.0 in stage 2.0 (TID 9)
java.lang.NullPointerException
at org.apache.beam.sdk.io.gcp.pubsub.PubsubUnboundedSource$PubsubReader.getWatermark(PubsubUnboundedSource.java:941)
at org.apache.beam.runners.spark.io.MicrobatchSource$Reader.getWatermark(MicrobatchSource.java:291)
at org.apache.beam.runners.spark.stateful.StateSpecFunctions$1.apply(StateSpecFunctions.java:181)
at org.apache.beam.runners.spark.stateful.StateSpecFunctions$1.apply(StateSpecFunctions.java:107)
at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:181)
at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:180)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:57)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:55)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$.updateRecordWithData(MapWithStateRDD.scala:55)
at org.apache.spark.streaming.rdd.MapWithStateRDD.compute(MapWithStateRDD.scala:159)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)
at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I'm using following spark-submit
spark-submit --class SparkMain \
--master local[*] target/beam-1.0-SNAPSHOT.jar \
--runner=SparkRunner \
--project=<my-project> \
--gcpTempLocation=gs://<my-bucket>/temp \
--checkpointDir=/Users/my-user/Documents/beam-tmp/
There is a very similar but un-answered question Apache Beam pipeline with PubSubIO error using Spark Runner PubsubUnboundedSource$PubsubReader.getWatermark(PubsubUnboundedSource.java:1030)
Can anybody point me to direction how to start debugging the problem ?

spark streaming job suddenly exits on FileNotFoundException

I am running a Spark streaming application where each batches writes its final output to S3 in parquet format by using SqlContext.
I am able to get this application to run successfully in EMR.
However, after running for a couple of hours, the spark jobs suddenly halts on a FileNotFoundException.
I am not sure what to do next here.
Any pointers in how to debug/fix this issue would be useful.
I use Spark 2.2.1, EMR 5.1.1 and Java 8 for my application.
My streaming application code
public class StreamingApp {
JavaStreamingContext initDAG() {
JavaSparkContext sc = new JavaSparkContext(sparkConf);
// new context
JavaStreamingContext jssc = new JavaStreamingContext(sc, batchInterval);
SQLContext sqlContext = new SQLContext(sc);
...
// Converting to Dataset's Row type
JavaDStream<Row> rowStream = inputStream.map(new ObjectToRowMapperFunction());
// Writing to Disk
rowStream.foreachRDD(new RddToParquetFunction(sqlContext));
return jssc;
}
...
}
public class RddToParquetFunction implements VoidFunction<JavaRDD<Row>> {
private final StructType userStructType;
private final SQLContext sqlContext;
public RddToParquetFunction(SQLContext sqlContext) {
userStructType = ProtobufSparkStructMapper.schemaFor(UserMessage.class);
this.sqlContext = sqlContext;
}
#Override
public void call(JavaRDD<Row> rowRDD) throws Exception {
Dataset<Row> userDataFrame = sqlContext.createDataFrame(rowRDD, userStructType);
userDataFrame.write().mode(SaveMode.Append).parquet("s3://XXXXXXX/XXXXX/");
}
}
appropriate spark driver logs
18/02/15 22:47:57 ERROR ApplicationMaster: User class threw exception: org.apache.spark.SparkException: Job aborted.
org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:213)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:166)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:166)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:166)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:145)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(comm ands.scala:58)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
at org.apache.spark.sql.execution.datasources.DataSource.writeInFileFormat(DataSource.scala:435)
at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:471)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:50)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:117)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:138)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:135)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:92)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:92)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:609)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:233)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:217)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:508)
at app.functions.RddToParquetFunction.call(RddToParquetFunction.java:37)
at app.functions.RddToParquetFunction.call(RddToParquetFunction.java:17)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: File s3://XXXXXXX/XXXXX/output/_temporary/0/task_20180215224653_0267_m_000032 does not exist.
at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.listStatus(S3NativeFileSystem.java:996)
at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.listStatus(S3NativeFileSystem.java:937)
at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.listStatus(EmrFileSystem.java:337)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:426)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:362)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:334)
at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:47)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply$mcV$sp(FileFormatWriter.scala:207)
... 57 more

Unless you pay the premium for Amazon's consistent EMR you can't reliably use S3 as a destination for your work.
ASF Hadoop+ Spark has fixed this on Hadoop 3.1+ with the S3A committers. Without that, and on amazon EMR, you need to write to HDFS and then use distcp to copy up the results if needed. If chaining together work, leave on HDFS.

KryoSerializer cannot find my SparkKryoRegistrator

I am using Spark 2.0.2 on Amazon emr-5.2.1 in client mode. I use Kryo serialisation and register our classes in our own KryoRegistrator:
val sparkConf = new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.set("spark.kryo.registrator", classOf[de.gaf.ric.workflow.RicKryoRegistrator].getName)
.set("spark.kryo.registrationRequired", "true")
.set("spark.kryoserializer.buffer.max", "512m")
implicit val sc = new SparkContext(sparkConf)
The process starts fine, but after some minutes, I get the following exception on the executors:
17/02/02 16:22:34 ERROR RetryingBlockFetcher: Failed to fetch block rdd_3641_12, and will not retry (0 retries)
java.lang.RuntimeException: org.apache.spark.SparkException: Failed to register classes with Kryo
at org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:129)
at org.apache.spark.serializer.KryoSerializerInstance.borrowKryo(KryoSerializer.scala:274)
at org.apache.spark.serializer.KryoSerializerInstance.<init>(KryoSerializer.scala:259)
at org.apache.spark.serializer.KryoSerializer.newInstance(KryoSerializer.scala:175)
at org.apache.spark.serializer.SerializerManager.dataSerializeWithExplicitClassTag(SerializerManager.scala:141)
at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doGetLocalBytes(BlockManager.scala:499)
at org.apache.spark.storage.BlockManager$$anonfun$getLocalBytes$2.apply(BlockManager.scala:474)
at org.apache.spark.storage.BlockManager$$anonfun$getLocalBytes$2.apply(BlockManager.scala:474)
at scala.Option.map(Option.scala:146)
at org.apache.spark.storage.BlockManager.getLocalBytes(BlockManager.scala:474)
at org.apache.spark.storage.BlockManager.getBlockData(BlockManager.scala:280)
at org.apache.spark.network.netty.NettyBlockRpcServer$$anonfun$2.apply(NettyBlockRpcServer.scala:60)
at org.apache.spark.network.netty.NettyBlockRpcServer$$anonfun$2.apply(NettyBlockRpcServer.scala:60)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.network.netty.NettyBlockRpcServer.receive(NettyBlockRpcServer.scala:60)
at org.apache.spark.network.server.TransportRequestHandler.processRpcRequest(TransportRequestHandler.java:159)
at org.apache.spark.network.server.TransportRequestHandler.handle(TransportRequestHandler.java:107)
at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:119)
at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:51)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:266)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:846)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: de.gaf.ric.workflow.RicKryoRegistrator
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$5.apply(KryoSerializer.scala:124)
at org.apache.spark.serializer.KryoSerializer$$anonfun$newKryo$5.apply(KryoSerializer.scala:124)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at org.apache.spark.serializer.KryoSerializer.newKryo(KryoSerializer.scala:124)
... 43 more
at org.apache.spark.network.client.TransportResponseHandler.handle(TransportResponseHandler.java:189)
at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:121)
at org.apache.spark.network.server.TransportChannelHandler.channelRead0(TransportChannelHandler.java:51)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:266)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at org.apache.spark.network.util.TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:308)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:294)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:846)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
at java.lang.Thread.run(Thread.java:745)
The class RicKryoRegistrator is definitively included in my uber JAR, I double checked that. It is also transferred to the executors:
17/02/02 16:19:02 INFO Executor: Fetching spark://172.31.20.106:41032/jars/app-imageprocessing-0.13.0-20170202.112920-1.jar with timestamp 1486048690879
17/02/02 16:19:02 INFO TransportClientFactory: Successfully created connection to /172.31.20.106:41032 after 23 ms (0 ms spent in bootstraps)
17/02/02 16:19:02 INFO Utils: Fetching spark://172.31.20.106:41032/jars/app-imageprocessing-0.13.0-20170202.112920-1.jar to /mnt/yarn/usercache/hadoop/appcache/application_1486039395474_0012/spark-8172edd9-d1c7-40c9-ad9b-74b2bd9dbad9/fetchFileTemp6474512860106916303.tmp
17/02/02 16:19:03 INFO Utils: Copying /mnt/yarn/usercache/hadoop/appcache/application_1486039395474_0012/spark-8172edd9-d1c7-40c9-ad9b-74b2bd9dbad9/-68603321486048690879_cache to /mnt/yarn/usercache/hadoop/appcache/application_1486039395474_0012/container_1486039395474_0012_01_000011/./app-imageprocessing-0.13.0-20170202.112920-1.jar
17/02/02 16:19:03 INFO Executor: Adding file:/mnt/yarn/usercache/hadoop/appcache/application_1486039395474_0012/container_1486039395474_0012_01_000011/./app-imageprocessing-0.13.0-20170202.112920-1.jar to class loader
Reading the source code of org.apache.spark.serializer.KryoSerializer, I see that it uses the following ClassLoader:
val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader)
Could it be that the defaultClassLoader is not set and that my uber JAR is not included in the Thread.currentThread.getContextClassLoader? What else could be the reason?

How to submit a spark Job Programmatically

While submitting the application with spark-submit it works.
But while trying to submit it Programmatically using the command below
mvn exec:java -Dexec.mainClass="org.cybergen.SubmitJobExample" -Dexec.args="/opt/spark/current/README.md Please"
Am getting the following error while trying to do so
Application Log
15/05/12 17:19:46 INFO AppClient$ClientActor: Connecting to master spark://cyborg:7077...
15/05/12 17:19:46 WARN ReliableDeliverySupervisor: Association with remote system [akka.tcp://sparkMaster#cyborg:7077] has failed, address is now gated for [5000] ms. Reason is: [Disassociated].
15/05/12 17:20:06 INFO AppClient$ClientActor: Connecting to master spark://cyborg:7077...
15/05/12 17:20:06 WARN ReliableDeliverySupervisor: Association with remote system [akka.tcp://sparkMaster#cyborg:7077] has failed, address is now gated for [5000] ms. Reason is: [Disassociated].
Spark Master Log
15/05/12 17:33:22 ERROR EndpointWriter: AssociationError [akka.tcp://sparkMaster#cyborg:7077] <- [akka.tcp://sparkDriver#10.18.26.116:49592]: Error [org.apache.spark.deploy.ApplicationDescription; local class incompatible: stream classdesc serialVersionUID = 7674242335164700840, local class serialVersionUID = 2596819202403185464] [
java.io.InvalidClassException: org.apache.spark.deploy.ApplicationDescription; local class incompatible: stream classdesc serialVersionUID = 7674242335164700840, local class serialVersionUID = 2596819202403185464
at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:617)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1622)
at java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1517)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1771)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:1990)
at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1915)
at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1798)
at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1350)
at java.io.ObjectInputStream.readObject(ObjectInputStream.java:370)
at akka.serialization.JavaSerializer$$anonfun$1.apply(Serializer.scala:136)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at akka.serialization.JavaSerializer.fromBinary(Serializer.scala:136)
at akka.serialization.Serialization$$anonfun$deserialize$1.apply(Serialization.scala:104)
at scala.util.Try$.apply(Try.scala:161)
at akka.serialization.Serialization.deserialize(Serialization.scala:98)
at akka.remote.serialization.MessageContainerSerializer.fromBinary(MessageContainerSerializer.scala:63)
at akka.serialization.Serialization$$anonfun$deserialize$1.apply(Serialization.scala:104)
at scala.util.Try$.apply(Try.scala:161)
at akka.serialization.Serialization.deserialize(Serialization.scala:98)
at akka.remote.MessageSerializer$.deserialize(MessageSerializer.scala:23)
at akka.remote.DefaultMessageDispatcher.payload$lzycompute$1(Endpoint.scala:58)
at akka.remote.DefaultMessageDispatcher.payload$1(Endpoint.scala:58)
at akka.remote.DefaultMessageDispatcher.dispatch(Endpoint.scala:76)
at akka.remote.EndpointReader$$anonfun$receive$2.applyOrElse(Endpoint.scala:937)
at akka.actor.Actor$class.aroundReceive(Actor.scala:465)
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:415)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
at akka.actor.ActorCell.invoke(ActorCell.scala:487)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
at akka.dispatch.Mailbox.run(Mailbox.scala:220)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:393)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
]
15/05/12 17:33:22 INFO Master: akka.tcp://sparkDriver#10.18.26.116:49592 got disassociated, removing it.
15/05/12 17:33:22 WARN ReliableDeliverySupervisor: Association with remote system [akka.tcp://sparkDriver#10.18.26.116:49592] has failed, address is now gated for [5000] ms. Reason is: [org.apache.spark.deploy.ApplicationDescription; local class incompatible: stream classdesc serialVersionUID = 7674242335164700840, local class serialVersionUID = 2596819202403185464].
15/05/12 17:33:22 INFO LocalActorRef: Message [akka.remote.transport.AssociationHandle$Disassociated] from Actor[akka://sparkMaster/deadLetters] to Actor[akka://sparkMaster/system/endpointManager/reliableEndpointWriter-akka.tcp%3A%2F%2FsparkDriver%4010.18.26.116%3A49592-6/endpointWriter/endpointReader-akka.tcp%3A%2F%2FsparkDriver%4010.18.26.116%3A49592-0#1749840468] was not delivered. [10] dead letters encountered. This logging can be turned off or adjusted with configuration settings 'akka.log-dead-letters' and 'akka.log-dead-letters-during-shutdown'.
15/05/12 17:33:22 INFO LocalActorRef: Message [akka.remote.transport.AssociationHandle$Disassociated] from Actor[akka://sparkMaster/deadLetters] to Actor[akka://sparkMaster/system/transports/akkaprotocolmanager.tcp0/akkaProtocol-tcp%3A%2F%2FsparkMaster%40127.0.0.1%3A50366-7#-1224275483] was not delivered. [11] dead letters encountered. This logging can be turned off or adjusted with configuration settings 'akka.log-dead-letters' and 'akka.log-dead-letters-during-shutdown'.
15/05/12 17:33:42 ERROR EndpointWriter: AssociationError [akka.tcp://sparkMaster#cyborg:7077] <- [akka.tcp://sparkDriver#10.18.26.116:49592]: Error [org.apache.spark.deploy.ApplicationDescription; local class incompatible: stream classdesc serialVersionUID = 7674242335164700840, local class serialVersionUID = 2596819202403185464] [
java.io.InvalidClassException: org.apache.spark.deploy.ApplicationDescription; local class incompatible: stream classdesc serialVersionUID = 7674242335164700840, local class serialVersionUID = 2596819202403185464
at java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:617)
at java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:1622)
SparkTestJob : is the spark job class
class SparkTestJob(val filePath:String="",val filter:String ="") extends Serializable{
def runWordCount() :Long = {
val conf = new SparkConf()
.setAppName("word count for the word"+filter)
.setMaster("spark://cyborg:7077")
.setJars(Seq("/tmp/spark-example-1.0-SNAPSHOT-driver.jar"))
.setSparkHome("/opt/spark/current")
val sc = new SparkContext(conf)
val file = sc.textFile(filePath)
file.filter(line => line.contains(filter)).count()
}
}
SubmitJobExample is the object which initiates the SparkTestJob Class
object SubmitJobExample {
def main(args: Array[String]):Unit={
if(args.length==2){
val fileName = args(0)
val filterByWord = args(1)
println("Reading file "+fileName+" for word "+filterByWord)
val jobObject = new SparkTestJob(fileName,filterByWord)
println("word count for the file "+fileName+" is "+jobObject.runWordCount())
}else{
val jobObject = new SparkTestJob("/opt/spark/current/README.md","Please")
println("word count for the file /opt/spark/current/README.md is "+jobObject.runWordCount())
}
}
}

Your code looks fine to me. To debug serialization problems run with -Dsun.io.serialization.extendedDebugInfo=true. This will print extra output upon NotSerializableException and you will see what it's trying to serialize.

The Actual problem was a mismatch of spark versions by one of the dependencies Changing all the dependencies to the same spark version Fixed the problem.
Reason why it worked while performing spark-submit is because of the java JAR-class-path precedence which used the correct spark jar version.

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

SPARK HBASE issue - apache-spark

Related

problem with mapgroupwithstate spark structured streaming

NullPointerException when using PubsubIO with Spark Runner in Apache Beam Pipeline

spark streaming job suddenly exits on FileNotFoundException

KryoSerializer cannot find my SparkKryoRegistrator

How to submit a spark Job Programmatically

Categories

Resources