Spark sortByKey throwing exception - apache-spark

I'm trying to sort a JavaPairRDD by key.
Configuration
Spark version: 1.3.0
mode: local
Can someone look at my code and tell me where I'm going wrong?
JavaPairRDD<String, HashMap<String, Object>> countAndSum = grupBydate
        .reduceByKey(new Function2<HashMap<String, Object>, HashMap<String, Object>, HashMap<String, Object>>() {
            @Override
            public HashMap<String, Object> call(HashMap<String, Object> v1,
                                                HashMap<String, Object> v2) throws Exception {
                long count = Long.parseLong(v1.get(SparkToolConstant.COUNT).toString())
                        + Long.parseLong(v2.get(SparkToolConstant.COUNT).toString());
                Double sum = Double.parseDouble(v1.get(SparkToolConstant.VALUE).toString())
                        + Double.parseDouble(v2.get(SparkToolConstant.VALUE).toString());
                HashMap<String, Object> sumMap = new HashMap<String, Object>();
                sumMap.put(SparkToolConstant.COUNT, count);
                sumMap.put(SparkToolConstant.VALUE, sum);
                return sumMap;
            }
        });
System.out.println("count before sorting : " + countAndSum.count());
/**
 * sort by date
 */
JavaPairRDD<String, HashMap<String, Object>> sortByDate = countAndSum
        .sortByKey(new Comparator<String>() {
            @Override
            public int compare(String dateStr1, String dateStr2) {
                DateUtil dateUtil = new DateUtil();
                Date date1 = dateUtil.stringToDate(dateStr1, dateFormat);
                Date date2 = dateUtil.stringToDate(dateStr2, dateFormat);
                if (date2 == null && date1 == null) {
                    return 0;
                } else if (date2 != null && date1 != null) {
                    return date1.compareTo(date2);
                } else if (date2 == null) {
                    return 1;
                } else {
                    return -1;
                }
            }
        });
I'm getting the error here:
System.out.println("count after sorting : "
+ sortByDate.count());
Stack trace when the task is submitted via spark-submit in local mode:
SchedulerImpl:59 - Cancelling stage 252
2015-04-29 14:37:19 INFO DAGScheduler:59 - Job 62 failed: count at DataValidation.java:378, took 0.107696 s
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task serialization failed: java.lang.reflect.InvocationTargetException
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:606)
org.apache.spark.serializer.SerializationDebugger$ObjectStreamClassMethods$.getObjFieldValues$extension(SerializationDebugger.scala:240)
org.apache.spark.serializer.SerializationDebugger$SerializationDebugger.visitSerializable(SerializationDebugger.scala:150)
org.apache.spark.serializer.SerializationDebugger$SerializationDebugger.visit(SerializationDebugger.scala:99)
org.apache.spark.serializer.SerializationDebugger$SerializationDebugger.visitSerializable(SerializationDebugger.scala:158)
org.apache.spark.serializer.SerializationDebugger$SerializationDebugger.visit(SerializationDebugger.scala:99)
org.apache.spark.serializer.SerializationDebugger$.find(SerializationDebugger.scala:58)
org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:39)
org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:80)
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:835)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply$mcVI$sp(DAGScheduler.scala:1042)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
scala.Option.foreach(Option.scala:236)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1039)
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1038)
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1038)
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1390)
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$submitMissingTasks(DAGScheduler.scala:847)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply$mcVI$sp(DAGScheduler.scala:1042)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15$$anonfun$apply$1.apply(DAGScheduler.scala:1039)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1039)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskCompletion$15.apply(DAGScheduler.scala:1038)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:1038)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1390)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)

Spark serializes the functions you pass to reduceByKey and sortByKey and ships them to the executors, so you must make sure those functions (and everything they reference) are serializable.
SparkToolConstant and DateUtil in your code look like the cause of this error.

Related

Exception occurred during parquet saving - SparkException: Task not serializable - NotSerializableException - object not serializable

The DiagnosticEvent class is an Avro-generated class and it has a serialVersionUID as well.
20/01/05 09:56:09 ERROR nodeStatsConfigDriven.NodeStatsKafkaProcessor: DiagnosticEvent Exception occurred during parquet saving
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2289)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2063)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1354)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
at org.apache.spark.rdd.RDD$$anonfun$isEmpty$1.apply$mcZ$sp(RDD.scala:1462)
at org.apache.spark.rdd.RDD$$anonfun$isEmpty$1.apply(RDD.scala:1462)
at org.apache.spark.rdd.RDD$$anonfun$isEmpty$1.apply(RDD.scala:1462)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1461)
at org.apache.spark.api.java.JavaRDDLike$class.isEmpty(JavaRDDLike.scala:544)
at org.apache.spark.api.java.AbstractJavaRDDLike.isEmpty(JavaRDDLike.scala:45)
at Spark2ParquetEngine.nodeStatsConfigDriven.NodeStatsKafkaProcessor.processSubRecordList(NodeStatsKafkaProcessor.java:463)
at Spark2ParquetEngine.nodeStatsConfigDriven.NodeStatsKafkaProcessor.access$100(NodeStatsKafkaProcessor.java:42)
at Spark2ParquetEngine.nodeStatsConfigDriven.NodeStatsKafkaProcessor$4.call(NodeStatsKafkaProcessor.java:252)
at Spark2ParquetEngine.nodeStatsConfigDriven.NodeStatsKafkaProcessor$4.call(NodeStatsKafkaProcessor.java:273)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:272)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.NotSerializableException: com.servicenow.bigdata.schema.nodeStats.DiagnosticEvent
Serialization stack:
- object not serializable (class: com.servicenow.bigdata.schema.nodeStats.DiagnosticEvent, value: {"schema_version": 2, etc...})
- writeObject data (class: java.util.ArrayList)
- object (class java.util.ArrayList, [{"schema_version": 2, etc...}])
- writeObject data (class: java.util.HashMap)
- object (class java.util.HashMap, {d12e3671478a0602f82a17c94c88155a=[{"schema_version": 2, etc}]
I've searched for this issue and tried all the approaches suggested in the posts I found, but I could not resolve it in my streaming application.
Please help me fix this issue so the streaming application can save the data.
Code:
The exception points to nodeStats.foreachRDD.
for (ChildNodeConfig childNodeConfig : xmlStatsConfig.getChildNodesList().getChildNodes()) {
    if (childNodeConfig.isUseStatefulCacheFilter() && statefulFiltersMap.containsKey(childNodeConfig.getNodeFilterConfig().getName())) {
        //logger.info("Applying stateful filter : " + childNodeConfig.getNodeFilterConfig().getName());
        JavaPairDStream<Void, Node> nodeStats = statefulFiltersMap.get(childNodeConfig.getNodeFilterConfig().getName()).applyStatefulFilter(nodeStats, intialStatefulRDDs.get(childNodeConfig.getNodeFilterConfig().getName()), hiveContext);
        nodeStats.foreachRDD(new VoidFunction<JavaPairRDD<Void, Node>>() {
            @Override
            public void call(JavaPairRDD<Void, Node> rdd) throws Exception {
                try {
                    //Only Saving Diagnostic_Events - End */
                    logger.error("Filter the RDD into currentPartition");
                    // filter the RDD into two partitions
                    JavaPairRDD<Void, Node> currentPartition = rdd.filter(new Function<Tuple2<Void, Node>, Boolean>() {
                        @Override
                        public Boolean call(Tuple2<Void, Node> tuple) throws Exception {
                            if (tuple != null) {
                                Node nodeStats = tuple._2();
                                if (nodeStats != null && nodeStats.getCollectionTimestamp() != null) {
                                    long timestamp = nodeStats.getCollectionTimestamp() * 1000;
                                    if (timestamp >= currHourTimestamp) {
                                        return true;
                                    }
                                }
                            }
                            return false;
                        }
                    });
                    if (processChildNodes || processAll) {
                        //logger.info("Processing Child Nodes");
                        for (ChildNodeConfig childNodeConfig : xmlStatsConfig.getChildNodesList().getChildNodes()) {
                            //logger.info("Testing: calling processSubRecordList for childNode-" + childNodeConfig.getName());
                            processSubRecordList(currentPartition, prevPartition, currentTime, properties,
                                    childNodeConfig.getName(), childNodeConfig.getNamespace(), childNodeConfig.getHdfsdir(), true);
                        }
                    }
                } catch (Exception ex) {
                    logger.error("Exception occurred during parquet saving", ex);
                }
            }
        });
    }
}
The exception occurs at this position, on the !currentPartition.isEmpty() check.
if (!currentPartition.isEmpty() && !currentPartition.partitions().isEmpty()) {
    logger.error("Saving the Child:: " + targetClass + " save current partition to " + tempPath);
    logger.info(targetClass + " save current partition to " + tempPath);
    currentPartition.saveAsNewAPIHadoopFile(tempPath + "/current",
            Void.class, clazz, AvroParquetOutputFormat.class, sparkJob.getConfiguration());
    logger.info(targetClass + " saved current partition to " + tempPath);
    // move the output parquet files into their respective partition
    UtilSpark2.moveTempParquetToDestHDFS(tempPath + "/current", hdfsOutputDir + currentDir, String.valueOf(currentTime), hdfs);
} else {
    logger.info(targetClass + " Empty current partition.");
}
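The serialization stack suggests a HashMap of DiagnosticEvent lists is being pulled into the task closure, most likely because the anonymous Function keeps a reference to its enclosing class and that map is an instance field of it. A minimal sketch of the usual workaround, under that assumption and reusing the question's Node, currHourTimestamp and rdd: copy the values the function needs into local variables and use a lambda (or static nested class) so the enclosing object is never captured.
// Sketch only: the lambda captures just the local 'cutoff', not the enclosing class,
// so any non-serializable Avro objects held by the outer instance stay on the driver.
final long cutoff = currHourTimestamp;
JavaPairRDD<Void, Node> currentPartition = rdd.filter(tuple -> {
    if (tuple == null || tuple._2() == null || tuple._2().getCollectionTimestamp() == null) {
        return false;
    }
    return tuple._2().getCollectionTimestamp() * 1000 >= cutoff;
});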

java.lang.RuntimeException: org.apache.spark.SparkException: Task not serializable at solr.DefaultSource.createRelation

I have seen many such posts about serialization errors, but I am new to this.
There is a DataFrame modProductsData and a map L2L3Map. I want to replace the values in the column PRIMARY_CATEGORY with the values from the map L2L3Map.
val L2L3Map = L2.collect.map(row => (row.get(0).toString, row.get(1).toString)).toMap
val L2L3MapUDF = udf { s: String => L2L3Map.get(s) }
val productsData = spark.read.format("solr").options(readFromSourceClusterOpts).load
var modProductsData = productsData.withColumn("Prime_L2_s",
  when(col("PRIMARY_CATEGORY").isNotNull,
    when(col("PRIMARY_CATEGORY").isin(L3ids: _*), L2L3MapUDF(col("PRIMARY_CATEGORY")))
      .otherwise(when(col("PRIMARY_CATEGORY").isin(L2ids: _*), col("PRIMARY_CATEGORY")).otherwise(lit(null))))
  .otherwise(lit(null)))
Below is more of the error log:
java.lang.RuntimeException: org.apache.spark.SparkException: Task not serializable
at solr.DefaultSource.createRelation(DefaultSource.scala:31)
at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:518)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:215)
... 89 elided
Caused by: org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2101)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:841)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:840)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:840)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:371)
at
It worked with the code below :)
def mapPhantom(flagMap: Map[String, String]): (String) => String = {
  (id: String) => {
    flagMap.getOrElse(id, null)
  }
}
val L2L3Map = L2.collect.map(row => (row.get(0).toString, row.get(1).toString)).toMap
val L2L3UDF = udf(mapPhantom(L2L3Map))
var modProductsData = productsData.withColumn("Prime_L2_s",
  when(col("PRIMARY_CATEGORY").isNotNull,
    when(col("PRIMARY_CATEGORY").isin(L3ids: _*), L2L3UDF(col("PRIMARY_CATEGORY")))
      .otherwise(when(col("PRIMARY_CATEGORY").isin(L2ids: _*), col("PRIMARY_CATEGORY")).otherwise(lit(null))))
  .otherwise(lit(null)))

MapWithState gives java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast while recovering from checkpoint

I am facing an issue with a Spark Streaming job where I am trying to use broadcast, mapWithState and checkpointing together.
Following is the usage:
Since I have to pass some connection object (which is not Serializable) to the executors, I am using org.apache.spark.broadcast.Broadcast.
Since we have to maintain some cached information, I am using stateful streaming with mapWithState.
I am also using checkpointing of my streaming context.
I also need to pass the broadcasted connection object into the mapWithState function to fetch some data from an external source.
The flow works just fine when the context is created fresh. However, when I crash the application and try to recover from the checkpoint, I get a ClassCastException.
I have put a small code snippet, based on an example from asyncified.io, on GitHub to reproduce the issue:
My broadcast logic is in yuvalitzchakov.utils.KafkaWriter.scala
The dummy logic of the application is in yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast.scala
A dummy snippet of the code:
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("spark-stateful-example")
...
val prop = new Properties()
...
val config: Config = ConfigFactory.parseString(prop.toString)
val sc = new SparkContext(sparkConf)
val ssc = StreamingContext.getOrCreate(checkpointDir, () => {
println("creating context newly")
clearCheckpoint(checkpointDir)
val streamingContext = new StreamingContext(sc, Milliseconds(batchDuration))
streamingContext.checkpoint(checkpointDir)
...
val kafkaWriter = SparkContext.getOrCreate().broadcast(kafkaErrorWriter)
...
val stateSpec = StateSpec.function((key: Int, value: Option[UserEvent], state: State[UserSession]) =>
updateUserEvents(key, value, state, kafkaWriter)).timeout(Minutes(jobConfig.getLong("timeoutInMinutes")))
kafkaTextStream
.transform(rdd => {
offsetsQueue.enqueue(rdd.asInstanceOf[HasOffsetRanges].offsetRanges)
rdd
})
.map(deserializeUserEvent)
.filter(_ != UserEvent.empty)
.mapWithState(stateSpec)
.foreachRDD { rdd =>
...
some logic
...
streamingContext
})
}
ssc.start()
ssc.awaitTermination()
def updateUserEvents(key: Int,
value: Option[UserEvent],
state: State[UserSession],
kafkaWriter: Broadcast[KafkaWriter]): Option[UserSession] = {
...
kafkaWriter.value.someMethodCall()
...
}
I get the following error when
kafkaWriter.value.someMethodCall()
is executed:
17/08/01 21:20:38 ERROR Executor: Exception in task 2.0 in stage 3.0 (TID 4)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to yuvalitzchakov.utils.KafkaWriter
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$.updateUserSessions$1(SparkStatefulRunnerWithBroadcast.scala:144)
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$.updateUserEvents(SparkStatefulRunnerWithBroadcast.scala:150)
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$$anonfun$2.apply(SparkStatefulRunnerWithBroadcast.scala:78)
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$$anonfun$2.apply(SparkStatefulRunnerWithBroadcast.scala:77)
at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:181)
at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:180)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:57)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:55)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$.updateRecordWithData(MapWithStateRDD.scala:55)
at org.apache.spark.streaming.rdd.MapWithStateRDD.compute(MapWithStateRDD.scala:159)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:336)
at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:334)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1005)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:996)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:936)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:996)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:700)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Basically, kafkaWriter is the broadcast variable and kafkaWriter.value should return the broadcasted object, but instead it returns a SerializableConfiguration, which cannot be cast to the desired type.
Thanks in advance for the help!
A broadcast variable cannot be used with mapWithState (or transformation operations in general) if we need to recover from the checkpoint directory in Spark Streaming. In that case it can only be used inside output operations, because those allow the broadcast to be lazily re-initialized from the Spark context after recovery:
class JavaWordBlacklist {

    private static volatile Broadcast<List<String>> instance = null;

    public static Broadcast<List<String>> getInstance(JavaSparkContext jsc) {
        if (instance == null) {
            synchronized (JavaWordBlacklist.class) {
                if (instance == null) {
                    List<String> wordBlacklist = Arrays.asList("a", "b", "c");
                    instance = jsc.broadcast(wordBlacklist);
                }
            }
        }
        return instance;
    }
}

class JavaDroppedWordsCounter {

    private static volatile LongAccumulator instance = null;

    public static LongAccumulator getInstance(JavaSparkContext jsc) {
        if (instance == null) {
            synchronized (JavaDroppedWordsCounter.class) {
                if (instance == null) {
                    instance = jsc.sc().longAccumulator("WordsInBlacklistCounter");
                }
            }
        }
        return instance;
    }
}

wordCounts.foreachRDD((rdd, time) -> {
    // Get or register the blacklist Broadcast
    Broadcast<List<String>> blacklist = JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
    // Get or register the droppedWordsCounter Accumulator
    LongAccumulator droppedWordsCounter = JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
    // Use blacklist to drop words and use droppedWordsCounter to count them
    String counts = rdd.filter(wordCount -> {
        if (blacklist.value().contains(wordCount._1())) {
            droppedWordsCounter.add(wordCount._2());
            return false;
        } else {
            return true;
        }
    }).collect().toString();
    String output = "Counts at time " + time + " " + counts;
});
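Applied to the kafkaWriter from the question, the same lazy-singleton pattern would look roughly like this (a sketch only; KafkaWriterHolder is a hypothetical helper class, and KafkaWriter is the question's class):
class KafkaWriterHolder {

    private static volatile Broadcast<KafkaWriter> instance = null;

    // Lazily (re-)create the broadcast from the current Spark context,
    // so it can be rebuilt after recovery from a checkpoint.
    public static Broadcast<KafkaWriter> getInstance(JavaSparkContext jsc, KafkaWriter writer) {
        if (instance == null) {
            synchronized (KafkaWriterHolder.class) {
                if (instance == null) {
                    instance = jsc.broadcast(writer);
                }
            }
        }
        return instance;
    }
}
The broadcast would then be fetched inside an output operation such as foreachRDD, for example via KafkaWriterHolder.getInstance(new JavaSparkContext(rdd.context()), kafkaErrorWriter), rather than being captured by the StateSpec function, as described above.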

Kafka Message Key - byte[] and String Simultaneously

I'm having a very confusing issue with Kafka - specifically trying to obtain the Key of a message.
The key seems to be both a String and a byte[] at the same time.
The following code produces the exception below:
Map<String, Integer> topicCount = new HashMap<>();
topicCount.put(myConsumer.getTopic(), 1);
Map<String, List<KafkaStream<byte[], byte[]>>> consumerStreams = myConsumer.getConsumer().createMessageStreams(topicCount);
List<KafkaStream<byte[], byte[]>> streams = consumerStreams.get(myConsumer.getTopic());
System.out.println("Listening to topic " + myConsumer.getTopic());
for (final KafkaStream stream : streams) {
    ConsumerIterator<String, byte[]> it = stream.iterator();
    while (it.hasNext()) {
        System.out.println("Message received from topic");
        MessageAndMetadata<String, byte[]> o = it.next();
        Object messageKey = o.key();
        System.out.println("messageKey is type: " + messageKey.getClass().getName());
        System.out.println("messageKey is type: " + messageKey.getClass().getCanonicalName());
        System.out.println("o keyDecoder: " + o.keyDecoder());
        System.out.println("Key from message: " + o.key()); //This throws exception - [B cannot be cast to java.lang.String
        //System.out.println("Key as String: " + new String(o.key(), StandardCharsets.UTF_8)); //uncomment this compile Exception - no suitable constructor found for String(java.lang.String,java.nio.charset.Charset)
        byte[] bytesIn = o.message(); //getting the bytes is fine
        System.out.println("MessageAndMetadata: " + o);
        ///other code cut
    }
}
Exception:
Listening to topic MyKafkaTopic
Message received from topic
messageKey is type: [B
messageKey is type: byte[]
o decoder: kafka.serializer.DefaultDecoder@2e0d0acd
[WARNING]
java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.codehaus.mojo.exec.ExecJavaMojo$1.run(ExecJavaMojo.java:293)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassCastException: [B cannot be cast to java.lang.String
at com.foo.bar.KafkaCFS.process(KafkaCFS.java:153)
at com.foo.bar.KafkaCFS.run(KafkaCFS.java:63)
at com.foo.bar.App.main(App.java:90)
... 6 more
Maven:
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.9.0.1</version>
</dependency>
If I uncomment the System.out line then I cannot even compile:
[ERROR] COMPILATION ERROR :
[INFO] -------------------------------------------------------------
[ERROR] /C:/Dev/main/java/com/foo/bar/KafkaCFS.java:[152,56] no suitable constructor found for String(java.lang.String,java.nio.charset.Charset)
constructor java.lang.String.String(byte[],int) is not applicable
(argument mismatch; java.lang.String cannot be converted to byte[])
How is it that the compiler thinks the Key is a String (which is what I expected it to be), but at runtime it's a byte array?
What can I do to get the Key as a String?
Thanks,
KA.
These do not match! You declare streams as KafkaStream<byte[], byte[]> but then expect ConsumerIterator<String, byte[]> it = stream.iterator(); it should be ConsumerIterator<byte[], byte[]> it = stream.iterator(); to match the generics. Then you can take o.key() and build a String from it via new String(o.key()).
You are better off keeping the KafkaStream generic parameter types as (byte[], byte[]). Try changing the code like this:
ConsumerIterator<byte[], byte[]> it = stream.iterator();
while (it.hasNext()) {
String key = new String(it.next().key());
...
}
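If you want an explicit charset, as in the commented-out line from the question, that constructor becomes applicable once the key is typed as byte[]. A small sketch (the null check is an extra precaution, since Kafka keys can be null):
import java.nio.charset.StandardCharsets;

// Decode a key/value byte[] with an explicit charset instead of the platform default.
static String decodeUtf8(byte[] bytes) {
    return bytes == null ? null : new String(bytes, StandardCharsets.UTF_8);
}
Inside the loop this would be used as String key = decodeUtf8(it.next().key());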

Union a List of Flume Receivers in Spark Streaming

In order to increase parallelism, as recommended in the Spark Streaming Programming Guide, I'm setting up multiple receivers and trying to union a list of them. This code works as expected:
private JavaDStream<SparkFlumeEvent> getEventsWorking(JavaStreamingContext jssc, List<String> hosts, List<String> ports) {
    List<JavaReceiverInputDStream<SparkFlumeEvent>> receivers = new ArrayList<>();
    for (String host : hosts) {
        for (String port : ports) {
            receivers.add(FlumeUtils.createStream(jssc, host, Integer.parseInt(port)));
        }
    }
    JavaDStream<SparkFlumeEvent> unionStreams = receivers.get(0)
            .union(receivers.get(1))
            .union(receivers.get(2))
            .union(receivers.get(3))
            .union(receivers.get(4))
            .union(receivers.get(5));
    return unionStreams;
}
But I don't actually know how many receivers my cluster will have until runtime. When I try to do this in a loop I get an NPE.
private JavaDStream<SparkFlumeEvent> getEventsNotWorking(JavaStreamingContext jssc, List<String> hosts, List<String> ports) {
    List<JavaReceiverInputDStream<SparkFlumeEvent>> receivers = new ArrayList<>();
    for (String host : hosts) {
        for (String port : ports) {
            receivers.add(FlumeUtils.createStream(jssc, host, Integer.parseInt(port)));
        }
    }
    JavaDStream<SparkFlumeEvent> unionStreams = null;
    for (JavaReceiverInputDStream<SparkFlumeEvent> receiver : receivers) {
        if (unionStreams == null) {
            unionStreams = receiver;
        } else {
            unionStreams.union(receiver);
        }
    }
    return unionStreams;
}
ERROR:
16/09/15 17:05:25 ERROR JobScheduler: Error in job generator
java.lang.NullPointerException
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at scala.collection.TraversableOnce$$anonfun$maxBy$1.apply(TraversableOnce.scala:225)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
at scala.collection.IndexedSeqOptimized$class.reduceLeft(IndexedSeqOptimized.scala:68)
at scala.collection.mutable.ArrayBuffer.reduceLeft(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.maxBy(TraversableOnce.scala:225)
at scala.collection.AbstractTraversable.maxBy(Traversable.scala:105)
at org.apache.spark.streaming.DStreamGraph.getMaxInputStreamRememberDuration(DStreamGraph.scala:172)
at org.apache.spark.streaming.scheduler.JobGenerator.clearMetadata(JobGenerator.scala:270)
at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:87)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:86)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
16/09/15 17:05:25 INFO MemoryStore: ensureFreeSpace(15128) called with curMem=520144, maxMem=555755765
16/09/15 17:05:25 INFO MemoryStore: Block broadcast_24 stored as values in memory (estimated size 14.8 KB, free 529.5 MB)
Exception in thread "main" java.lang.NullPointerException
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at scala.collection.TraversableOnce$$anonfun$maxBy$1.apply(TraversableOnce.scala:225)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
at scala.collection.IndexedSeqOptimized$class.reduceLeft(IndexedSeqOptimized.scala:68)
at scala.collection.mutable.ArrayBuffer.reduceLeft(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.maxBy(TraversableOnce.scala:225)
at scala.collection.AbstractTraversable.maxBy(Traversable.scala:105)
at org.apache.spark.streaming.DStreamGraph.getMaxInputStreamRememberDuration(DStreamGraph.scala:172)
at org.apache.spark.streaming.scheduler.JobGenerator.clearMetadata(JobGenerator.scala:270)
at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:87)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:86)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
What's the correct way to do this?
Can you please try out the code below? It should solve your problem:
private JavaDStream<SparkFlumeEvent> getEventsNotWorking(JavaStreamingContext jssc, List<String> hosts, List<String> ports) {
    List<JavaDStream<SparkFlumeEvent>> receivers = new ArrayList<JavaDStream<SparkFlumeEvent>>();
    for (String host : hosts) {
        for (String port : ports) {
            receivers.add(FlumeUtils.createStream(jssc, host, Integer.parseInt(port)));
        }
    }
    return jssc.union(receivers.get(0), receivers.subList(1, receivers.size()));
}
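Note that the loop in the question should also work once the result of union() is kept, since DStream.union returns a new stream rather than modifying the one it is called on; a sketch of that one-line change (not from the original answer):
JavaDStream<SparkFlumeEvent> unionStreams = receivers.get(0);
for (int i = 1; i < receivers.size(); i++) {
    // union() returns a new DStream; reassign it instead of discarding the result
    unionStreams = unionStreams.union(receivers.get(i));
}
return unionStreams;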
