While running the following spark mllib on local mode with scala 2.12.3 , encountered the following error lambda not serialazable
Any inputs would be much appreciated ?
(Moving onto scala 2.11 is not an option for me) Can you please let me know what can i do to avoid this issue? Thankyou
import java.io.FileWriter
import org.apache.spark.SparkConf
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.TimestampType
import java.util.concurrent.atomic.AtomicBoolean
object MLAnalyzer {
val conf = new SparkConf().setMaster("local[2]").set("deploy-mode", "client").set("spark.driver.bindAddress", "127.0.0.1")
.set("spark.broadcast.compress", "false")
.setAppName("local-spark-kafka-consumer-client")
val spark = SparkSession
.builder()
.config(conf)
.getOrCreate()
def main(args: Array[String]): Unit = {
process
}
def process():Unit= {
// training data
val filePath = "/home/vagrant/Desktop/Workspaces/SparkMachineLearning/sparkML/src/main/resources/train_pooling.csv"
val modelPath = "file:///home/vagrant/Downloads/medium-articles-master/titanic_spark/training_batch/src/main/resources/poolSessionModelRecent.model"
val schema = StructType(
Array(
StructField("PACKAGE_KEY", StringType),
StructField("MOST_IDLE", IntegerType),
StructField("MAX_WAIT", IntegerType),
StructField("IDLE_COUNT", IntegerType),
StructField("APPLICATION", StringType),
StructField("LONGEST_WAIT", IntegerType),
StructField("TIMEOUTS", IntegerType),
StructField("LAST_ACCESS", TimestampType),
StructField("MOST_ACTIVE", IntegerType),
StructField("MAX_ACTIVE", IntegerType),
StructField("MAX_IDLE", IntegerType),
StructField("ACTIVE_COUNT", IntegerType),
StructField("FACTOR_LOAD", DoubleType)))
while (true) {
Thread.sleep(100)
// read the raw data
var df_raw = spark
.read
.option("header", "true")
// .option("inferSchema","true")
.schema(schema)
.csv(filePath)
df_raw = df_raw.drop(df_raw.col("PACKAGE_KEY"))
df_raw = df_raw.drop(df_raw.col("MOST_IDLE"))
df_raw = df_raw.drop(df_raw.col("MAX_IDLE"))
df_raw = df_raw.drop(df_raw.col("MOST_ACTIVE"))
df_raw = df_raw.drop(df_raw.col("LAST_ACCESS"))
df_raw = df_raw.drop(df_raw.col("APPLICATION"))
df_raw = df_raw.drop(df_raw.col("MAX_WAIT"))
// fill all na values with 0
val df = df_raw.na.fill(0)
val packageKeyIndexer = new StringIndexer()
.setInputCol("PACKAGE_KEY")
.setOutputCol("PackageIndex")
.setHandleInvalid("keep")
// create the feature vector
val vectorAssembler = new VectorAssembler()
.setInputCols(Array("IDLE_COUNT", "TIMEOUTS", "ACTIVE_COUNT" /*, "TOTAL_REQUEST_COUNT"*/ ))
.setOutputCol("features_intermediate")
import org.apache.spark.ml.feature.StandardScaler
val scaler = new StandardScaler().setWithMean(true).setWithStd(true).setInputCol("features_intermediate").setOutputCol("features")
var pipeline: Pipeline = null
// if (lr1 == null) {
val lr =
new LinearRegression()
.setMaxIter(100)
.setRegParam(0.1)
.setElasticNetParam(0.8)
//.setFeaturesCol("features") // setting features column
.setLabelCol("FACTOR_LOAD") // setting label column
// create the pipeline with the steps
pipeline = new Pipeline().setStages(Array( /*genderIndexer, cabinIndexer, embarkedIndexer,*/ vectorAssembler, scaler, lr))
// create the model following the pipeline steps
val cvModel = pipeline.fit(df)
// save the model
cvModel.write.overwrite.save(modelPath)
var testschema = StructType(
Array(
// StructField("PACKAGE_KEY", StringType),
StructField("IDLE_COUNT", IntegerType),
StructField("TIMEOUTS", IntegerType),
StructField("ACTIVE_COUNT", IntegerType)))
val df_raw1 = spark
.read
// .option("header", "true")
.schema(testschema)
.csv("/home/vagrant/Desktop/Workspaces/SparkMachineLearning/sparkML/src/main/resources/test_pooling.csv")
// fill all na values with 0
val df1 = df_raw1.na.fill(0)
val evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol("prediction")
var rmse = evaluator.evaluate(cvModel.transform(df1))
import org.apache.spark.sql.functions._
import spark.implicits._
val extracted = cvModel.transform(df1)
val prediction = extracted.select("prediction").map(r => r(0).asInstanceOf[Double]).collect()
if (prediction != null && prediction.length > 0) {
val avg = prediction.sum / prediction.length
val pw: FileWriter = new FileWriter("/home/vagrant/Desktop/Workspaces/SparkMachineLearning/sparkML/src/main/resources/result.csv");
pw.append(avg.toString)
pw.flush()
pw.close()
println("completed modelling process")
} else {
//do nothing
}
}
}
}
gives me following error
Caused by: java.io.NotSerializableException: scala.runtime.LazyRef
Serialization stack:
- object not serializable (class: scala.runtime.LazyRef, value: LazyRef thunk)
- element of array (index: 2)
- array (class [Ljava.lang.Object;, size 3)
- field (class: java.lang.invoke.SerializedLambda, name: capturedArgs, type: class [Ljava.lang.Object;)
- object (class java.lang.invoke.SerializedLambda, SerializedLambda[capturingClass=class org.apache.spark.sql.catalyst.expressions.ScalaUDF, functionalInterfaceMethod=scala/Function1.apply:(Ljava/lang/Object;)Ljava/lang/Object;, implementation=invokeStatic org/apache/spark/sql/catalyst/expressions/ScalaUDF.$anonfun$f$2:(Lscala/Function1;Lorg/apache/spark/sql/catalyst/expressions/Expression;Lscala/runtime/LazyRef;Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;, instantiatedMethodType=(Lorg/apache/spark/sql/catalyst/InternalRow;)Ljava/lang/Object;, numCaptured=3])
- writeReplace data (class: java.lang.invoke.SerializedLambda)
- object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF$$Lambda$2280/878458383, org.apache.spark.sql.catalyst.expressions.ScalaUDF$$Lambda$2280/878458383#65af23c0)
- field (class: org.apache.spark.sql.catalyst.expressions.ScalaUDF, name: f, type: interface scala.Function1)
- object (class org.apache.spark.sql.catalyst.expressions.ScalaUDF, UDF(named_struct(IDLE_COUNT_double_vecAssembler_bc4ee3d99e56, cast(coalesce(IDLE_COUNT#1732, 0) as double), TIMEOUTS_double_vecAssembler_bc4ee3d99e56, cast(coalesce(TIMEOUTS#1735, 0) as double), ACTIVE_COUNT_double_vecAssembler_bc4ee3d99e56, cast(coalesce(ACTIVE_COUNT#1740, 0) as double))))
- field (class: org.apache.spark.sql.catalyst.expressions.Alias, name: child, type: class org.apache.spark.sql.catalyst.expressions.Expression)
- object (class org.apache.spark.sql.catalyst.expressions.Alias, UDF(named_struct(IDLE_COUNT_double_vecAssembler_bc4ee3d99e56, cast(coalesce(IDLE_COUNT#1732, 0) as double), TIMEOUTS_double_vecAssembler_bc4ee3d99e56, cast(coalesce(TIMEOUTS#1735, 0) as double), ACTIVE_COUNT_double_vecAssembler_bc4ee3d99e56, cast(coalesce(ACTIVE_COUNT#1740, 0) as double))) AS features_intermediate#1839)
- element of array (index: 0)
Upgrading to Scala 2.12.8 solved the issue. Not sure about the rootcause though.
Related
I am trying to stream twitter data using Apache Spark in Intellij however when i use the function coalesce , it says that it cannot resolve symbol coalesce. Here is my main code:
val spark = SparkSession.builder().appName("twitterStream").master("local[*]").getOrCreate()
import spark.implicits._
val sc: SparkContext = spark.sparkContext
val streamContext = new StreamingContext(sc, Seconds(5))
val filters = Array("Singapore")
val filtered = TwitterUtils.createStream(streamContext, None, filters)
val englishTweets = filtered.filter(_.getLang() == "en")
//englishTweets.print()
englishTweets.foreachRDD{rdd =>
val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
import spark.implicits._
val tweets = rdd.map( field =>
(
field.getId,
field.getUser.getScreenName,
field.getCreatedAt.toInstant.toString,
field.getText.toLowerCase.split(" ").filter(_.matches("^[a-zA-Z0-9 ]+$")).fold("")((a, b) => a + " " + b).trim,
sentiment(field.getText)
)
)
val tweetsdf = tweets.toDF("userID", "user", "createdAt", "text", "sentimentType")
tweetsdf.printSchema()
tweetsdf.show(false)
}.coalesce(1).write.csv("hdfs://localhost:9000/usr/sparkApp/test/testing.csv")
I have tried with my own dataset, and I have read a dataset and while writing I have applied coalesce function and it is giving results, please refer to this it may help you.
import org.apache.spark.sql.SparkSession
import com.spark.Rdd.DriverProgram
import org.apache.log4j.{ Logger, Level }
import org.apache.spark.sql.SaveMode
import java.sql.Date
object JsonDataDF {
System.setProperty("hadoop.home.dir", "C:\\hadoop");
System.setProperty("hadoop.home.dir", "C:\\hadoop"); // This is the system property which is useful to find the winutils.exe
Logger.getLogger("org").setLevel(Level.WARN) // This will remove Logs
case class AOK(appDate:Date, arr:String, base:String, Comments:String)
val dp = new DriverProgram
val spark = dp.getSparkSession()
def main(args : Array[String]): Unit = {
import spark.implicits._
val jsonDf = spark.read.option("multiline", "true").json("C:\\Users\\34979\\Desktop\\Work\\Datasets\\JSONdata.txt").as[AOK]
jsonDf.coalesce(1) // Refer Here
.write
.mode(SaveMode.Overwrite)
.option("header", "true")
.format("csv")
.save("C:\\Users\\34979\\Desktop\\Work\\Datasets\\JsonToCsv")
}
}
I'm using the approach given here to flatten a DataFrame in Spark SQL. Here is my code:
package com.acme.etl.xml
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, SparkSession}
object RuntimeError { def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("FlattenSchema").getOrCreate()
val rowTag = "idocData"
val dataFrameReader =
spark.read
.option("rowTag", rowTag)
val xmlUri = "bad_011_1.xml"
val df =
dataFrameReader
.format("xml")
.load(xmlUri)
val schema: StructType = df.schema
val columns: Array[Column] = flattenSchema(schema)
val df2 = df.select(columns: _*)
}
def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = {
schema.fields.flatMap(f => {
val colName: String = if (prefix == null) f.name else prefix + "." + f.name
val dataType = f.dataType
dataType match {
case st: StructType => flattenSchema(st, colName)
case _: StringType => Array(new org.apache.spark.sql.Column(colName))
case _: LongType => Array(new org.apache.spark.sql.Column(colName))
case _: DoubleType => Array(new org.apache.spark.sql.Column(colName))
case arrayType: ArrayType => arrayType.elementType match {
case structType: StructType => flattenSchema(structType, colName)
}
case _ => Array(new org.apache.spark.sql.Column(colName))
}
})
}
}
Much of the time, this works fine. But for the XML given below:
<Receive xmlns="http://Microsoft.LobServices.Sap/2007/03/Idoc/3/ORDERS05/ZORDERS5/702/Receive">
<idocData>
<E2EDP01008GRP xmlns="http://Microsoft.LobServices.Sap/2007/03/Types/Idoc/3/ORDERS05/ZORDERS5/702">
<E2EDPT1001GRP>
<E2EDPT2001>
<DATAHEADERCOLUMN_DOCNUM>0000000141036013</DATAHEADERCOLUMN_DOCNUM>
</E2EDPT2001>
<E2EDPT2001>
<DATAHEADERCOLUMN_DOCNUM>0000000141036013</DATAHEADERCOLUMN_DOCNUM>
</E2EDPT2001>
</E2EDPT1001GRP>
</E2EDP01008GRP>
<E2EDP01008GRP xmlns="http://Microsoft.LobServices.Sap/2007/03/Types/Idoc/3/ORDERS05/ZORDERS5/702">
</E2EDP01008GRP>
</idocData>
</Receive>
this exception occurs:
Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve '`E2EDP01008GRP`.`E2EDPT1001GRP`.`E2EDPT2001`['DATAHEADERCOLUMN_DOCNUM']' due to data type mismatch: argument 2 requires integral type, however, ''DATAHEADERCOLUMN_DOCNUM'' is of string type.;;
'Project [E2EDP01008GRP#0.E2EDPT1001GRP.E2EDPT2001[DATAHEADERCOLUMN_DOCNUM] AS DATAHEADERCOLUMN_DOCNUM#3, E2EDP01008GRP#0._VALUE AS _VALUE#4, E2EDP01008GRP#0._xmlns AS _xmlns#5]
+- Relation[E2EDP01008GRP#0] XmlRelation(<function0>,Some(/Users/paulreiners/s3/cdi-events-partition-staging/content_acme_purchase_order_json_v1/bad_011_1.xml),Map(rowtag -> idocData, path -> /Users/paulreiners/s3/cdi-events-partition-staging/content_acme_purchase_order_json_v1/bad_011_1.xml),null)
What is causing this?
Your document contains a multi-valued array so you can't flatten it completely in one pass since you can't give both elements of the array the same column name.
Also, it's usually a bad idea to use a dot within a column name since it can easily confuse the Spark parser and will need to be escaped at all time.
The usual way to flatten such a dataset is to create new rows for each element of the array.
You can use the explode function to do this but you will need to recursively call your flatten operation because explode can't be nested.
The following code works as expected, using '_' instead of '.' as column name separator:
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.{Dataset, Row}
object RuntimeError {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("FlattenSchema").getOrCreate()
val rowTag = "idocData"
val dataFrameReader = spark.read.option("rowTag", rowTag)
val xmlUri = "bad_011_1.xml"
val df = dataFrameReader.format("xml").load(xmlUri)
val df2 = flatten(df)
}
def flatten(df: Dataset[Row], prefixSeparator: String = "_") : Dataset[Row] = {
import org.apache.spark.sql.functions.{col,explode}
def mustFlatten(sc: StructType): Boolean =
sc.fields.exists(f => f.dataType.isInstanceOf[ArrayType] || f.dataType.isInstanceOf[StructType])
def flattenAndExplodeOne(sc: StructType, parent: Column = null, prefix: String = null, cols: Array[(DataType,Column)] = Array[(DataType,Column)]()): Array[(DataType,Column)] = {
val res = sc.fields.foldLeft(cols)( (columns, f) => {
val my_col = if (parent == null) col(f.name) else parent.getItem(f.name)
val flat_name = if (prefix == null) f.name else s"${prefix}${prefixSeparator}${f.name}"
f.dataType match {
case st: StructType => flattenAndExplodeOne(st, my_col, flat_name, columns)
case dt: ArrayType => {
if (columns.exists(_._1.isInstanceOf[ArrayType])) {
columns :+ ((dt, my_col.as(flat_name)))
} else {
columns :+ ((dt, explode(my_col).as(flat_name)))
}
}
case dt => columns :+ ((dt, my_col.as(flat_name)))
}
})
res
}
var flatDf = df
while (mustFlatten(flatDf.schema)) {
val newColumns = flattenAndExplodeOne(flatDf.schema, null, null).map(_._2)
flatDf = flatDf.select(newColumns:_*)
}
flatDf
}
}
The resulting df2 has the following schema and data:
df2.printSchema
root
|-- E2EDP01008GRP_E2EDPT1001GRP_E2EDPT2001_DATAHEADERCOLUMN_DOCNUM: long (nullable = true)
|-- E2EDP01008GRP__xmlns: string (nullable = true)
df2.show(true)
+--------------------------------------------------------------+--------------------+
|E2EDP01008GRP_E2EDPT1001GRP_E2EDPT2001_DATAHEADERCOLUMN_DOCNUM|E2EDP01008GRP__xmlns|
+--------------------------------------------------------------+--------------------+
| 141036013|http://Microsoft....|
| 141036013|http://Microsoft....|
+--------------------------------------------------------------+--------------------+
I am trying to pass a struct in spark to udf. It is changing the field names and renaming to the column position. How do I fix it?
object TestCSV {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("localTest").setMaster("local")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val inputData = sqlContext.read.format("com.databricks.spark.csv")
.option("delimiter","|")
.option("header", "true")
.load("test.csv")
inputData.printSchema()
inputData.show()
val groupedData = inputData.withColumn("name",struct(inputData("firstname"),inputData("lastname")))
val udfApply = groupedData.withColumn("newName",processName(groupedData("name")))
udfApply.show()
}
def processName = udf((input:Row) =>{
println(input)
println(input.schema)
Map("firstName" -> input.getAs[String]("firstname"), "lastName" -> input.getAs[String]("lastname"))
})
}
Output:
root
|-- id: string (nullable = true)
|-- firstname: string (nullable = true)
|-- lastname: string (nullable = true)
+---+---------+--------+
| id|firstname|lastname|
+---+---------+--------+
| 1| jack| reacher|
| 2| john| Doe|
+---+---------+--------+
Error:
[jack,reacher]
StructType(StructField(i[1],StringType,true), > StructField(i[2],StringType,true))
17/03/08 09:45:35 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)
java.lang.IllegalArgumentException: Field "firstname" does not exist.
What you are encountering is really strange. After playing around a bit I finally figured out that it may be related to a problem with the optimizer engine. It seems that the problem is not the UDF but the struct function.
I get it to work (Spark 1.6.3) when I cache the groupedData, without caching I get your reported exception:
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
object Demo {
def main(args: Array[String]): Unit = {
val sc = new SparkContext(new SparkConf().setAppName("Demo").setMaster("local[1]"))
val sqlContext = new HiveContext(sc)
import sqlContext.implicits._
import org.apache.spark.sql.functions._
def processName = udf((input: Row) => {
Map("firstName" -> input.getAs[String]("firstname"), "lastName" -> input.getAs[String]("lastname"))
})
val inputData =
sc.parallelize(
Seq(("1", "Kevin", "Costner"))
).toDF("id", "firstname", "lastname")
val groupedData = inputData.withColumn("name", struct(inputData("firstname"), inputData("lastname")))
.cache() // does not work without cache
val udfApply = groupedData.withColumn("newName", processName(groupedData("name")))
udfApply.show()
}
}
Alternatively you can use the RDD API to make your struct, but this is not really nice:
case class Name(firstname:String,lastname:String) // define outside main
val groupedData = inputData.rdd
.map{r =>
(r.getAs[String]("id"),
Name(
r.getAs[String]("firstname"),
r.getAs[String]("lastname")
)
)
}
.toDF("id","name")
I am reading data from RDD of Element of type com.google.gson.JsonObject. Trying to convert that into DataSet but no clue how to do this.
import com.google.gson.{JsonParser}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{SparkSession}
object tmp {
class people(name: String, age: Long, phone: String)
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").getOrCreate()
val sc = spark.sparkContext
val parser = new JsonParser();
val jsonObject1 = parser.parse("""{"name":"abc","age":23,"phone":"0208"}""").getAsJsonObject()
val jsonObject2 = parser.parse("""{"name":"xyz","age":33}""").getAsJsonObject()
val PairRDD = sc.parallelize(List(
(new LongWritable(1l), jsonObject1),
(new LongWritable(2l), jsonObject2)
))
val rdd1 =PairRDD.map(element => element._2)
import spark.implicits._
//How to create Dataset as schema People from rdd1?
}
}
Even trying to print rdd1 elements throws
object not serializable (class: org.apache.hadoop.io.LongWritable, value: 1)
- field (class: scala.Tuple2, name: _1, type: class java.lang.Object)
- object (class scala.Tuple2, (1,{"name":"abc","age":23,"phone":"0208"}))
Basically I get this RDD[LongWritable,JsonParser] from BigQuery table which I want to convert to Dataset so I can apply SQL for transformation.
I've left phone in the second record null intentionally, BigQuery return nothing for that element with null value.
Thanks for the clarification. You need to register the class as Serializable in kryo. The following show work. I am running in spark-shell so had to destroy the old context and create a new spark context with a config that included the registered Kryo Classes
import com.google.gson.{JsonParser}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.SparkContext
sc.stop()
val conf = sc.getConf
conf.registerKryoClasses( Array(classOf[LongWritable], classOf[JsonParser] ))
conf.get("spark.kryo.classesToRegister")
val sc = new SparkContext(conf)
val parser = new JsonParser();
val jsonObject1 = parser.parse("""{"name":"abc","age":23,"phone":"0208"}""").getAsJsonObject()
val jsonObject2 = parser.parse("""{"name":"xyz","age":33}""").getAsJsonObject()
val pairRDD = sc.parallelize(List(
(new LongWritable(1l), jsonObject1),
(new LongWritable(2l), jsonObject2)
))
val rdd = pairRDD.map(element => element._2)
rdd.collect()
// res9: Array[com.google.gson.JsonObject] = Array({"name":"abc","age":23,"phone":"0208"}, {"name":"xyz","age":33})
val jsonstrs = rdd.map(e=>e.toString).collect()
val df = spark.read.json( sc.parallelize(jsonstrs) )
df.printSchema
// root
// |-- age: long (nullable = true)
// |-- name: string (nullable = true)
// |-- phone: string (nullable = true)
code below:
def main(args: Array[String]) {
val sc = new SparkContext
val sec = Seconds(3)
val ssc = new StreamingContext(sc, sec)
ssc.checkpoint("./checkpoint")
val rdd = ssc.sparkContext.parallelize(Seq("a","b","c"))
val inputDStream = new ConstantInputDStream(ssc, rdd)
inputDStream.transform(rdd => {
val buf = ListBuffer[String]()
buf += "1"
buf += "2"
buf += "3"
val other_rdd = ssc.sparkContext.parallelize(buf) // create a new rdd
rdd.union(other_rdd)
}).print()
ssc.start()
ssc.awaitTermination()
}
and throw exception:
java.io.NotSerializableException: DStream checkpointing has been enabled but the DStreams with their functions are not serializable
org.apache.spark.streaming.StreamingContext
Serialization stack:
- object not serializable (class: org.apache.spark.streaming.StreamingContext, value: org.apache.spark.streaming.StreamingContext#5626e185)
- field (class: com.mirrtalk.Test$$anonfun$main$1, name: ssc$1, type: class org.apache.spark.streaming.StreamingContext)
- object (class com.mirrtalk.Test$$anonfun$main$1, <function1>)
- field (class: org.apache.spark.streaming.dstream.DStream$$anonfun$transform$1$$anonfun$apply$21, name: cleanedF$2, type: interface scala.Function1)
- object (class org.apache.spark.streaming.dstream.DStream$$anonfun$transform$1$$anonfun$apply$21, <function2>)
- field (class: org.apache.spark.streaming.dstream.DStream$$anonfun$transform$2$$anonfun$5, name: cleanedF$3, type: interface scala.Function2)
- object (class org.apache.spark.streaming.dstream.DStream$$anonfun$transform$2$$anonfun$5, <function2>)
- field (class: org.apache.spark.streaming.dstream.TransformedDStream, name: transformFunc, type: interface scala.Function2)
when I remove code ssc.checkpoint("./checkpoint"), the application can work well, but I need enable checkpoint.
how to fix this issue when enable checkpoint?
You can move context initialization and configuration tasks outside main:
object App {
val sc = new SparkContext(new SparkConf().setAppName("foo").setMaster("local"))
val sec = Seconds(3)
val ssc = new StreamingContext(sc, sec)
ssc.checkpoint("./checkpoint") // enable checkpoint
def main(args: Array[String]) {
val rdd = ssc.sparkContext.parallelize(Seq("a", "b", "c"))
val inputDStream = new ConstantInputDStream(ssc, rdd)
inputDStream.transform(rdd => {
val buf = ListBuffer[String]()
buf += "1"
buf += "2"
buf += "3"
val other_rdd = ssc.sparkContext.parallelize(buf)
rdd.union(other_rdd) // I want to union other RDD
}).print()
ssc.start()
ssc.awaitTermination()
}
}