Connecting to spark streaming with external source like MS SQL server and publishing table data to Kafka.
Getting
java.io.NotSerializableException:org.apache.kafka.clients.producer.KafkaProducer
error.
Please find below deails.
**CustomReceiver.sacla**
package com.sparkdemo.app
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import java.util.List
import java.util.Map
import com.sparkdemo.entity.Inventory
import org.apache.kafka.clients.consumer.{ConsumerRecords, KafkaConsumer}
import java.net.ConnectException
import scala.util.{Try, Success, Failure}
import collection.JavaConversions._
class CustomReceiver(topic: String, kafkaParams: Map[String, Object]) extends Receiver[Inventory](StorageLevel.MEMORY_AND_DISK_2) {
override def onStart(): Unit = {
val dataService = new DataService()
var records: Inventory = dataService.selectAll()
new Thread("Socket Receiver") {
override def run() {
Try {
val consumer = new KafkaConsumer(kafkaParams)
consumer.subscribe(topic)
while (!isStopped && records!=null) {
// store(tokenData)
// tokenData = new DataService().selectAll();
val records = new DataService().selectAll();
store(records)
}
} match {
case e: ConnectException =>
restart("Error connecting to...", e)
case t: Throwable =>
restart("Error receiving data", t)
}
}
}.start()
}
override def onStop(): Unit = {
println("Nothing")
}
}
**DataService.scala**
package com.sparkdemo.app;
import java.sql.Connection
import java.sql.DriverManager
import java.sql.ResultSet
import java.sql.Statement
import java.util._
import scala.collection.JavaConversions._
import com.sparkdemo.entity.Inventory
class DataService {
var connect: Connection = DriverManager.getConnection(
"jdbc:sqlserver://localhost;databaseName=TestDB;user=SA;password=Sqlserver#007")
var statement: Statement = connect.createStatement()
var resultSet: ResultSet = null
var inv: Inventory = new Inventory()
Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver")
def selectAll(): Inventory = {
resultSet = statement.executeQuery("select * from Inventory")
while (resultSet.next()) {
val inv: Inventory = new Inventory()
inv.setId(resultSet.getInt("id"))
inv.setName(resultSet.getString("name"))
inv.setQuantity(resultSet.getInt("quantity"))
}
inv
}
}
Scala main class **Stream.scala**
package com.sparkdemo.app
import org.apache.spark.streaming.dstream.DStream
import com.sparkdemo.config.Config
import org.apache.kafka.common.serialization.{ StringDeserializer, StringSerializer }
import org.apache.kafka.clients.producer.{ ProducerRecord, KafkaProducer }
import java.util.Properties
import collection.JavaConversions._
import com.sparkdemo.entity.Inventory
object Stream extends Serializable{
def main(args: Array[String]) {
import org.apache.spark.streaming._
def getKafkaParams: Map[String, Object] = {
Map[String, Object](
"auto.offset.reset" -> "earliest",
"bootstrap.servers" -> "localhost:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "group3")
}
val properties = new Properties()
properties.put("bootstrap.servers", "localhost:9092")
properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
val topic1 = "topic1"
val topic2 = "topic2"
val producer :KafkaProducer[String, Object] = new KafkaProducer(properties)
val ssc = Config.configReceiver()
val stream = ssc.receiverStream(new CustomReceiver(topic1, getKafkaParams))
stream.map(Inventory=>producer.send(new ProducerRecord[String,Object](topic2,Inventory)))
stream.print()
ssc.start()
ssc.awaitTermination()
}
}
Entity class: **Inventory.scala**
package com.sparkdemo.entity
import scala.beans.{BeanProperty}
class Inventory extends Serializable{
#BeanProperty
var id: Int = _
#BeanProperty
var name: String = _
#BeanProperty
var quantity: Int = _
}
Error:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2287)
at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:547)
at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:547)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:701)
at org.apache.spark.streaming.StreamingContext.withScope(StreamingContext.scala:265)
at org.apache.spark.streaming.dstream.DStream.map(DStream.scala:546)
at com.sparkdemo.app.Stream$.main(Stream.scala:36)
at com.sparkdemo.app.Stream.main(Stream.scala)
Caused by: java.io.NotSerializableException: org.apache.kafka.clients.producer.KafkaProducer
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.producer.KafkaProducer, value: org.apache.kafka.clients.producer.KafkaProducer#557286ad)
- field (class: com.sparkdemo.app.Stream$$anonfun$main$1, name: producer$1, type: class org.apache.kafka.clients.producer.KafkaProducer)
- object (class com.sparkdemo.app.Stream$$anonfun$main$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 12 more
You have ran issue where kafkaproducer is sent unintentionally to executor because of below code
stream.map(Inventory=>producer.send(new ProducerRecordString,Object))
You can mappartitions and create producer in mappartitions so that it is not shipped to executors.
The problem is the type of Serializer you are using for Object type value.
properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
Please write a Serializer to read the Object type values.You can refer below link
Send Custom Java Objects to Kafka Topic
Related
Google is deprecating Android AsyncTask API in Android 11
Google is deprecating Android AsyncTask API in Android 11 and suggesting to use java.util.concurrent instead
My question is that what should be proper replacement of the code snippet shown below using java.util.concurrent
package gaspriceinegypt.gaspriceegypt.gaspriceinegypt
import android.os.AsyncTask
import com.google.android.gms.maps.GoogleMap
import gaspriceinegypt.gaspriceegypt.gaspriceinegypt.DownloadUrl
import org.json.JSONObject
import org.json.JSONArray
import com.google.android.gms.maps.model.LatLng
import com.google.android.gms.maps.model.MarkerOptions
import com.google.android.gms.maps.CameraUpdateFactory
import org.json.JSONException
import java.io.IOException
class FetchData : AsyncTask<Any?, String?, String?>() {
var googleNeatByPlacesData: String? = null
var googleMap: GoogleMap? = null
var url: String? = null
override fun onPostExecute(s: String?) {
try {
val jsonObject = JSONObject(s)
val jsonArray = jsonObject.getJSONArray("results")
for (i in 0 until jsonArray.length()) {
val jsonObject1 = jsonArray.getJSONObject(i)
val getLocation = jsonObject1.getJSONObject("geometry").getJSONObject("location")
val lat = getLocation.getString("lat")
val lng = getLocation.getString("lng")
val getName = jsonArray.getJSONObject(i)
val name = getName.getString("name")
val latLng = LatLng(lat.toDouble(), lng.toDouble())
val markerOptions = MarkerOptions()
markerOptions.title(name)
markerOptions.position(latLng)
googleMap!!.addMarker(markerOptions)
googleMap!!.moveCamera(CameraUpdateFactory.newLatLngZoom(latLng, 15f))
}
} catch (e: JSONException) {
e.printStackTrace()
}
}
override fun doInBackground(vararg objects: Any?): String? {
try {
googleMap = objects[0] as GoogleMap
url = objects[1] as String
val downloadUrl = DownloadUrl()
googleNeatByPlacesData = downloadUrl.retireveUrl(url)
} catch (e: IOException) {
e.printStackTrace()
}
return googleNeatByPlacesData
}
}
As you know Google is deprecating Android AsyncTask API in Android 11.
Alternatives in my mind are:
kotlin coroutines
use self Executor interface implemetion which is a Java standard lib interface
WorkManager which is a android lib
I'm new to Apache Spark and I'm learning how to use it in java. I would like to define and use an user defined function (udf) and I get the scala.MatchError by returning a java.util.HashMap.
Here is my code for extracting hashtags from a tweets dataset and adding a new column with a map of the hashtag and it's number of occurrences in the respective tweet:
// Open spark session
SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("TwitterAnalyticsExample").getOrCreate();
// Load training data
Dataset<Row> twitterData = sparkSession.read().format("json").load(inputFilePath);
UDF1 extractHashtags = new UDF1<String, Map<String, Integer>>() {
#Override
public Map<String, Integer> call(String tweet) throws Exception {
Map<String, Integer> result = new HashMap<>();
Pattern pattern = Pattern.compile("#\\w*");
Matcher matcher = pattern.matcher(tweet);
while (matcher.find()) {
result.merge(matcher.group(), 1, (v1, v2) -> v1 + v2);
}
return result;
}
};
sparkSession.sqlContext().udf().register("extractHashtags", extractHashtags, DataTypes.StringType);
twitterData.limit(50).select(callUDF("extractHashtags", col("text"))).show(20);
and following imports:
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Any hint, what am I doing wrong? Is the return type java.util.Map a problem for UDF? What could I use instead?
I want to use sparksql to only update one of the fields in the ElasticSearch. But it covers it.
Can anyone please explain how can it be done?
Below is my code:
import com.bjvca.utils.PVConstant
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.SQLContext._
import org.elasticsearch.spark.sql._
object DF2ESTest {
def main(args: Array[String]): Unit = {
val conf = conf...
val sc = new SparkContext(conf)
val sqlContext: SQLContext = new SQLContext(sc)
import sqlContext.implicits._
val person = sc.textFile("F:\\mrdata\\person\\input\\person.txt")
.map(_.split(","))
.map(p => Person(p(0), p(0), p(1).trim.toInt)) //first write
// .map(p => Person(p(0),p(0))) //want to update name
.toDF()
person.saveToEs("person/person", Map("es.mapping.id" -> "id"))
}
}
case class Person(id: String, name: String, age: Int)
//case class Person(id:String,name: String)
first:write data to elasticsearch.
second: i want update only name to es,but it covers old data than age is disap
I am using Spark 1.5.0 from CDH 5.5.2 distro. I switched to Scala 2.10.5 from 2.10.4. I am using the following code for UDAF. Is this somehow String vs UTF8String issue? If yes, any help will be greatly appreciated.
object GroupConcat extends UserDefinedAggregateFunction {
def inputSchema = new StructType().add("x", StringType)
def bufferSchema = new StructType().add("buff", ArrayType(StringType))
def dataType = StringType
def deterministic = true
def initialize(buffer: MutableAggregationBuffer) = {
buffer.update(0, ArrayBuffer.empty[String])
}
def update(buffer: MutableAggregationBuffer, input: Row) = {
if (!input.isNullAt(0))
buffer.update(0, buffer.getSeq[String](0) :+ input.getString(0))
}
def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
buffer1.update(0, buffer1.getSeq[String](0) ++ buffer2.getSeq[String](0))
}
def evaluate(buffer: Row) = UTF8String.fromString(
buffer.getSeq[String](0).mkString(","))
}
However, I get this error message at runtime:
Exception in thread "main" java.lang.InternalError: Malformed class name
at java.lang.Class.getSimpleName(Class.java:1190)
at org.apache.spark.sql.execution.aggregate.ScalaUDAF.toString(udaf.scala:464)
at java.lang.String.valueOf(String.java:2847)
at java.lang.StringBuilder.append(StringBuilder.java:128)
at scala.StringContext.standardInterpolator(StringContext.scala:122)
at scala.StringContext.s(StringContext.scala:90)
at org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2.toString(interfaces.scala:96)
at org.apache.spark.sql.catalyst.expressions.Expression.prettyString(Expression.scala:174)
at org.apache.spark.sql.GroupedData$$anonfun$1.apply(GroupedData.scala:86)
at org.apache.spark.sql.GroupedData$$anonfun$1.apply(GroupedData.scala:80)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.sql.GroupedData.toDF(GroupedData.scala:80)
at org.apache.spark.sql.GroupedData.agg(GroupedData.scala:227)
I received the same exception because my object that extends UserDefinedAggregateFunction was inside of another function.
Change this:
object Driver {
def main(args: Array[String]) {
object GroupConcat extends UserDefinedAggregateFunction {
...
}
}
}
To this:
object Driver {
def main(args: Array[String]) {
...
}
object GroupConcat extends UserDefinedAggregateFunction {
...
}
}
I ran into this as a conflict with packages that I was importing. If you are importing anything then try testing on a spark shell with nothing imported.
When you define your UDAF check what the name being returned looks like. It should be something like
FractionOfDayCoverage: org.apache.spark.sql.expressions.UserDefinedAggregateFunction{def dataType: org.apache.spark.sql.types.DoubleType.type; def evaluate(buffer: org.apache.spark.sql.Row): Double} = $anon$1#27506b6d
that $anon$1#27506b6d at the end is a reasonable name that will work. When I imported the conflicting package the name returned was 3 times as long. Here is an example:
$$$$bec6d1991b88c272b3efac29d720f546$$$$anon$1#6886601d
I was trying to run a sample spark streaming code. but I get this error:
16/06/02 15:25:42 ERROR streaming.StreamingContext: Error starting the context, marking it as stopped
java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:161)
at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:542)
at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:601)
at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:600)
at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:624)
at com.streams.spark_consumer.SparkConsumer.main(SparkConsumer.java:56)
Exception in thread "main" java.lang.IllegalArgumentException: requirement failed: No output operations registered, so nothing to execute
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.streaming.DStreamGraph.validate(DStreamGraph.scala:161)
at org.apache.spark.streaming.StreamingContext.validate(StreamingContext.scala:542)
at org.apache.spark.streaming.StreamingContext.liftedTree1$1(StreamingContext.scala:601)
at org.apache.spark.streaming.StreamingContext.start(StreamingContext.scala:600)
at org.apache.spark.streaming.api.java.JavaStreamingContext.start(JavaStreamingContext.scala:624)
at com.streams.spark_consumer.SparkConsumer.main(SparkConsumer.java:56)
My code is given below. I know there are a few unused imports, because I was doing something else and getting the same error so I modified the same code to run the sample program given on the spark streaming website:
package com.streams.spark_consumer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import scala.Tuple2;
import kafka.serializer.StringDecoder;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.Durations;
import org.apache.spark.api.java.JavaSparkContext;
public class SparkConsumer {
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) throws Exception {
System.out.println("Han chal raha hai"); //just to know if this part of the code is executed
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount");
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
System.out.println("Han bola na chal raha hau chutiye 1"); //just to know if this part of the code is executed
JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);
JavaDStream<String> words = lines.flatMap(
new FlatMapFunction<String, String>() {
public Iterable<String> call(String x) {
return Arrays.asList(x.split(" "));
}
});
JavaPairDStream<String, Integer> pairs = words.mapToPair(
new PairFunction<String, String, Integer>() {
public Tuple2<String, Integer> call(String s) {
return new Tuple2<String, Integer>(s, 1);
}
});
JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey(
new Function2<Integer, Integer, Integer>() {
public Integer call(Integer i1, Integer i2) {
return i1 + i2;
}
});
jssc.start();
jssc.awaitTermination();
}
}
Can anybody help me out with this?
I am using local master, even then I have tried starting a master and stoping a master (also slaves), I didn't know why that might help but just in case, I have already tried that.
According to Spark documentation
Since the output operations actually allow the transformed data to be consumed by external systems, they trigger the actual execution of all the DStream transformations (similar to actions for RDDs).
So use any of the output operations after your tranformations.
print()
foreachRDD(func)
saveAsObjectFiles(prefix, [suffix])
saveAsTextFiles(prefix, [suffix])
saveAsHadoopFiles(prefix, [suffix])