how can sparksql update to elasticsearch - apache-spark

I want to use Spark SQL to update only one field of a document in Elasticsearch, but the write overwrites the whole document.
Can anyone please explain how this can be done?
Below is my code:
import com.bjvca.utils.PVConstant
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.SQLContext._
import org.elasticsearch.spark.sql._
// Reads person records from a text file, converts them to a DataFrame and
// writes them to the Elasticsearch resource "person/person".
object DF2ESTest {
def main(args: Array[String]): Unit = {
// The SparkConf construction is elided in the original post.
val conf = conf...
val sc = new SparkContext(conf)
val sqlContext: SQLContext = new SQLContext(sc)
// Brings the implicit RDD-to-DataFrame conversion (.toDF) into scope.
import sqlContext.implicits._
// Each input line is split on "," into columns.
val person = sc.textFile("F:\\mrdata\\person\\input\\person.txt")
.map(_.split(","))
.map(p => Person(p(0), p(0), p(1).trim.toInt)) // first write: id and name both come from column 0, age from column 1
// .map(p => Person(p(0),p(0))) // second run: only id and name, intended to update just the name
.toDF()
// "es.mapping.id" -> "id" uses the `id` column as the Elasticsearch document id.
// NOTE(review): the connector's default write operation presumably replaces the whole
// document with the same id, which would explain why `age` disappears on the second run;
// a partial update likely needs "es.write.operation" set to "update" or "upsert" —
// confirm against the elasticsearch-hadoop configuration documentation.
person.saveToEs("person/person", Map("es.mapping.id" -> "id"))
}
}
// Row shape used for the first write (id, name, age).
case class Person(id: String, name: String, age: Int)
// Alternative row shape for the name-only update attempt (no age field).
//case class Person(id:String,name: String)
First: I write the data to Elasticsearch.
Second: I want to update only the name in ES, but the write overwrites the old document, so the age field disappears.

Related

Not able import spark SQL in Maven

I am trying to import Spark SQL, but the import fails to compile. I am not sure what mistake I am making; I am just a beginner.
package MySource
import java.sql.{DriverManager, ResultSet}
import org.apache.spark.sql.SparkSession
import java.util.Properties
/**
 * Minimal Spark SQL entry point: prints a greeting, then obtains a local
 * SparkSession named "SparkSQL" and imports its SQL implicits.
 *
 * NOTE(review): `SparkSession` is only available from Spark 2.0 onwards; the
 * "not a member of package org.apache.spark.sql" error quoted with this snippet
 * suggests an older spark-sql artifact on the classpath — confirm in the build file.
 */
object MyCalc {

  def main(args: Array[String]): Unit = {
    println("This is my first Spark")

    // Runs locally on all cores. The YARN master, Hive support and custom
    // warehouse-dir options from the original post remain disabled.
    val sessionBuilder = SparkSession.builder().appName("SparkSQL").master("local[*]")
    val spark = sessionBuilder.getOrCreate()

    import spark.sqlContext.implicits._
  }
}
Error:(3, 8) object SparkSession is not a member of package org.apache.spark.sql
import org.apache.spark.sql.SparkSession
Error:(15, 17) not found: value SparkSession
val spark = SparkSession

Error java.io.NotSerializableException: org.apache.kafka.clients.producer.KafkaProducer

I am connecting Spark Streaming to an external source (MS SQL Server) through a custom receiver and publishing the table data to Kafka.
I am getting a
java.io.NotSerializableException: org.apache.kafka.clients.producer.KafkaProducer
error.
Please find the details below.
**CustomReceiver.scala**
package com.sparkdemo.app
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import java.util.List
import java.util.Map
import com.sparkdemo.entity.Inventory
import org.apache.kafka.clients.consumer.{ConsumerRecords, KafkaConsumer}
import java.net.ConnectException
import scala.util.{Try, Success, Failure}
import collection.JavaConversions._
/**
 * Spark Streaming receiver that repeatedly queries the database through
 * `DataService.selectAll()` and hands every result to Spark via `store`.
 *
 * @param topic       Kafka topic the consumer subscribes to (the consumer is
 *                    subscribed but never polled in this class)
 * @param kafkaParams configuration passed to the `KafkaConsumer` constructor
 */
class CustomReceiver(topic: String, kafkaParams: Map[String, Object]) extends Receiver[Inventory](StorageLevel.MEMORY_AND_DISK_2) {

  /**
   * Starts the background receiving thread and returns immediately; all
   * blocking work (JDBC, Kafka) happens on that thread, not in `onStart`.
   */
  override def onStart(): Unit = {
    new Thread("Socket Receiver") {
      override def run(): Unit = receive()
    }.start()
  }

  /** Nothing to clean up here: the receiving thread exits once `isStopped` is true. */
  override def onStop(): Unit = {
    println("Nothing")
  }

  /**
   * Receiving loop. Any failure is turned into a `restart` request; the
   * original code matched exception types directly against the `Try` value,
   * which can never match a `Success`/`Failure`.
   */
  private def receive(): Unit = {
    Try {
      val consumer = new KafkaConsumer(kafkaParams)
      try {
        consumer.subscribe(topic)
        // One DataService (one JDBC connection) for the whole loop instead of
        // opening a new connection on every iteration.
        val dataService = new DataService()
        while (!isStopped) {
          store(dataService.selectAll())
        }
      } finally {
        // Release the consumer's sockets whether the loop ended or failed.
        consumer.close()
      }
    } match {
      case Success(_) =>
        () // loop ended because the receiver was stopped
      case Failure(e: ConnectException) =>
        restart("Error connecting to...", e)
      case Failure(t) =>
        restart("Error receiving data", t)
    }
  }
}
**DataService.scala**
package com.sparkdemo.app;
import java.sql.Connection
import java.sql.DriverManager
import java.sql.ResultSet
import java.sql.Statement
import java.util._
import scala.collection.JavaConversions._
import com.sparkdemo.entity.Inventory
/**
 * Thin JDBC helper around the `Inventory` table of the local SQL Server
 * `TestDB` database. A connection and a statement are opened when the
 * instance is constructed.
 */
class DataService {
  // Register the JDBC driver BEFORE the first DriverManager.getConnection call;
  // field initializers run in declaration order.
  Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver")

  // NOTE(review): credentials are hard-coded in the URL; move them to configuration.
  var connect: Connection = DriverManager.getConnection(
    "jdbc:sqlserver://localhost;databaseName=TestDB;user=SA;password=Sqlserver#007")
  var statement: Statement = connect.createStatement()
  var resultSet: ResultSet = null
  // Most recently read row; an empty Inventory until a row has been read.
  var inv: Inventory = new Inventory()

  /**
   * Runs `select * from Inventory` and returns the last row read.
   *
   * The original declared a new local `inv` inside the loop, shadowing the
   * field, so the populated object was discarded and the empty field instance
   * was always returned.
   *
   * @return the last row of the result set, or the previously held `inv`
   *         (never null) when the query returns no rows
   */
  def selectAll(): Inventory = {
    resultSet = statement.executeQuery("select * from Inventory")
    try {
      while (resultSet.next()) {
        val row = new Inventory()
        row.setId(resultSet.getInt("id"))
        row.setName(resultSet.getString("name"))
        row.setQuantity(resultSet.getInt("quantity"))
        inv = row
      }
    } finally {
      // Free the cursor as soon as the rows have been consumed.
      resultSet.close()
    }
    inv
  }
}
Scala main class **Stream.scala**
package com.sparkdemo.app
import org.apache.spark.streaming.dstream.DStream
import com.sparkdemo.config.Config
import org.apache.kafka.common.serialization.{ StringDeserializer, StringSerializer }
import org.apache.kafka.clients.producer.{ ProducerRecord, KafkaProducer }
import java.util.Properties
import collection.JavaConversions._
import com.sparkdemo.entity.Inventory
/**
 * Streaming job: receives `Inventory` records through `CustomReceiver` and
 * republishes each one to the Kafka topic "topic2".
 */
object Stream extends Serializable {

  /**
   * Builds the receiver stream, wires the Kafka output and blocks until the
   * streaming context terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    import org.apache.spark.streaming._

    // Consumer settings handed to the receiver.
    def getKafkaParams: Map[String, Object] = {
      Map[String, Object](
        "auto.offset.reset" -> "earliest",
        "bootstrap.servers" -> "localhost:9092",
        "key.deserializer" -> classOf[StringDeserializer],
        "value.deserializer" -> classOf[StringDeserializer],
        "group.id" -> "group3")
    }

    // Producer settings. java.util.Properties is serializable, so unlike a
    // KafkaProducer instance it can safely be captured by the closures below.
    val properties = new Properties()
    properties.put("bootstrap.servers", "localhost:9092")
    properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val topic1 = "topic1"
    val topic2 = "topic2"

    // Presumably returns the StreamingContext for this job — defined in com.sparkdemo.config.Config.
    val ssc = Config.configReceiver()
    val stream = ssc.receiverStream(new CustomReceiver(topic1, getKafkaParams))

    // The original created the KafkaProducer on the driver and referenced it inside
    // `stream.map`, which forces Spark to serialize the producer and fails with
    // NotSerializableException. It also discarded the result of the lazy `map`,
    // so nothing would have been sent. Here the producer is created on the
    // executor, once per partition, inside an output operation.
    stream.foreachRDD { rdd =>
      rdd.foreachPartition { inventories =>
        val producer = new KafkaProducer[String, String](properties)
        try {
          inventories.foreach { inventory =>
            // StringSerializer only accepts String values, so the record is encoded
            // as "id,name,quantity". NOTE(review): placeholder encoding — replace
            // with JSON or a custom Serializer if consumers need a richer format.
            val payload = s"${inventory.getId},${inventory.getName},${inventory.getQuantity}"
            producer.send(new ProducerRecord[String, String](topic2, payload))
          }
        } finally {
          // close() flushes pending records before releasing the connection.
          producer.close()
        }
      }
    }

    stream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
Entity class: **Inventory.scala**
package com.sparkdemo.entity
import scala.beans.{BeanProperty}
/**
 * Serializable bean holding one inventory row.
 *
 * `@BeanProperty` generates Java-style `getX`/`setX` accessors for each field
 * (the snippet had `#BeanProperty`, which is not valid Scala).
 */
class Inventory extends Serializable {
  /** Row identifier. */
  @BeanProperty
  var id: Int = _

  /** Item name. */
  @BeanProperty
  var name: String = _

  /** Quantity for the item. */
  @BeanProperty
  var quantity: Int = _
}
Error:
Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2287)
at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:547)
at org.apache.spark.streaming.dstream.DStream$$anonfun$map$1.apply(DStream.scala:547)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.SparkContext.withScope(SparkContext.scala:701)
at org.apache.spark.streaming.StreamingContext.withScope(StreamingContext.scala:265)
at org.apache.spark.streaming.dstream.DStream.map(DStream.scala:546)
at com.sparkdemo.app.Stream$.main(Stream.scala:36)
at com.sparkdemo.app.Stream.main(Stream.scala)
Caused by: java.io.NotSerializableException: org.apache.kafka.clients.producer.KafkaProducer
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.producer.KafkaProducer, value: org.apache.kafka.clients.producer.KafkaProducer@557286ad)
- field (class: com.sparkdemo.app.Stream$$anonfun$main$1, name: producer$1, type: class org.apache.kafka.clients.producer.KafkaProducer)
- object (class com.sparkdemo.app.Stream$$anonfun$main$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:295)
... 12 more
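You have run into this issue because the KafkaProducer is unintentionally captured by the closure below, so Spark tries to serialize it in order to ship it to the executors:
stream.map(Inventory=>producer.send(new ProducerRecord[String,Object](topic2,Inventory)))
Use mapPartitions (or foreachPartition) and create the producer inside it, so that the producer is created on the executors instead of being serialized and shipped from the driver.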
The problem is the type of Serializer you are using for Object type value.
properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
Please write a custom Serializer that can handle the Object-type values. You can refer to the link below:
Send Custom Java Objects to Kafka Topic

How To save DataFrame Into Cassandra table using Spark Java API

I want to save a DataFrame into a Cassandra table using the Spark Java API.
I want to add the saving step to the following code.
I want to save the people DataFrame into a Cassandra table and run queries on that Cassandra table.
import org.apache.spark.api.java.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import com.datastax.spark.connector.cql.CassandraConnector;
import com.datastax.spark.connector.japi.CassandraRow;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
public class SimpleApp {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setAppName("Simple Application");
conf.setMaster("local");
conf.set("spark.cassandra.connection.host", "localhost");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
DataFrame people = sqlContext.read().json("/root/people.json");
people.printSchema();
people.registerTempTable("people");
**//I want to save this TempTable or people dataframe into cassandra table and make teenagers SQL query on that cassandra table**
DataFrame teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
teenagers.show();
}
}

How to save Dataset<Row> in mySQL in spark?

I am using a Spark standalone cluster in my scenario. I want to read a JSON file from Azure Data Lake, run some queries over it with Spark SQL, and save the result into a MySQL database. I don't know how to do it. A little help would be great.
package com.biz.Read_from_ADL;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
 * Reads one JSON file from Azure Data Lake, exposes it as the temp view
 * "trade" and prints the result of selecting every row from that view.
 */
public class App {
    public static void main(String[] args) throws Exception {
        final String sourcePath =
                "adl://pare.azuredatalakestore.net/EXCHANGE_DATA/BITFINEX/ETHBTC/MIDPOINT/BITFINEX_ETHBTC_MIDPOINT_2017-06-25.json";

        SparkSession session = SparkSession.builder()
                .appName("Java Spark SQL basic example")
                .getOrCreate();

        // Load the file and make it queryable by name.
        Dataset<Row> trades = session.read().json(sourcePath);
        trades.createOrReplaceTempView("trade");

        // Query the view and print the result to stdout.
        Dataset<Row> allTrades = session.sql("SELECT * FROM trade");
        allTrades.show();
    }
}
You need to first define the connection properties and jdbc url.
import java.util.Properties
// Connection properties handed to the JDBC writer; replace the placeholder
// user name and password with real credentials.
val connectionProperties = new Properties()
connectionProperties.put("user", "USER_NAME")
connectionProperties.put("password", "PASSWORD")
// Placeholder: the JDBC URL is elided in the original answer.
val jdbc_url = ... // <- use mysql url
import org.apache.spark.sql.SaveMode
// Example: take 10 rows from `diamonds`, rename column "table" to "table_number",
// and write the result to the JDBC table "diamonds_mysql".
spark.sql("select * from diamonds limit 10").withColumnRenamed("table", "table_number")
.write
.mode(SaveMode.Append) // <--- Append to the existing table
.jdbc(jdbc_url, "diamonds_mysql", connectionProperties)
Refer here for more detail.

how to convert directstream from kafka into data frames in spark 1.3.0

After creating a direct stream like below:
// Direct Kafka stream with String keys and String values, both decoded with StringDecoder.
// `ssc`, `kafkaParams` and `topicsSet` are defined outside this snippet.
val events = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
ssc, kafkaParams, topicsSet)
I would like to convert the above stream into data frames, so that I could run hive queries over it. Could anyone please explain how this can be achieved? I am using spark version 1.3.0
As explained in the Spark Streaming programming guide, try this:
import org.apache.spark.sql.SQLContext
/** Lazily created holder for a single SQLContext. */
object SQLContextSingleton {
  // `@transient` (the snippet had `#transient`, which is not valid Scala)
  // excludes the cached context from serialization.
  @transient private var instance: SQLContext = null

  /**
   * Returns the shared SQLContext, creating it on first use.
   *
   * @param sparkContext context used to build the SQLContext on the first call;
   *                     ignored once an instance exists
   * @return the single SQLContext held by this object
   */
  def getInstance(sparkContext: org.apache.spark.SparkContext): SQLContext = synchronized {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
// Schema for one Kafka message: the record key and its value.
case class Row(key: String, value: String)

// For every micro-batch, convert the (key, value) pairs to a DataFrame and print it.
// The stream is named `events` where it is created; the snippet had the typo `eventss`.
events.foreachRDD { rdd =>
  val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
  // Brings the implicit RDD-to-DataFrame conversion (.toDF) into scope.
  import sqlContext.implicits._
  val dataFrame = rdd.map { case (key, value) => Row(key, value) }.toDF()
  dataFrame.show()
}

Resources