Spark and Drools integration (Reading rules from a drl file) - apache-spark

I am working on a Spark program that takes input from an RDD and runs a few Drools rules on it, reading the rules from a .drl file.
In the .drl file I have written a rule that increments the counter attribute of the object by 1 whenever its hz attribute is 0.
I have no clue why that is not working: it gives me an output of 0 for all the data in the stream (yes, there is data with the hz attribute equal to 0, and yes, I can print all the attributes and verify that even for those records counter is 0).
I am using the KieSessionFactory class that I found in a GitHub project here: https://github.com/mganta/sprue/blob/master/src/main/java/com/cloudera/sprue/KieSessionFactory.java
But I am quite sure this part is not where the problem lies; it only reads from the .drl file and applies the rules.
Below is my Scala code (I have marked the part where I think the problem lies, but please take a look at the .drl file first):
package com.streams.Scala_Consumer
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.SparkContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.{ DStream, InputDStream, ConstantInputDStream }
import org.apache.spark.streaming.kafka.v09.KafkaUtils
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.sql.functions.avg
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.kafka.producer._
import org.apache.kafka.common.serialization.{ Deserializer, Serializer }
import org.apache.kafka.common.serialization.StringSerializer
import org.kie.api.runtime.StatelessKieSession
//import KieSessionFactory.getKieSession;
//import Sensor
object scala_consumer extends Serializable {
// schema for sensor data
class Sensor(resid_1: String, date_1: String, time_1: String, hz_1: Double, disp_1: Double, flo_1: Double, sedPPM_1: Double, psi_1: Double, chlPPM_1: Double, counter_1: Int) extends Serializable
{
var resid = resid_1
var date = date_1
var time = time_1
var hz = hz_1
var disp = disp_1
var flo = flo_1
var sedPPM = sedPPM_1
var psi = psi_1
var chlPPM = chlPPM_1
var counter = counter_1
def IncrementCounter (param: Int) =
{
counter = counter + param
}
}
// function to parse line of sensor data into Sensor class
def parseSensor(str: String): Sensor = {
val p = str.split(",")
//println("printing p: " + p)
new Sensor(p(0), p(1), p(2), p(3).toDouble, p(4).toDouble, p(5).toDouble, p(6).toDouble, p(7).toDouble, p(8).toDouble, 0)
}
var counter = 0
val timeout = 10 // Terminate after N seconds
val batchSeconds = 2 // Size of batch intervals
def main(args: Array[String]): Unit = {
val brokers = "maprdemo:9092" // not needed for MapR Streams, needed for Kafka
val groupId = "testgroup"
val offsetReset = "latest"
val batchInterval = "2"
val pollTimeout = "1000"
val topics = "/user/vipulrajan/streaming/original:sensor"
val topica = "/user/vipulrajan/streaming/fail:test"
val xlsFileName = "./src/main/Rules.drl"
val sparkConf = new SparkConf().setAppName("SensorStream").setMaster("local[1]").set("spark.testing.memory", "536870912")
.set("spark.streaming.backpressure.enabled", "true")
.set("spark.streaming.receiver.maxRate", Integer.toString(2000000))
.set("spark.streaming.kafka.maxRatePerPartition", Integer.toString(2000000));
val ssc = new StreamingContext(sparkConf, Seconds(batchInterval.toInt))
// Create direct kafka stream with brokers and topics
val topicsSet = topics.split(",").toSet
val kafkaParams = Map[String, String](
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
ConsumerConfig.GROUP_ID_CONFIG -> groupId,
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG ->
"org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG ->
"org.apache.kafka.common.serialization.StringDeserializer",
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> offsetReset,
ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false",
"spark.kafka.poll.time" -> pollTimeout
)
val producerConf = new ProducerConf(
bootstrapServers = brokers.split(",").toList
)
val messages = KafkaUtils.createDirectStream[String, String](ssc, kafkaParams, topicsSet)
val values: DStream[String] = messages.map(_._2)
println("message values received")
//values.print(10)
///////////*************************PART THAT COULD BE CAUSING A PROBLEM**************************/////////////
values.foreachRDD(x => try{
print("did 1\n") //markers for manual and minor debugging
val myData = x.mapPartitions(s => {s.map(sens => {parseSensor(sens)})})
//myData.collect().foreach(println)
//println(youData.date)
print("did 2\n")
val evalData = myData.mapPartitions(s => {
val ksession = KieSessionFactory.getKieSession(xlsFileName)
val retData = s.map(sens => {ksession.execute(sens); sens;})
retData
})
evalData.foreach(t => {println(t.counter)})
print("did 3\n")
}
catch{case e1: ArrayIndexOutOfBoundsException => println("exception in line " )})
///////////*************************PART THAT COULD BE CAUSING A PROBLEM**************************/////////////
println("filtered alert messages ")
// Start the computation
ssc.start()
// Wait for the computation to terminate
ssc.awaitTermination()
}
}
The .drl file:
package droolsexample
import com.streams.Scala_Consumer.Sensor;
import scala.com.streams.Scala_Consumer.Sensor; //imported because my rules file lies in the src/main folder
//and code lies in src/main/scala
// declare any global variables here
dialect "java"
rule "Counter Incrementer"
when
sens : Sensor (hz == 0)
then
sens.IncrementCounter(1);
end
I have tried using an XLS file instead of the .drl file, I have tried creating the class in Java and the object in Scala, and I have tried a lot of other things, but all I get in the output is this warning:
6/06/27 16:38:30.462 Executor task launch worker-0 WARN AbstractKieModule: No files found for KieBase defaultKieBase
and when I print the counter values I get all zeroes. Can anybody come to the rescue?

When you are doing the spark-submit and passing your JAR for execution, please ensure that the other dependency JARs from KIE etc. are also included within the same JAR, and then run it with spark-submit.
An alternative is to have two separate projects: one for your Spark program and another for your KIE project. You will then have two JARs, and you run it something like below:
nohup spark-submit --conf "spark.driver.extraJavaOptions=-Dlog4j.configuration=file:/log4j.properties" \
--queue abc \
--master yarn \
--deploy-mode cluster \
--jars drools-kie-project-0.0.1-SNAPSHOT.jar --class com.abc.DroolsSparkJob SparkcallingDrools-0.0.1-SNAPSHOT.jar \
-inputfile /user/hive/warehouse/abc/* -output /user/hive/warehouse/drools-Op > app.log &
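If you take the single fat-JAR route with sbt-assembly (assuming sbt is your build tool), note that the Drools/KIE jars each carry a service descriptor under META-INF that the default merge strategy tends to drop, which typically shows up as exactly the "No files found for KieBase defaultKieBase" warning. A minimal build.sbt sketch that keeps those descriptors; the Drools version and the plugin setup are assumptions to adapt to your project:
// build.sbt sketch -- assumes the sbt-assembly plugin is enabled; 6.4.0.Final is an assumed version
libraryDependencies ++= Seq(
  "org.kie"    % "kie-api"         % "6.4.0.Final",
  "org.drools" % "drools-core"     % "6.4.0.Final",
  "org.drools" % "drools-compiler" % "6.4.0.Final"
)
assemblyMergeStrategy in assembly := {
  // META-INF/kie.conf exists in several Drools jars; concatenate them instead of
  // keeping only one copy, otherwise the KieContainer may discover no KieBase at all
  case PathList("META-INF", "kie.conf") => MergeStrategy.concat
  case x =>
    val oldStrategy = (assemblyMergeStrategy in assembly).value
    oldStrategy(x)
}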

Related

Neo4j thinks that password is database

I am trying to integrate Spark and Neo4j. I am new to Neo4j. I have the following short Spark app:
import com.typesafe.config._
import org.apache.spark.sql.SparkSession
import org.neo4j.spark._
object Neo4jStorer {
var conf :Config = null
def main(args: Array[String]): Unit = {
val spark = getSparkSession()
val sc = spark.sparkContext
val g = Neo4jGraph.loadGraph(sc, label1="a", relTypes=Seq("rel"), label2 = "b")
val vCount = g.toString
println("Count= " + vCount)
}
def getSparkSession(): SparkSession = {
SparkSession
.builder
.appName("SparkNeo4j")
.config("spark.neo4j.bolt.url", "neo4j://127.0.0.1:7687")
.config("spark.neo4j.bolt.user", "neo4j")
.config("spark.neo4j.bolt.password", "FakePassword")
.getOrCreate()
}
}
I used https://neo4j.com/blog/neo4j-3-0-apache-spark-connector/ as an example for this code, as I am using Spark 3.0. When I run it I get the following:
20/10/17 14:36:36 ERROR LoadBalancer: Failed to update routing table for database 'FakePassword'. Current routing table: Ttl 1602963396190, currentTime 1602963396527, routers AddressSet=[], writers AddressSet=[], readers AddressSet=[], database 'FakePassword'.
org.neo4j.driver.exceptions.FatalDiscoveryException: Unable to get a routing table for database 'FakePassword' because this database does not exist
If I change the password I get an authentication error, and again the incorrect password is shown as if it were a database. I created a database named FakePassword and still got the same error. Why is this happening, and how can I fix it?
Also, when I try to get g.vertices.count as shown in the example I am following, I get a compilation error.
With the code below I am able to get data from a DataFrame into Neo4j, which is what I really wanted to do. It does not seem to be the ideal solution, as it uses foreach; I am open to improvements.
import com.typesafe.config._
import org.apache.spark.sql.SparkSession
import org.neo4j.driver.{AuthTokens, GraphDatabase, Session}
import org.neo4j.spark._
object StackoverflowAnswer {
def main(args: Array[String]): Unit = {
val spark = getSparkSession()
val sc = spark.sparkContext
import spark.implicits._
val df = sc.parallelize(List(1, 2, 3)).toDF
df.foreach(
row => {
val query = "CREATE (n:NumLable {num: " + row.get(0).toString +"})"
Neo4jSess.session.run(query)
()
}
)
}
def getSparkSession(): SparkSession = {
SparkSession
.builder
.appName("SparkNeo4j")
.getOrCreate()
}
}
object Neo4jSess {
/**
* Store a Neo4j session in an object so that it can be used by Spark
*/
var conf :Config = null
this.conf = ConfigFactory.load().getConfig("DeltaStorer")
val neo4jUrl: String = "bolt://127.0.0.1:7687"
val neo4jUser: String = "neo4j"
val neo4jPassword: String = "FakePassword"
val driver = GraphDatabase.driver(neo4jUrl, AuthTokens.basic(neo4jUser, neo4jPassword))
val session: Session = driver.session()
}
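One incremental improvement on the foreach approach above, while staying with the plain Java driver, is to open the driver and session once per partition on the executor and close them there, and to use a parameterized query instead of string concatenation. A sketch under those assumptions (the URL, the credentials and the NumLabel label are placeholders):
import org.apache.spark.sql.SparkSession
import org.neo4j.driver.{AuthTokens, GraphDatabase, Values}
object PartitionedNeo4jWriter {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("SparkNeo4j").getOrCreate()
    import spark.implicits._
    val df = spark.sparkContext.parallelize(List(1, 2, 3)).toDF("num")
    df.rdd.foreachPartition { rows =>
      // one driver and session per partition, created and closed on the executor
      val driver = GraphDatabase.driver("bolt://127.0.0.1:7687",
        AuthTokens.basic("neo4j", "FakePassword")) // placeholder credentials
      val session = driver.session()
      try {
        rows.foreach { row =>
          // parameterized query instead of building the Cypher string by hand
          session.run("CREATE (n:NumLabel {num: $num})",
            Values.parameters("num", Int.box(row.getInt(0))))
        }
      } finally {
        session.close()
        driver.close()
      }
    }
    spark.stop()
  }
}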
Please try to update spark-defaults.conf:
spark.jars.packages neo4j-contrib:neo4j-spark-connector:2.4.5-M2
spark.neo4j.url bolt://XX.XXX.X.XXX:7687
spark.neo4j.user neo4j
spark.neo4j.password test
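Note that neo4j-contrib:neo4j-spark-connector:2.4.5-M2 targets Spark 2.x, while the question uses Spark 3.0. With Spark 3 the newer Neo4j Connector for Apache Spark is the usual route, and it writes DataFrames directly instead of going through foreach. A hedged sketch (the package coordinates, version and option names are from memory, so please verify them against the connector documentation):
// e.g. spark-shell --packages org.neo4j:neo4j-connector-apache-spark_2.12:4.1.5_for_spark_3
import org.apache.spark.sql.{SaveMode, SparkSession}
val spark = SparkSession.builder.appName("SparkNeo4j").getOrCreate()
import spark.implicits._
val df = Seq(1, 2, 3).toDF("num")
df.write
  .format("org.neo4j.spark.DataSource")
  .mode(SaveMode.Append)
  .option("url", "bolt://127.0.0.1:7687")
  .option("authentication.basic.username", "neo4j")
  .option("authentication.basic.password", "FakePassword")
  .option("labels", ":NumLabel")
  .save()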

Serialization of transform function in checkpointing

I'm trying to understand Spark Streaming's RDD transformations and checkpointing in the context of serialization. Consider the following example Spark Streaming app:
private val helperObject = HelperObject()
private def createStreamingContext(): StreamingContext = {
val conf = new SparkConf()
.setAppName(Constants.SparkAppName)
.setIfMissing("spark.master", Constants.SparkMasterDefault)
implicit val streamingContext = new StreamingContext(
new SparkContext(conf),
Seconds(Constants.SparkStreamingBatchSizeDefault))
val myStream = StreamUtils.createStream()
myStream.transform(transformTest(_)).print()
streamingContext
}
def transformTest(rdd: RDD[String]): RDD[String] = {
rdd.map(str => helperObject.doSomething(str))
}
val ssc = StreamingContext.getOrCreate(Settings.progressDir,
createStreamingContext)
ssc.start()
while (true) {
helperObject.setData(...)
}
From what I've read in other SO posts, transformTest will be invoked on the driver program once for every batch after streaming starts. Assuming createStreamingContext is invoked (no checkpoint is available), I would expect that the instance of helperObject defined up top would be serialized out to workers once per batch, hence picking up the changes applied to it via helperObject.setData(...). Is this the case?
Now, if createStreamingContext is not invoked (a checkpoint is available), then I would expect that the instance of helperObject cannot possibly be picked up for each batch, since it can't have been captured if createStreamingContext is not executed. Spark Streaming must have serialized helperObject as part of the checkpoint, correct?
Is it possible to update helperObject throughout execution from the driver program when using checkpointing? If so, what's the best approach?
Will helperObject be serialized to each executor?
Answer: yes.
val helperObject = Instantiate_SomeHow()
rdd.map{_.SomeFunctionUsing(helperObject)}
Spark Streaming must have serialized helperObject as part of the checkpoint, correct?
Answer: yes.
If you wish to refresh the behaviour of helperObject for each RDD operation, you can still do that by making helperObject more intelligent: do not send helperObject itself, but send a function with the signature () => helperObject_Class.
Since it is a function, it is serializable. This is a very common design pattern for shipping objects that are not serializable, for example a database connection object, and it fits your use case as well.
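A minimal sketch of that pattern (the Helper class and method names here are invented for illustration; only the factory function is shipped, so Helper itself does not need to be Serializable):
import org.apache.spark.rdd.RDD
// illustrative stand-in for helperObject's class
class Helper {
  def doSomething(s: String): String = s.toUpperCase
}
def transformTest(rdd: RDD[String], makeHelper: () => Helper): RDD[String] =
  rdd.mapPartitions { iter =>
    val helper = makeHelper() // constructed lazily on each executor, once per partition
    iter.map(helper.doSomething)
  }
// usage on the driver:
// myStream.transform(rdd => transformTest(rdd, () => new Helper()))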
A fuller example is given below, from the "Kafka exactly-once semantics using a database" pattern:
package example
import kafka.serializer.StringDecoder
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import scalikejdbc._
import com.typesafe.config.ConfigFactory
import org.apache.spark.{SparkContext, SparkConf, TaskContext}
import org.apache.spark.SparkContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, HasOffsetRanges, OffsetRange}
/** exactly-once semantics from kafka, by storing offsets in the same transaction as the results
Offsets and results will be stored per-batch, on the driver
*/
object TransactionalPerBatch {
def main(args: Array[String]): Unit = {
val conf = ConfigFactory.load
val kafkaParams = Map(
"metadata.broker.list" -> conf.getString("kafka.brokers")
)
val jdbcDriver = conf.getString("jdbc.driver")
val jdbcUrl = conf.getString("jdbc.url")
val jdbcUser = conf.getString("jdbc.user")
val jdbcPassword = conf.getString("jdbc.password")
val ssc = setupSsc(kafkaParams, jdbcDriver, jdbcUrl, jdbcUser, jdbcPassword)()
ssc.start()
ssc.awaitTermination()
}
def setupSsc(
kafkaParams: Map[String, String],
jdbcDriver: String,
jdbcUrl: String,
jdbcUser: String,
jdbcPassword: String
)(): StreamingContext = {
val ssc = new StreamingContext(new SparkConf, Seconds(60))
SetupJdbc(jdbcDriver, jdbcUrl, jdbcUser, jdbcPassword)
// begin from the the offsets committed to the database
val fromOffsets = DB.readOnly { implicit session =>
sql"select topic, part, off from txn_offsets".
map { resultSet =>
TopicAndPartition(resultSet.string(1), resultSet.int(2)) -> resultSet.long(3)
}.list.apply().toMap
}
val stream: InputDStream[(String,Long)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, Long)](
ssc, kafkaParams, fromOffsets,
// we're just going to count messages per topic, don't care about the contents, so convert each message to (topic, 1)
(mmd: MessageAndMetadata[String, String]) => (mmd.topic, 1L))
stream.foreachRDD { rdd =>
// Note this block is running on the driver
// Cast the rdd to an interface that lets us get an array of OffsetRange
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// simplest possible "metric", namely a count of messages per topic
// Notice the aggregation is done using spark methods, and results collected back to driver
val results = rdd.reduceByKey {
// This is the only block of code running on the executors.
// reduceByKey did a shuffle, but that's fine, we're not relying on anything special about partitioning here
_+_
}.collect
// Back to running on the driver
// localTx is transactional, if metric update or offset update fails, neither will be committed
DB.localTx { implicit session =>
// store metric results
results.foreach { pair =>
val (topic, metric) = pair
val metricRows = sql"""
update txn_data set metric = metric + ${metric}
where topic = ${topic}
""".update.apply()
if (metricRows != 1) {
throw new Exception(s"""
Got $metricRows rows affected instead of 1 when attempting to update metrics for $topic
""")
}
}
// store offsets
offsetRanges.foreach { osr =>
val offsetRows = sql"""
update txn_offsets set off = ${osr.untilOffset}
where topic = ${osr.topic} and part = ${osr.partition} and off = ${osr.fromOffset}
""".update.apply()
if (offsetRows != 1) {
throw new Exception(s"""
Got $offsetRows rows affected instead of 1 when attempting to update offsets for
${osr.topic} ${osr.partition} ${osr.fromOffset} -> ${osr.untilOffset}
Was a partition repeated after a worker failure?
""")
}
}
}
}
ssc
}
}
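SetupJdbc is not shown in the snippet above; a minimal version, assuming it just initialises ScalikeJDBC's connection pool in the usual way, would be:
object SetupJdbc {
  def apply(driver: String, url: String, user: String, password: String): Unit = {
    Class.forName(driver)                         // load the JDBC driver
    ConnectionPool.singleton(url, user, password) // scalikejdbc global pool used by the DB.* blocks
  }
}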

spark streaming: use of broadcast variable generates NotSerializableException

I'm doing some tests in spark-shell after having loaded a jar with the Twitter utilities. Here is a code sequence that works:
// launch:
// spark-shell --driver-memory 1g --master local[3] --jars target/scala-2.10/tweetProcessing-1.0.jar
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.twitter._
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.StreamingContext._
val consumerKey = ...
val consumerSecret = ...
val accessToken = ...
val accessTokenSecret = ...
System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
System.setProperty("twitter4j.oauth.accessToken", accessToken)
System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
val ssc = new StreamingContext(sc, Seconds(60))
val tweetStream = TwitterUtils.createStream(ssc, None)
val myNewStream = tweetStream.map(tweet => tweet.getText)
.map(tweetText => tweetText.toLowerCase.split("\\W+"))
.transform(rdd =>
rdd.map(tweetWordSeq => {
tweetWordSeq.foreach { word => {
val mySet = Set("apple", "orange");
if(!(mySet)(word)) word }
}
}))
myNewStream.foreachRDD((rdd,time) => {
println("%s at time %s".format(rdd.count(),time.milliseconds))
})
ssc.start()
(Actually I reduced the computation to a minimum, just to highlight the problem.) Here mySet is serialized and everything goes well.
But when I use a broadcast variable instead and replace the test accordingly:
val ssc = new StreamingContext(sc, Seconds(60))
val mySet = sc.broadcast(Set("apple", "orange"))
val tweetStream = TwitterUtils.createStream(ssc, None)
val myNewStream = tweetStream.map(tweet => tweet.getText)
.map(tweetText => tweetText.toLowerCase.split("\\W+"))
.transform(rdd =>
rdd.map(tweetWordSeq => {
tweetWordSeq.foreach { word => {
if(!(mySet.value)(word)) word }
}
}))
myNewStream.foreachRDD((rdd,time) => {
println("%s at time %s".format(rdd.count(),time.milliseconds))
})
ssc.start()
I get:
ERROR JobScheduler: Error generating jobs for time 1464335160000 ms
org.apache.spark.SparkException: Task not serializable
...
Caused by: java.io.NotSerializableException: Object of org.apache.spark.streaming.dstream.TransformedDStream is being serialized possibly as a part of closure of an RDD operation. This is because the DStream object is being referred to from within the closure. Please rewrite the RDD operation inside this DStream to avoid this. This has been enforced to avoid bloating of Spark tasks with unnecessary objects.
I would naturally prefer to use broadcast variables (my set is actually a rather large set of stop words) but I don't quite see where the problem comes from.
You need to create the broadcast variable on the driver (outside of any closures), not within any transformation such as transform, foreachRDD, etc.:
val ssc = new StreamingContext(sc, Seconds(60))
val mySet = ssc.sparkContext.broadcast(Set("apple", "orange"))
Then you can access the broadcast variable within transform or other DStream closures on the executors, like this:
!(mySet.value)(word)
If you have the statement sc.broadcast(Set("apple", "orange")) inside the rdd.map of the transform closure, the driver will try to send the StreamingContext over to all the executors, and it is not serializable. That is why you are seeing the NotSerializableException.
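Putting the pieces together, a corrected version of the streaming block could look like the sketch below (same spark-shell and Twitter setup as in the question; the filtering logic is simplified to dropping the stop words):
val ssc = new StreamingContext(sc, Seconds(60))
// created once, on the driver, outside any DStream closure
val stopWords = ssc.sparkContext.broadcast(Set("apple", "orange"))
val tweetStream = TwitterUtils.createStream(ssc, None)
val myNewStream = tweetStream
  .map(tweet => tweet.getText.toLowerCase.split("\\W+").toSeq)
  .transform(rdd =>
    // only the broadcast handle is captured here, not the StreamingContext
    rdd.map(words => words.filterNot(word => stopWords.value(word))))
myNewStream.foreachRDD((rdd, time) =>
  println("%s at time %s".format(rdd.count(), time.milliseconds)))
ssc.start()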

How to work with real time streaming data/logs using spark streaming?

I am a newbie to Spark and Scala.
I want to implement a real-time Spark consumer that reads network logs on a per-minute basis [fetching around 1 GB of JSON log lines per minute] from a Kafka publisher and finally stores the aggregated values in Elasticsearch.
The aggregation is based on a few values [like bytes_in, bytes_out etc.] using a composite key [like client MAC, client IP, server MAC, server IP etc.].
The Spark consumer I have written is:
object LogsAnalyzerScalaCS{
def main(args : Array[String]) {
val sparkConf = new SparkConf().setAppName("LOGS-AGGREGATION")
sparkConf.set("es.nodes", "my ip address")
sparkConf.set("es.port", "9200")
sparkConf.set("es.index.auto.create", "true")
sparkConf.set("es.nodes.discovery", "false")
val elasticResource = "conrec_1min/1minute"
val ssc = new StreamingContext(sparkConf, Seconds(30))
val zkQuorum = "my zk quorum IPs:2181"
val consumerGroupId = "LogsConsumer"
val topics = "Logs"
val topicMap = topics.split(",").map((_,3)).toMap
val json = KafkaUtils.createStream(ssc, zkQuorum, consumerGroupId, topicMap)
val logJSON = json.map(_._2)
try{
logJSON.foreachRDD( rdd =>{
if(!rdd.isEmpty()){
val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
import sqlContext.implicits._
val df = sqlContext.read.json(rdd)
val groupedData =
((df.groupBy("id","start_time_formated","l2_c","l3_c",
"l4_c","l2_s","l3_s","l4_s")).agg(count("f_id") as "total_f", sum("p_out") as "total_p_out",sum("p_in") as "total_p_in",sum("b_out") as "total_b_out",sum("b_in") as "total_b_in", sum("duration") as "total_duration"))
val dataForES = groupedData.withColumnRenamed("start_time_formated", "start_time")
dataForES.saveToEs(elasticResource)
dataForES.show();
}
})
}
catch{
case e: Exception => print("Exception has occurred : "+e.getMessage)
}
ssc.start()
ssc.awaitTermination()
}
object SQLContextSingleton {
@transient private var instance: org.apache.spark.sql.SQLContext = _
def getInstance(sparkContext: SparkContext): org.apache.spark.sql.SQLContext = {
if (instance == null) {
instance = new org.apache.spark.sql.SQLContext(sparkContext)
}
instance
}
}
}
First of all, I would like to know whether my approach is correct at all [considering I need one-minute log aggregation].
There seems to be an issue with this code:
This consumer pulls data from the Kafka broker every 30 seconds and saves the final aggregation to Elasticsearch for that 30 seconds of data, hence increasing the number of rows in Elasticsearch per unique key [at least two entries per minute]. A UI tool [say, Kibana] then needs to do further aggregation. If I increase the batch interval from 30 to 60 seconds, the aggregation takes a lot of time and so no longer remains real time at all.
I want to implement it in such a way that only one row per key gets saved in Elasticsearch. Hence I want to keep aggregating until I stop seeing new key values in the dataset being pulled from the Kafka broker [on a per-minute basis]. After some googling I found that this could be achieved using the groupByKey() and updateStateByKey() functions, but I cannot work out how to use them in my case [should I convert the JSON log lines into flat-valued strings and then use these functions there]? If I use these functions, when should I save the final aggregated values to Elasticsearch? (A hedged sketch of the updateStateByKey idea appears after the answer below.)
Is there any other way of achieving this?
Your quick help will be appreciated.
Regards,
Bhupesh
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object Main {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("KafkaWordCount").setMaster("local[*]")
val ssc = new StreamingContext(conf, Seconds(15))
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "localhost:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "group1",
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean)
) // additional brokers could be appended, e.g. localhost:9094, localhost:9095
val topics = Array("test")
val stream = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
val out = stream.map(record =>
record.value
)
val words = out.flatMap(_.split(" "))
val count = words.map(word => (word, 1))
val wdc = count.reduceByKey(_+_)
val sqlContext = SQLContext.getOrCreate(SparkContext.getOrCreate())
wdc.foreachRDD{rdd=>
val es = sqlContext.createDataFrame(rdd).toDF("word","count")
import org.elasticsearch.spark.sql._
es.saveToEs("wordcount/testing")
es.show()
}
ssc.start()
ssc.awaitTermination()
}
}
See the full example and the sbt build for further details.
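On the question's idea of keeping a single running row per key: a minimal, hedged sketch of updateStateByKey, reusing the ssc, count and sqlContext values from the answer above (the index name and checkpoint path are made up; checkpointing must be enabled before ssc.start() for any stateful operation):
ssc.checkpoint("/tmp/streaming-checkpoint") // required for updateStateByKey
// keep a cumulative count per word; the same idea applies to the composite
// network key (client MAC, server MAC, ...) and the byte counters
def updateTotals(newValues: Seq[Int], running: Option[Long]): Option[Long] =
  Some(newValues.map(_.toLong).sum + running.getOrElse(0L))
val runningCounts = count.updateStateByKey(updateTotals)
runningCounts.foreachRDD { rdd =>
  val totals = sqlContext.createDataFrame(rdd).toDF("word", "total")
  import org.elasticsearch.spark.sql._
  // using the key as the document id overwrites one row per key instead of appending new rows
  totals.saveToEs("wordcount/totals", Map("es.mapping.id" -> "word"))
}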

Spark Streaming not detecting new HDFS files

I am running the program below on Spark 1.3.1. Spark Streaming is watching a directory in HDFS for new files and should process them as they come in. I have read that the best way to do this is to move the files from an existing HDFS location so that the operation is atomic.
I start my streaming job, add a bunch of small files to a random HDFS directory, and then move these files from the original HDFS directory to the watched HDFS directory (all with simple shell commands). But my streaming job does not recognize these as new files and therefore does not process them.
Currently I am using textFileStream, but I am open to using fileStream. However, I am getting errors with this: val lines = ssc.fileStream[LongWritable, Text, TextInputFormat]("hdfs:///name/spark-streaming/data/", (p: Path)=>true, false) (a hedged sketch of a fileStream variant appears after the code below).
package com.com.spark.prototype
import java.io.FileInputStream
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark._
import org.apache.spark.streaming._
import com.twitter.algebird.HyperLogLogMonoid
import org.apache.hadoop.io._
object HLLStreamingHDFSTest {
def functionToCreateContext(): StreamingContext = {
val conf = new SparkConf().set("spark.executor.extraClassPath", "/home/hadoop/spark/conf:/home/hadoop/conf:/home/hadoop/spark/classpath/emr/*:/home/hadoop/spark/classpath/emrfs/*:/home/hadoop/share/hadoop/common/lib/*:/home/hadoop/share/hadoop/common/lib/hadoop-lzo.jar")
val ssc = new StreamingContext(conf, Seconds(5))
ssc.checkpoint("/name/spark-streaming/checkpointing")
val lines = ssc.textFileStream("hdfs:///name/spark-streaming/data/")
val hll = new HyperLogLogMonoid(15)
var globalHll = hll.zero
val users = lines.map(_.toString().toCharArray.map(_.toByte))
val approxUsers = users.mapPartitions(ids => {
ids.map(id => hll(id))
}).reduce(_ + _)
approxUsers.foreachRDD(rdd => {
if (rdd.count() != 0) {
val partial = rdd.first()
globalHll += partial
println()
println()
println("Estimated distinct users this batch: %d".format(partial.estimatedSize.toInt))
println("Estimated distinct users this batch: %d".format(globalHll.estimatedSize.toInt))
println()
println("Approx distinct users this batch: %s".format(partial.approximateSize.toString))
println("Approx distinct users overall: %s".format(globalHll.approximateSize.toString))
}
})
ssc
}
def main(args: Array[String]): Unit = {
val context = StreamingContext.getOrCreate("hdfs:///name/spark-streaming/checkpointing", functionToCreateContext _)
context.start()
context.awaitTermination()
}
}
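On the fileStream error specifically: ssc.fileStream requires an InputFormat from the new Hadoop API (org.apache.hadoop.mapreduce), while the import above pulls in the old org.apache.hadoop.mapred.TextInputFormat, which is the usual cause of a compile error on that line. A hedged sketch of the variant I would expect to work, dropped in place of textFileStream inside functionToCreateContext():
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
// note: the new-API package (mapreduce), not org.apache.hadoop.mapred
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](
  "hdfs:///name/spark-streaming/data/",
  (p: Path) => true,  // accept every file the directory scan finds
  false               // newFilesOnly = false: also pick up files already present at start
).map(_._2.toString)  // back to a DStream[String], like textFileStream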
