NotSerializableException when integrating Spark SQL and Spark Streaming
My source code:
public static void main(String args[]) {
SparkConf sparkConf = new SparkConf().setAppName("NumberCount");
JavaSparkContext jc = new JavaSparkContext(sparkConf);
JavaStreamingContext jssc = new JavaStreamingContext(jc, new Duration(2000));
jssc.addStreamingListener(new WorkCountMonitor());
int numThreads = Integer.parseInt(args[3]);
Map<String,Integer> topicMap = new HashMap<String,Integer>();
String[] topics = args[2].split(",");
for (String topic : topics) {
topicMap.put(topic, numThreads);
}
JavaPairReceiverInputDStream<String,String> data = KafkaUtils.createStream(jssc, args[0], args[1], topicMap);
data.print();
JavaDStream<Person> streamData = data.map(new Function<Tuple2<String, String>, Person>() {
public Person call(Tuple2<String,String> v1) throws Exception {
String[] stringArray = v1._2.split(",");
Person person = new Person();
person.setName(stringArray[0]);
person.setAge(stringArray[1]);
return person;
}
});
final JavaSQLContext sqlContext = new JavaSQLContext(jc);
streamData.foreachRDD(new Function<JavaRDD<Person>,Void>() {
public Void call(JavaRDD<Person> rdd) {
JavaSchemaRDD subscriberSchema = sqlContext.applySchema(rdd, Person.class);
subscriberSchema.registerAsTable("people");
System.out.println("all data");
JavaSchemaRDD names = sqlContext.sql("SELECT name FROM people");
System.out.println("afterwards");
List<String> males = new ArrayList<String>();
males = names.map(new Function<Row,String>() {
public String call(Row row) {
return row.getString(0);
}
}).collect();
System.out.println("before for");
for (String name : males) {
System.out.println(name);
}
return null;
}
});
jssc.start();
jssc.awaitTermination();
}
The JavaSQLContext is also declared outside the foreachRDD loop, but I am still getting a NotSerializableException.
14/12/23 23:49:38 ERROR JobScheduler: Error running job streaming job 1419378578000 ms.1
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:166)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:158)
at org.apache.spark.SparkContext.clean(SparkContext.scala:1435)
at org.apache.spark.rdd.RDD.map(RDD.scala:271)
at org.apache.spark.api.java.JavaRDDLike$class.map(JavaRDDLike.scala:78)
at org.apache.spark.sql.api.java.JavaSchemaRDD.map(JavaSchemaRDD.scala:42)
at com.basic.spark.NumberCount$2.call(NumberCount.java:79)
at com.basic.spark.NumberCount$2.call(NumberCount.java:67)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:274)
at org.apache.spark.streaming.api.java.JavaDStreamLike$$anonfun$foreachRDD$1.apply(JavaDStreamLike.scala:274)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:529)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:529)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:42)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:32)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:171)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:724)
Caused by: java.io.NotSerializableException: org.apache.spark.sql.api.java.JavaSQLContext
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1181)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1541)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1506)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1429)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1175)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1541)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1506)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1429)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1175)
at java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1541)
at java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1506)
at java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1429)
at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1175)
at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:42)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:73)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:164)
... 20 more
I would appreciate any suggestions.
Have you implemented the Serializable interface in the Person POJO class? Also, can you try declaring topicMap as final?
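For reference, a minimal sketch of what such a Person POJO might look like (field names are assumed from the map function in the question, where the age setter is passed a String):
import java.io.Serializable;

// Minimal sketch of the Person POJO assumed by the question's map function.
// Implementing Serializable lets Spark ship instances inside task closures.
public class Person implements Serializable {
    private String name;
    private String age;

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getAge() { return age; }
    public void setAge(String age) { this.age = age; }
}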
Here is the working code:
package com.basic.spark;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.api.java.JavaSQLContext;
import org.apache.spark.sql.api.java.JavaSchemaRDD;
import org.apache.spark.sql.api.java.Row;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;
public class NumberCount implements Serializable {
transient SparkConf sparkConf = new SparkConf().setAppName("NumberCount");
transient JavaSparkContext jc = new JavaSparkContext(sparkConf);
transient JavaStreamingContext jssc_1 = new JavaStreamingContext(jc, new Duration(1000));
transient JavaSQLContext sqlContext = new JavaSQLContext(jc);
transient Producer<String, String> producer = configureKafka();
public static void main(String args[]) {
(new NumberCount()).job_1(args);
}
public void job_1(String...args) {
jssc_1.addStreamingListener(new WorkCountMonitor());
int numThreads = Integer.parseInt(args[3]);
Map<String,Integer> topicMap = new HashMap<String,Integer>();
String[] topics = args[2].split(",");
for (String topic : topics) {
topicMap.put(topic, numThreads);
}
JavaPairReceiverInputDStream<String,String> data = KafkaUtils.createStream(jssc_1, args[0], args[1], topicMap);
data.window(new Duration(10000), new Duration(2000));
JavaDStream<String> streamData = data.map(new Function<Tuple2<String, String>, String>() {
public String call(Tuple2<String,String> v1) {
return v1._2;
}
});
streamData.foreachRDD(new Function<JavaRDD<String>,Void>() {
public Void call(JavaRDD<String> rdd) {
if (rdd.count() < 1)
return null;
try {
JavaSchemaRDD eventSchema = sqlContext.jsonRDD(rdd);
eventSchema.registerTempTable("event");
System.out.println("all data");
JavaSchemaRDD names = sqlContext.sql("SELECT deviceId, count(*) FROM event group by deviceId");
System.out.println("afterwards");
// List<Long> males = new ArrayList<Long>();
//
// males = names.map(new Function<Row,Long>() {
// public Long call(Row row) {
// return row.getLong(0);
// }
// }).collect();
// System.out.println("before for");
// ArrayList<KeyedMessage<String, String>> data = new ArrayList<KeyedMessage<String, String>>();
// for (Long name : males) {
// System.out.println("**************"+name);
// writeToKafka_1(data, String.valueOf(name));
// }
// producer.send(data);
List<String> deviceDetails = new ArrayList<String>();
deviceDetails = names.map(new Function<Row,String>() {
public String call(Row row) {
return row.getString(0) +":" + row.getLong(1);
}
}).collect();
System.out.println("before for");
ArrayList<KeyedMessage<String, String>> data = new ArrayList<KeyedMessage<String, String>>();
for (String name : deviceDetails) {
System.out.println("**************"+name);
writeToKafka_1(data, name);
}
producer.send(data);
} catch (Exception e) {
System.out.println("#ERROR_1# #" + rdd);
e.printStackTrace();
}
return null;
}
});
jssc_1.start();
jssc_1.awaitTermination();
}
public Producer<String, String> configureKafka() {
Properties props = new Properties();
props.put("metadata.broker.list", "xx.xx.xx.xx:9092");
props.put("serializer.class", "kafka.serializer.StringEncoder");
props.put("compression.codec", "2");
props.put("request.required.acks", "0");
props.put("producer.type", "sync");
ProducerConfig config = new ProducerConfig(props);
Producer<String, String> producer = new Producer<String, String>(config);
return producer;
}
public void writeToKafka_1(ArrayList<KeyedMessage<String,String>> list, String msg) {
list.add(new KeyedMessage<String,String>("my-replicated-topic-1", "", msg));
}
}
Related
I have an activity that makes a connection to a PHP file. That PHP file needs an EAN to work, so I need to send the EAN code from the input field in my main activity to my second activity so I can send it to PHP, but I could not get it to work. This is what I have:
// Intent productInfo = new Intent(mainActivity.this, androidProductRequest.class);
// productInfo.putExtra("ean",ean.getText().toString());
// startActivity(productInfo);
androidProductRequest androidProductRequest = new androidProductRequest(Integer.parseInt(ean.getText().toString()));
RequestQueue queue = Volley.newRequestQueue(this);
queue.add(androidProductRequest);
At first I thought it might work if I sent it with an Intent, but then I saw that you need to do something with a RequestQueue and I couldn't figure out what exactly. This is what I have in the second activity:
package com.example.productfinder;
import android.util.Log;
import com.android.volley.AuthFailureError;
import com.android.volley.Request;
import com.android.volley.RequestQueue;
import com.android.volley.Response;
import com.android.volley.VolleyError;
import com.android.volley.toolbox.StringRequest;
import com.android.volley.toolbox.Volley;
import org.json.JSONException;
import org.json.JSONObject;
import java.util.HashMap;
import java.util.Map;
public class androidProductRequest extends mainActivity {
public void androidProductRequest (final String ean){
try {
RequestQueue requestQueue = Volley.newRequestQueue(this);
String URL = "My_url";
JSONObject jsonBody = new JSONObject();
jsonBody.put("Title", "Android Volley Demo");
final String requestBody = jsonBody.toString();
StringRequest stringRequest = new StringRequest(Request.Method.POST, URL, new Response.Listener<String>() {
@Override
public void onResponse(String response) {
Log.i("VOLLEY1", response);
}
}, new Response.ErrorListener() {
@Override
public void onErrorResponse(VolleyError error) {
Log.e("VOLLEY", error.toString());
}
}) {
@Override
protected Map<String, String> getParams() throws AuthFailureError {
Map<String, String> params = new HashMap<>();
return params;
}
};
requestQueue.add(stringRequest);
} catch (JSONException e) {
e.printStackTrace();
}
}
}
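One possible approach, sketched here as a suggestion rather than the original code: give the request class a constructor that accepts the EAN and return it from getParams(), so the PHP script receives it as a POST field. The class name and the "ean" parameter key are assumptions.
import java.util.HashMap;
import java.util.Map;
import com.android.volley.Request;
import com.android.volley.Response;
import com.android.volley.toolbox.StringRequest;

// Sketch: a StringRequest that carries the EAN and posts it as a form parameter.
public class ProductRequest extends StringRequest {
    private final String ean;

    public ProductRequest(String ean, String url,
                          Response.Listener<String> listener,
                          Response.ErrorListener errorListener) {
        super(Request.Method.POST, url, listener, errorListener);
        this.ean = ean;
    }

    @Override
    protected Map<String, String> getParams() {
        Map<String, String> params = new HashMap<>();
        params.put("ean", ean); // the PHP script would read this POST field
        return params;
    }
}
The main activity would then create it from the input field, e.g. queue.add(new ProductRequest(ean.getText().toString(), URL, responseListener, errorListener));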
I am trying to use the wholeTextFiles API for file processing. I have a lot of .gz files in a folder and want to read them with the wholeTextFiles API.
I have 4 executors, each with 1 core and 2 GB of RAM.
Only 2 executors are processing the job and the processing is really slow; the other two executors are sitting idle.
How do I spread the job across the other 2 executors to increase the parallelism?
package com.sss.ss.ss.WholeText;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
public class WholeText {
public static class mySchema implements Serializable {
private String CFIELD1 ;
private String CFIELD2 ;
public String getCFIELD1()
{
return CFIELD1;
}
public void setCFIELD1(String cFIELD1)
{
CFIELD1 = cFIELD1;
}
public String getCFIELD2()
{
return CFIELD2;
}
public void setCFIELD2(String cFIELD2)
{
CFIELD2 = cFIELD2;
}
}
public static void main(String[] args) throws InterruptedException {
SparkConf sparkConf = new SparkConf().setAppName("My app")
.setMaster("mymaster..")
.set("spark.driver.allowMultipleContexts", "true");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(15));
JavaPairRDD<String, String> wholeTextFiles = jssc.sparkContext().wholeTextFiles(args[0],Integer.parseInt(args[3]));
Integer ll = wholeTextFiles.getNumPartitions();
System.out.println("Number of Partitions"+ll);
JavaRDD<String> stringRDD = wholeTextFiles.
map(
new Function<Tuple2<String, String>, String>() {
private static final long serialVersionUID = -551872585218963131L;
public String call(Tuple2<String, String> v1) throws Exception
{
return v1._2;
}
}
).
flatMap
(new FlatMapFunction<String, String>()
{
public Iterator<String> call(String t) throws Exception
{
return Arrays.asList(t.split("\\r?\\n")).iterator();
}
}).
filter(new Function<String, Boolean>() {
private static final long serialVersionUID = 1L;
public Boolean call(String t) throws Exception {
int colons = 0;
String s = t;
if(s == null || s.trim().length() < 1) {
return false;
}
for(int i = 0; i < s.length(); i++) {
if(s.charAt(i) == ';') colons++;
}
System.out.println("colons="+colons);
if ((colons <=3)){
return false;
}
return true;
}
});
JavaRDD<mySchema> schemaRDD = stringRDD.map(new Function<String, mySchema>()
{
private static final long serialVersionUID = 1L;
public mySchema call(String line) throws Exception
{
String[] parts = line.split(";",-1);
mySchema mySchema = new mySchema();
mySchema.setCFIELD1 (parts[0]);
mySchema.setCFIELD2 (parts[1]);
return mySchema;
}
});
SQLContext hc = new HiveContext(jssc.sparkContext());
Dataset<Row> df = hc.createDataFrame(schemaRDD, mySchema.class);
df.createOrReplaceTempView("myView");
hc.sql("INSERT INTO -----
"from myView");
hc.sql("INSERT INTO .......
"from myView");
}
}
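Gzip files are not splittable, so wholeTextFiles can end up with fewer partitions than executors even when a minPartitions hint is passed. A minimal sketch of one way to spread the work, assuming a repartition right after reading is acceptable here (the target partition count is illustrative):
// Sketch: repartition so the downstream map/flatMap/filter work is spread
// across all executors; pick a count around (executors * cores) or a small multiple.
JavaPairRDD<String, String> spread = wholeTextFiles.repartition(4);
System.out.println("Partitions after repartition: " + spread.getNumPartitions());
// ...and build stringRDD from 'spread' instead of 'wholeTextFiles'.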
I have a multi-node Spark cluster and I submit my Spark program on the node where the master resides.
When the job is submitted to the worker nodes, the HOSTNAME parameter gives a null value. Here is the line where the properties are read as null.
System.getenv("HOSTNAME") is not being read on the worker nodes.
System.out.println("line 76 System.getenv(HOSTNAME)=" + System.getenv("HOSTNAME"));
AUDIT_USER and AUDIT_PASSWORD are also null when read (they are both in the properties file).
If I submit the job on one node I have no issues with these parameters, but if I submit the job in standalone mode with 6 nodes I get this issue.
I have created the same folder for the properties file on all nodes.
Here is my code. Could you please let me know why System.getenv is returning null and why my properties are null?
package com.fb.cpd.myapp;
import java.io.Serializable;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.Future;
import org.apache.commons.configuration.ConfigurationConverter;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.commons.configuration.reloading.FileChangedReloadingStrategy;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.DefaultDecoder;
import kafka.serializer.StringDecoder;
public class GenericLogic implements Serializable {
/**
*
*/
private static final long serialVersionUID = 1L;
private static final Logger logger = LogManager.getLogger(GenericLogic.class);
private PropertiesConfiguration props;
private Producer<String, String> producer = null;
private Future<RecordMetadata> receipt = null;
private RecordMetadata receiptInfo = null;
private ConnectToRDBMS auditor = null;
private ConnectToRDBMS df = null;
private static String myId = null;
private Map<TopicAndPartition, Long> getOffsets(String topic) throws SQLException {
String appName = "myapp";
String TopicName = topic;
Map<TopicAndPartition, Long> topicMap = new HashMap<>(); //
System.out.println("line 64 before making connection");
try {
props = new PropertiesConfiguration("/app/lock/conf/empty.properties");
} catch (ConfigurationException e) { // TODO Auto-generated catch block
System.out.println("Line 70");
e.printStackTrace();
}
try {
System.out.println("line 76 System.getenv(HOSTNAME)=" + System.getenv("HOSTNAME"));
auditor = new ConnectToRDBMS(System.getenv("HOSTNAME"), "lockSparkCollector", null, null, null, null, null,
0, props.getString("AUDIT_USER"), props.getString("AUDIT_PASSWORD"),
props.getString("AUDIT_DB_URL"));
} catch (SQLException e) {
logger.error("ASSERT: run() ERROR CONNECTING TO AUDIT DB " + e.getMessage());
}
System.out.println("line 64 after making connection");
Statement stmt = null;
String query = "select va_application, topic_name, partition_id, from_offset,until_offset from lock_spark_offsets where va_application = "
+ "'" + appName + "'" + " and topic_name= " + "'" + TopicName + "'";
System.out.println("query" + query);
System.out.println("before query exection");
try {
stmt = auditor.dbConnection.createStatement();
System.out.println("line 81");
ResultSet rs = stmt.executeQuery(query);
System.out.println("line 83");
while (rs.next()) {
System.out.println("pass 1 of Resultset");
System.out.println("getOffsets=" + topic.trim() + " " + rs.getInt("partition_id") + " "
+ rs.getString("until_offset") + " " + rs.getString("until_offset"));
Integer partition = rs.getInt("partition_id");
TopicAndPartition tp = new TopicAndPartition(topic.trim(), partition);
System.out.println("102");
Long.parseLong(rs.getString("until_offset"));
topicMap.put(tp, Long.parseLong(rs.getString("until_offset")));
System.out.println("105");
}
System.out.println("after populating topic map");
} catch (
SQLException e) {
System.out.println("printing exception");
e.printStackTrace();
} finally {
if (stmt != null) {
System.out.println("closing statement");
stmt.close();
}
}
return topicMap;
}
public void setDefaultProperties() {
FileChangedReloadingStrategy strategy = new FileChangedReloadingStrategy();
strategy.setRefreshDelay(10000);
System.out.println("Line 45");
// supply the properties file.
try {
props = new PropertiesConfiguration("/app/lock/conf/empty.properties");
} catch (ConfigurationException e) {
// TODO Auto-generated catch block
System.out.println("Line 51");
e.printStackTrace();
}
props.setReloadingStrategy(strategy);
System.out.println("Line 56");
// Producer configs
if (!props.containsKey("acks")) {
props.setProperty("acks", "1");
}
if (!props.containsKey("retries")) {
props.setProperty("retries", "1000");
}
if (!props.containsKey("compression.type")) {
props.setProperty("compression.type", "gzip");
}
if (!props.containsKey("request.timeout.ms")) {
props.setProperty("request.timeout.ms", "600000");
}
if (!props.containsKey("batch.size")) {
props.setProperty("batch.size", "32768");
}
if (!props.containsKey("buffer.memory")) {
props.setProperty("buffer.memory", "134217728");
}
if (!props.containsKey("block.on.buffer.full")) {
props.setProperty("block.on.buffer.full", "true");
}
if (!props.containsKey("SHUTDOWN")) {
props.setProperty("SHUTDOWN", "false");
}
if (!props.containsKey("producer.topic")) {
props.setProperty("producer.topic", "mytopic1");
}
Properties producer_props = ConfigurationConverter.getProperties(props);
producer_props.setProperty("bootstrap.servers", props.getString("target.bootstrap.servers"));
producer_props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
producer_props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); // ????
this.producer = new KafkaProducer<String, String>(producer_props);
System.out.println("Line 107");
}
public void PublishMessages(String st) {
try {
System.out.println("Line 111");
String key = UUID.randomUUID().toString().replace("-", "");
System.out.println("Started Producing...");
receipt = producer.send(new ProducerRecord<String, String>(props.getString("producer.topic"), key, // Key
st));
System.out.println("After Completion of Producing Producing");
} catch (Exception e) {
e.printStackTrace();
System.out.println("Exception in PublishMessages ");
}
}
public void DBConnect() {
try {
auditor = new ConnectToRDBMS(System.getenv("HOSTNAME"), "myapp", props.getString("consumer.topic"), null,
null, null, null, 0, props.getString("AUDIT_USER"), props.getString("AUDIT_PASSWORD"),
props.getString("AUDIT_DB_URL"));
} catch (SQLException e) {
logger.error("ASSERT: run() ERROR CONNECTING TO AUDIT DB " + e.getMessage());
return;
}
}
private void writeToDB(Long startTime, Integer partnId, String fromOffset, String untilOffset, Integer count) {
this.auditor.audit(startTime, partnId, fromOffset, untilOffset, count);
}
/**
*
* @param jsc
* @param topicSet
* @throws Exception
*/
public static void main(String[] args) {
String topicNames = "MySourceTopic";
GenericLogic ec = new GenericLogic();
Map<TopicAndPartition, Long> topicMap = null;
try {
topicMap = ec.getOffsets("MySourceTopic");
} catch (SQLException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
boolean clusterMode = false;
Integer batchDuration = Integer.parseInt("30000");
JavaSparkContext sparkConf = new JavaSparkContext("abcd.net:7077", "Kafka-Spark-Integration");
sparkConf.getConf().set("spark.local.ip", "lock-dt-a4d.xyz.com");
sparkConf.getConf().set("spark.eventLog.enabled", "false");
sparkConf.getConf().set("spark.shuffle.blockTransferService", "nio");
JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, new Duration(10000));
Map<String, String> kafkaParams = new HashMap<String, String>();
String pollInterval = "10000";
String zookeeper = "lock-dt-a5d.xyz.com:2181,lock-dt-a6d.xyz.com:2181";
kafkaParams.put("metadata.broker.list", "lock-dt-a5d.xyz.com:9092,lock-dt-a6d.xyz.com:9092");
kafkaParams.put("group.id", "Consumer");
kafkaParams.put("client.id", "Consumer");
kafkaParams.put("zookeeper.connect", zookeeper);
JavaInputDStream<byte[]> directKafkaStream = KafkaUtils.createDirectStream(jsc, String.class, byte[].class,
StringDecoder.class, DefaultDecoder.class, byte[].class, kafkaParams, topicMap,
(Function<MessageAndMetadata<String, byte[]>, byte[]>) MessageAndMetadata::message);
directKafkaStream.foreachRDD(rdd -> {
if (rdd.isEmpty()) {
System.out.println("No events polled in last " + pollInterval + " milli seconds");
return;
}
rdd.foreachPartition(itr -> {
Integer partnId = TaskContext.get().partitionId();
Long systime = System.nanoTime();
Map<String, String> hmap = new HashMap<String, String>();
GenericLogic ec2 = new GenericLogic();
ec2.setDefaultProperties();
ec2.DBConnect();
try {
while (itr.hasNext()) {
System.out.println("232");
}
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
});
});
jsc.start();
jsc.awaitTermination();
}
}
I started the slaves with start-slaves.sh.
That's the issue. I have to start the workers by supplying the master address.
Can you please let us know the OS of all the nodes, and whether you have ensured that nothing on the master node is exporting HOSTNAME? Answering your question will be easier with your OS details.
This may not be directly related to your context, but for information: System.getenv("HOSTNAME") may not provide the hostname on all platforms (for example Ubuntu or Mac).
A better option is to export HOSTNAME explicitly.
Note: I am assuming you have already checked that props is not null or empty.
If not, debug and check whether the properties file is loaded, and if it is loaded, that it is not an empty file, so the properties have actually been read from it.
Looking at your problem (not only the environment variable but also the properties are not returning values), there may be something wrong with the properties file or its relative location on the different machines.
If it is not an exact copy placed on the different machines, please also check that the file is valid for Linux (not written and edited on Windows and then put on Linux).
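If relying on the environment variable turns out to be the problem, here is a minimal fallback sketch using the standard java.net API (the helper class name is made up for illustration):
import java.net.InetAddress;
import java.net.UnknownHostException;

// Sketch: prefer the HOSTNAME env var, fall back to asking the JVM for the host name.
public final class HostNames {
    public static String resolve() {
        String hostname = System.getenv("HOSTNAME");
        if (hostname == null || hostname.isEmpty()) {
            try {
                hostname = InetAddress.getLocalHost().getHostName();
            } catch (UnknownHostException e) {
                hostname = "unknown-host"; // placeholder; handle as appropriate
            }
        }
        return hostname;
    }
}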
SQLContext, when accessed as below using a singleton class, works fine in local mode; however, when the job is submitted to the Spark master, it becomes null and throws NullPointerExceptions. How can this be fixed?
In our use case the FlatMapFunction is expected to query another DStream, and the returned results are used to create a new stream.
I have extended the JavaStatefulNetworkWordCount example to print the changes to the state. I need to access the RDDs of a stateful DStream from another DStream using the SQLContext in order to create a new DStream. How can this be achieved?
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.spark.HashPartitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.StorageLevels;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
public class JavaStatefulNetworkWordCount {
private static final Pattern SPACE = Pattern.compile(" ");
public static void main(String[] args) {
if (args.length < 2) {
System.err.println("Usage: JavaStatefulNetworkWordCount <hostname> <port>");
System.exit(1);
}
// Update the cumulative count function
final Function2<List<Integer>, Optional<Integer>, Optional<Integer>> updateFunction =
new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
@Override
public Optional<Integer> call(List<Integer> values, Optional<Integer> state) {
Integer newSum = state.or(0);
for (Integer value : values) {
newSum += value;
}
return Optional.of(newSum);
}
};
// Create the context with a 1 second batch size
SparkConf sparkConf = new SparkConf().setAppName("JavaStatefulNetworkWordCount");
// sparkConf.setMaster("local[5]");
// sparkConf.set("spark.executor.uri", "target/rkspark-0.0.1-SNAPSHOT.jar");
JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1));
ssc.checkpoint(".");
SQLContext sqlContext = JavaSQLContextSingleton.getInstance(ssc.sparkContext().sc());
// Initial RDD input to updateStateByKey
List<Tuple2<String, Integer>> tuples = Arrays.asList(new Tuple2<String, Integer>("hello", 1),
new Tuple2<String, Integer>("world", 1));
JavaPairRDD<String, Integer> initialRDD = ssc.sc().parallelizePairs(tuples);
JavaReceiverInputDStream<String> lines = ssc.socketTextStream(
args[0], Integer.parseInt(args[1]), StorageLevels.MEMORY_AND_DISK_SER_2);
JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
@Override
public Iterable<String> call(String x) {
return Lists.newArrayList(SPACE.split(x));
}
});
JavaPairDStream<String, Integer> wordsDstream = words.mapToPair(
new PairFunction<String, String, Integer>() {
@Override
public Tuple2<String, Integer> call(String s) {
return new Tuple2<String, Integer>(s, 1);
}
});
// This will give a Dstream made of state (which is the cumulative count of the words)
JavaPairDStream<String, Integer> stateDstream = wordsDstream.updateStateByKey(updateFunction,
new HashPartitioner(ssc.sparkContext().defaultParallelism()), initialRDD);
JavaDStream<WordCount> countStream = stateDstream.map(new Function<Tuple2<String, Integer>, WordCount>(){
@Override
public WordCount call(Tuple2<String, Integer> v1) throws Exception {
return new WordCount(v1._1,v1._2);
}});
countStream.foreachRDD(new Function<JavaRDD<WordCount>,Void>() {
@Override
public Void call(JavaRDD<WordCount> rdd) {
SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context());
DataFrame wordsDataFrame = sqlContext.createDataFrame(rdd, WordCount.class);
wordsDataFrame.registerTempTable("words");
return null;
}
});
wordsDstream.map(new Function<Tuple2<String,Integer>,String>(){
@Override
public String call(Tuple2<String, Integer> v1) throws Exception {
// Below sql context becomes null when run on a master instead of local.
SQLContext sqlContext = JavaSQLContextSingleton.getInstance();
DataFrame counterpartyIds = sqlContext.sql("select * from words where word ='"+v1._1()+"'");
Row[] rows = counterpartyIds.cache().collect();
if(rows.length>0){
Row row = rows[0];
return row.getInt(0)+"-"+ row.getString(1);
} else {
return "";
}
}
}).print();
ssc.start();
ssc.awaitTermination();
}
}
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SQLContext;
class JavaSQLContextSingleton {
static private transient SQLContext instance = null;
static public SQLContext getInstance(SparkContext sparkContext) {
if (instance == null) {
instance = new SQLContext(sparkContext);
}
return instance;
}
}
import java.io.Serializable;
public class WordCount implements Serializable{
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
String word;
public WordCount(String word, int count) {
super();
this.word = word;
this.count = count;
}
int count;
}
The SparkContext (and thus the SQLContext) is only available on the driver and is not serialized to the workers. Your program works in local mode since it runs in the context of the driver, where the SQLContext is available.
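In practical terms, a sketch built on the question's own classes (not a verified fix): obtain the SQLContext inside foreachRDD, whose function body executes on the driver, and keep it out of functions such as the wordsDstream.map above, which run on the workers.
countStream.foreachRDD(new Function<JavaRDD<WordCount>, Void>() {
    @Override
    public Void call(JavaRDD<WordCount> rdd) {
        // Runs on the driver: safe to obtain the singleton here.
        SQLContext sqlContext = JavaSQLContextSingleton.getInstance(rdd.context());
        DataFrame wordsDataFrame = sqlContext.createDataFrame(rdd, WordCount.class);
        wordsDataFrame.registerTempTable("words");
        // Any per-word lookups belong here as SQL/joins on the driver,
        // not inside map functions that are shipped to the executors.
        return null;
    }
});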
Kafka is already filled with 1 crore (10 million) messages. When I execute this code, I expect it to give the count of the tuples/messages processed in each 2-second window, but the first time it returns the complete message count, i.e. 1 crore, and after that it gives 0, 0, 0, ..., although it takes more than 2 seconds to print that message.
The inline source code is:
import java.io.Serializable;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import kafka.serializer.StringDecoder;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;
public class Test implements Serializable {
private static final long serialVersionUID = -5863692754478476225L;
private static final String KEY_SPARK_MASTER = "spark://machine1-1467:7077";
private static final String KEY_APP_NAME = "SQLWordCount";
private static final String KEY_TOPIC = "add104";
private static JavaStreamingContext streamingContext = null;
private Test() {
disableLogging();
doInit();
process();
}
public static void main(String[] params) {
System.out.println("------------Started---------------" + new Date().toString());
new Test();
}
private void disableLogging() {
Logger.getLogger("org").setLevel(Level.OFF);
Logger.getLogger("akka").setLevel(Level.OFF);
}
private void doInit() {
SparkConf sparkConf = new SparkConf().setMaster(KEY_SPARK_MASTER).setAppName(KEY_APP_NAME);
streamingContext = new JavaStreamingContext(sparkConf, new Duration(500));
streamingContext.checkpoint("/home/impadmin/checkpoint");
}
private HashMap<String, String> getKafkaParams() {
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", "localhost:9092");
kafkaParams.put("auto.offset.reset", "smallest");
kafkaParams.put("group.id", "id7");
return kafkaParams;
}
private HashSet<String> getTopicSet() {
HashSet<String> topic = new HashSet<String>(Arrays.asList(KEY_TOPIC));
return topic;
}
private void process() {
try {
JavaPairInputDStream<String, String> messages = KafkaUtils
.createDirectStream(streamingContext, String.class,
String.class, StringDecoder.class,
StringDecoder.class, getKafkaParams(),
getTopicSet());
JavaPairDStream<String, String> windowedStream = messages.window(
new Duration(2000), new Duration(2000));
JavaDStream<String> lines = windowedStream
.map(new Function<Tuple2<String, String>, String>() {
public String call(Tuple2<String, String> tuple2) {
return tuple2._2();
}
});
lines.foreachRDD(new Function<JavaRDD<String>, Void>() {
public Void call(JavaRDD<String> rdd) throws Exception {
System.out.println(new Date().toString() + " In the Call method" + rdd.count());
JavaRDD<Stock> rowRDD = rdd
.map(new Function<String, Stock>() {
@Override
public Stock call(String arg0) throws Exception {
return null;
}
});
return null;
};
});
streamingContext.start();
streamingContext.awaitTermination();
} catch (Exception e) {
System.out.println("Exception: " + e.toString());
}
}
}
Because you are using kafkaParams.put("auto.offset.reset", "smallest");, it is going back and fetching all the messages.
Change it to kafkaParams.put("auto.offset.reset", "largest"); to consume only new messages.
If your expectation is that the streaming context will chunk up all the existing messages into 2-second batches, I doubt it will do that. However, you can set an offset range and read all the existing data in multiple batches.
New messages, however, will be batched every 2 seconds or whatever interval you set.
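A minimal sketch of both knobs, placed in getKafkaParams() and doInit() respectively; the rate value is illustrative, and whether maxRatePerPartition is the right way to drain the backlog in this setup is an assumption:
// Start from the latest offsets so only new messages are consumed:
kafkaParams.put("auto.offset.reset", "largest");
// Or, to read the existing backlog in multiple batches instead of one huge one,
// cap how many records each partition contributes per batch (direct stream only):
sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "10000");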