Union a List of Flume Receivers in Spark Streaming - apache-spark

In order to increase parallelism, as recommended in the Spark Streaming Programming Guide, I'm setting up multiple receivers and trying to union a list of them. This code works as expected:
private JavaDStream<SparkFlumeEvent> getEventsWorking(JavaStreamingContext jssc, List<String> hosts, List<String> ports) {
    List<JavaReceiverInputDStream<SparkFlumeEvent>> receivers = new ArrayList<>();
    for (String host : hosts) {
        for (String port : ports) {
            receivers.add(FlumeUtils.createStream(jssc, host, Integer.parseInt(port)));
        }
    }
    JavaDStream<SparkFlumeEvent> unionStreams = receivers.get(0)
            .union(receivers.get(1))
            .union(receivers.get(2))
            .union(receivers.get(3))
            .union(receivers.get(4))
            .union(receivers.get(5));
    return unionStreams;
}
But I don't actually know how many receivers my cluster will have until runtime. When I try to do the union in a loop, I get an NPE.
private JavaDStream<SparkFlumeEvent> getEventsNotWorking(JavaStreamingContext jssc, List<String> hosts, List<String> ports) {
    List<JavaReceiverInputDStream<SparkFlumeEvent>> receivers = new ArrayList<>();
    for (String host : hosts) {
        for (String port : ports) {
            receivers.add(FlumeUtils.createStream(jssc, host, Integer.parseInt(port)));
        }
    }
    JavaDStream<SparkFlumeEvent> unionStreams = null;
    for (JavaReceiverInputDStream<SparkFlumeEvent> receiver : receivers) {
        if (unionStreams == null) {
            unionStreams = receiver;
        } else {
            unionStreams.union(receiver);
        }
    }
    return unionStreams;
}
ERROR:
16/09/15 17:05:25 ERROR JobScheduler: Error in job generator
java.lang.NullPointerException
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at scala.collection.TraversableOnce$$anonfun$maxBy$1.apply(TraversableOnce.scala:225)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
at scala.collection.IndexedSeqOptimized$class.reduceLeft(IndexedSeqOptimized.scala:68)
at scala.collection.mutable.ArrayBuffer.reduceLeft(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.maxBy(TraversableOnce.scala:225)
at scala.collection.AbstractTraversable.maxBy(Traversable.scala:105)
at org.apache.spark.streaming.DStreamGraph.getMaxInputStreamRememberDuration(DStreamGraph.scala:172)
at org.apache.spark.streaming.scheduler.JobGenerator.clearMetadata(JobGenerator.scala:270)
at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:87)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:86)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
16/09/15 17:05:25 INFO MemoryStore: ensureFreeSpace(15128) called with curMem=520144, maxMem=555755765
16/09/15 17:05:25 INFO MemoryStore: Block broadcast_24 stored as values in memory (estimated size 14.8 KB, free 529.5 MB)
Exception in thread "main" java.lang.NullPointerException
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at org.apache.spark.streaming.DStreamGraph$$anonfun$getMaxInputStreamRememberDuration$2.apply(DStreamGraph.scala:172)
at scala.collection.TraversableOnce$$anonfun$maxBy$1.apply(TraversableOnce.scala:225)
at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:51)
at scala.collection.IndexedSeqOptimized$class.reduceLeft(IndexedSeqOptimized.scala:68)
at scala.collection.mutable.ArrayBuffer.reduceLeft(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.maxBy(TraversableOnce.scala:225)
at scala.collection.AbstractTraversable.maxBy(Traversable.scala:105)
at org.apache.spark.streaming.DStreamGraph.getMaxInputStreamRememberDuration(DStreamGraph.scala:172)
at org.apache.spark.streaming.scheduler.JobGenerator.clearMetadata(JobGenerator.scala:270)
at org.apache.spark.streaming.scheduler.JobGenerator.org$apache$spark$streaming$scheduler$JobGenerator$$processEvent(JobGenerator.scala:182)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:87)
at org.apache.spark.streaming.scheduler.JobGenerator$$anon$1.onReceive(JobGenerator.scala:86)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
What's the correct way to do this?

Please try out the code below; it should solve your problem:
private JavaDStream<SparkFlumeEvent> getEvents(JavaStreamingContext jssc, List<String> hosts, List<String> ports) {
    List<JavaDStream<SparkFlumeEvent>> receivers = new ArrayList<JavaDStream<SparkFlumeEvent>>();
    for (String host : hosts) {
        for (String port : ports) {
            receivers.add(FlumeUtils.createStream(jssc, host, Integer.parseInt(port)));
        }
    }
    return jssc.union(receivers.get(0), receivers.subList(1, receivers.size()));
}
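Alternatively, if you prefer the loop from the question: union() returns a new DStream rather than modifying the one it is called on, so the result has to be reassigned on every iteration. A minimal sketch, assuming the same receivers list built in the question:

JavaDStream<SparkFlumeEvent> unionStreams = receivers.get(0);
for (int i = 1; i < receivers.size(); i++) {
    // Reassign: union() does not mutate unionStreams in place
    unionStreams = unionStreams.union(receivers.get(i));
}
return unionStreams;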

Related

Multithreaded Kafka Consumer not processing all the partitions in parallel

I have created a multithreaded Kafka consumer in which one thread is assigned to each partition (I have 100 partitions in total). I followed the https://cwiki.apache.org/confluence/display/KAFKA/Consumer+Group+Example example.
Below is the init method of my consumer.
consumer = kafka.consumer.Consumer.createJavaConsumerConnector(createConsumerConfig());
System.out.println("Kafka Consumer initialized.");
Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
topicCountMap.put(topicName, 100);
Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(topicName);
executor = Executors.newFixedThreadPool(100);
In the above init method, I get the list of Kafka streams (100 in total), each of which should be connected to one of the partitions (which is happening as expected).
Then I submit each of the streams to a different thread using the snippet below.
public Object call() {
    for (final KafkaStream stream : streams) {
        executor.execute(new StreamWiseConsumer(stream));
    }
    return true;
}
Below is the StreamWiseConsumer class.
public class StreamWiseConsumer extends Thread {

    ConsumerIterator<byte[], byte[]> consumerIterator;
    private KafkaStream m_stream;

    public StreamWiseConsumer(ConsumerIterator<byte[], byte[]> consumerIterator) {
        this.consumerIterator = consumerIterator;
    }

    public StreamWiseConsumer(KafkaStream kafkaStream) {
        this.m_stream = kafkaStream;
    }

    @Override
    public void run() {
        ConsumerIterator<byte[], byte[]> consumerIterator = m_stream.iterator();
        while (!Thread.currentThread().isInterrupted() && !interrupted) {
            try {
                if (consumerIterator.hasNext()) {
                    String reqId = UUID.randomUUID().toString();
                    System.out.println(reqId + " : Event received by threadId : " + Thread.currentThread().getId());
                    MessageAndMetadata<byte[], byte[]> messageAndMetaData = consumerIterator.next();
                    byte[] keyBytes = messageAndMetaData.key();
                    String key = null;
                    if (keyBytes != null) {
                        key = new String(keyBytes);
                    }
                    byte[] eventBytes = messageAndMetaData.message();
                    if (eventBytes == null) {
                        System.out.println("Topic: No event fetched for transaction Id:" + key);
                        continue;
                    }
                    String event = new String(eventBytes).trim();
                    // Some Processing code
                    System.out.println(reqId + " : Processing completed for threadId = " + Thread.currentThread().getId());
                    consumer.commitOffsets();
                }
            } catch (Exception ex) {
            }
        }
    }
}
Ideally, it should start processing all 100 partitions in parallel. Instead, one thread picks up some random number of events and processes them, then another thread starts processing from another partition. It looks like sequential processing, just spread across different threads. I was expecting processing to happen on all 100 threads. Am I missing something here?
Please find the logs at the links below:
https://drive.google.com/file/d/14b7gqPmwUrzUWewsdhnW8q01T_cQ30ES/view?usp=sharing
https://drive.google.com/file/d/1PO_IEsOJFQuerW0y-M9wRUB-1YJuewhF/view?usp=sharing
I doubt whether this is the right approach for vertically scaling Kafka consumption.
Kafka Streams inherently supports multi-threaded consumption: you increase the number of processing threads with the num.stream.threads configuration.
If you want 100 threads to process the 100 partitions, set num.stream.threads to 100.
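For illustration, here is a minimal, hypothetical sketch of a Kafka Streams application where the degree of parallelism is controlled purely by num.stream.threads. The application id, broker address and topic name are placeholders, and it assumes a recent Kafka Streams client on the classpath:

// Hypothetical sketch: consuming a topic with the Kafka Streams API,
// letting num.stream.threads decide how many threads process the partitions.
import java.util.Properties;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;

public class StreamsThreadingExample {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(StreamsConfig.APPLICATION_ID_CONFIG, "partition-per-thread-app"); // placeholder id
        props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");        // placeholder brokers
        props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        // One stream thread per partition: 100 partitions -> 100 threads.
        props.put(StreamsConfig.NUM_STREAM_THREADS_CONFIG, 100);

        StreamsBuilder builder = new StreamsBuilder();
        builder.<String, String>stream("topicName") // placeholder topic
               .foreach((key, value) -> System.out.println("Processing key " + key));

        KafkaStreams streams = new KafkaStreams(builder.build(), props);
        streams.start();
    }
}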

Converting UnixTimestamp to TIMEUUID for Cassandra

I'm learning Apache Cassandra 3.x.x and I'm trying to build some things to play around with. The problem is that I want to store data into a Cassandra table which contains these columns:
id (UUID - Primary Key) | Message (TEXT) | REQ_Timestamp (TIMEUUID) | Now_Timestamp (TIMEUUID)
REQ_Timestamp has the time when the message left the client at frontend level. Now_Timestamp, on the other hand, is the time when the message is finally stored in Cassandra. I need both timestamps because I want to measure the amount of time it takes to handle the request from its origin until the data is safely stored.
Creating the Now_Timestamp is easy: I just use the now() function and it generates the TIMEUUID automatically. The problem arises with REQ_Timestamp. How can I convert that Unix timestamp to a TIMEUUID so Cassandra can store it? Is this even possible?
The architecture of my backend is this: the frontend sends the data as JSON to a web service that processes it and stores it in Kafka. Then, a Spark Streaming job takes that Kafka log and puts it into Cassandra.
This is my WebService that puts the data in Kafka.
#Path("/")
public class MemoIn {
#POST
#Path("/in")
#Consumes(MediaType.APPLICATION_JSON)
#Produces(MediaType.TEXT_PLAIN)
public Response goInKafka(InputStream incomingData){
StringBuilder bld = new StringBuilder();
try {
BufferedReader in = new BufferedReader(new InputStreamReader(incomingData));
String line = null;
while ((line = in.readLine()) != null) {
bld.append(line);
}
} catch (Exception e) {
System.out.println("Error Parsing: - ");
}
System.out.println("Data Received: " + bld.toString());
JSONObject obj = new JSONObject(bld.toString());
String line = obj.getString("id_memo") + "|" + obj.getString("id_writer") +
"|" + obj.getString("id_diseased")
+ "|" + obj.getString("memo") + "|" + obj.getLong("req_timestamp");
try {
KafkaLogWriter.addToLog(line);
} catch (Exception e) {
e.printStackTrace();
}
return Response.status(200).entity(line).build();
}
}
Here's my Kafka Writer
package main.java.vcemetery.webservice;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
import org.apache.kafka.clients.producer.Producer;

public class KafkaLogWriter {

    public static void addToLog(String memo) throws Exception {
        // private static Scanner in;
        String topicName = "MemosLog";
        /*
         First, we set the properties of the Kafka Log
        */
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("acks", "all");
        props.put("retries", 0);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // We create the producer
        Producer<String, String> producer = new KafkaProducer<>(props);
        // We send the line into the producer
        producer.send(new ProducerRecord<>(topicName, memo));
        // We close the producer
        producer.close();
    }
}
And finally, here's what I have for my Spark Streaming job:
public class MemoStream {

    public static void main(String[] args) throws Exception {
        Logger.getLogger("org").setLevel(Level.ERROR);
        Logger.getLogger("akka").setLevel(Level.ERROR);
        // Create the context with a 1 second batch size
        SparkConf sparkConf = new SparkConf().setAppName("KafkaSparkExample").setMaster("local[2]");
        JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(10));

        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", "localhost:9092");
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "group1");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        /* Create the collection of topics to consume; in this case only one topic */
        Collection<String> topics = Arrays.asList("MemosLog");

        final JavaInputDStream<ConsumerRecord<String, String>> kafkaStream =
                KafkaUtils.createDirectStream(
                        ssc,
                        LocationStrategies.PreferConsistent(),
                        ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
                );

        kafkaStream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
        // Split each batch of Kafka data into memos (a splittable stream)
        JavaDStream<String> stream = kafkaStream.map(record -> (record.value().toString()));
        // Then, we split each stream into lines or memos
        JavaDStream<String> memos = stream.flatMap(x -> Arrays.asList(x.split("\n")).iterator());
        /*
         To split each memo into sections of ids and messages, we have to escape the pipe character with \\
        */
        JavaDStream<String> sections = memos.flatMap(y -> Arrays.asList(y.split("\\|")).iterator());
        sections.print();

        sections.foreachRDD(rdd -> {
            rdd.foreachPartition(partitionOfRecords -> {
                // We establish the connection with Cassandra
                Cluster cluster = null;
                try {
                    cluster = Cluster.builder()
                            .withClusterName("VCemeteryMemos") // cluster name
                            .addContactPoint("127.0.0.1")      // host IP
                            .build();
                } finally {
                    if (cluster != null) cluster.close();
                }
                while (partitionOfRecords.hasNext()) {
                }
            });
        });

        ssc.start();
        ssc.awaitTermination();
    }
}
Thank you in advance.
Cassandra has no function to convert from a UNIX timestamp; you have to do the conversion on the client side.
Ref: https://docs.datastax.com/en/cql/3.3/cql/cql_reference/timeuuid_functions_r.html
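For example, with the DataStax Java driver 3.x (the same driver family the Spark job above uses for its Cluster builder), the client-side conversion can be done with UUIDs.startOf(), which builds a time-based UUID from a Unix timestamp in milliseconds. A minimal sketch; the timestamp literal is just an example and binding the values to the actual INSERT statement is left out:

import java.util.UUID;
import com.datastax.driver.core.utils.UUIDs;

public class TimeuuidConversion {
    public static void main(String[] args) {
        long reqTimestampMs = 1473955525000L;              // example Unix timestamp (ms) received from the frontend
        UUID reqTimeuuid = UUIDs.startOf(reqTimestampMs);  // TIMEUUID representing that instant
        UUID nowTimeuuid = UUIDs.timeBased();              // client-side equivalent of now()
        System.out.println(reqTimeuuid + " / " + nowTimeuuid);
        // These UUID values can then be bound to the REQ_Timestamp / Now_Timestamp
        // columns of a prepared INSERT statement.
    }
}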

MapWithState gives java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast while recovering from checkpoint

I am facing an issue with a Spark Streaming job where I am trying to use broadcast, mapWithState and checkpointing together.
Following is the usage:
Since I have to pass some connection object (which is not serializable) to the executors, I am using org.apache.spark.broadcast.Broadcast.
Since we have to maintain some cached information, I am using stateful streaming with mapWithState.
I am also using checkpointing of my streaming context.
I also need to pass the broadcasted connection object into the mapWithState function for fetching some data from an external source.
The flow works just fine when the context is created fresh. However, when I crash the application and try to recover from the checkpoint, I get a ClassCastException.
I have put a small code snippet based on an example from asyncified.io on GitHub to reproduce the issue:
My broadcast logic is in yuvalitzchakov.utils.KafkaWriter.scala.
The dummy logic of the application is in yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast.scala.
Dummy snippet of the code:
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("spark-stateful-example")
...
val prop = new Properties()
...
val config: Config = ConfigFactory.parseString(prop.toString)
val sc = new SparkContext(sparkConf)
val ssc = StreamingContext.getOrCreate(checkpointDir, () => {
  println("creating context newly")
  clearCheckpoint(checkpointDir)
  val streamingContext = new StreamingContext(sc, Milliseconds(batchDuration))
  streamingContext.checkpoint(checkpointDir)
  ...
  val kafkaWriter = SparkContext.getOrCreate().broadcast(kafkaErrorWriter)
  ...
  val stateSpec = StateSpec.function((key: Int, value: Option[UserEvent], state: State[UserSession]) =>
      updateUserEvents(key, value, state, kafkaWriter)).timeout(Minutes(jobConfig.getLong("timeoutInMinutes")))

  kafkaTextStream
    .transform(rdd => {
      offsetsQueue.enqueue(rdd.asInstanceOf[HasOffsetRanges].offsetRanges)
      rdd
    })
    .map(deserializeUserEvent)
    .filter(_ != UserEvent.empty)
    .mapWithState(stateSpec)
    .foreachRDD { rdd =>
      ...
      some logic
      ...
    }

  streamingContext
})

ssc.start()
ssc.awaitTermination()

def updateUserEvents(key: Int,
                     value: Option[UserEvent],
                     state: State[UserSession],
                     kafkaWriter: Broadcast[KafkaWriter]): Option[UserSession] = {
  ...
  kafkaWriter.value.someMethodCall()
  ...
}
I get the following error when
kafkaWriter.value.someMethodCall()
is executed:
17/08/01 21:20:38 ERROR Executor: Exception in task 2.0 in stage 3.0 (TID 4)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to yuvalitzchakov.utils.KafkaWriter
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$.updateUserSessions$1(SparkStatefulRunnerWithBroadcast.scala:144)
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$.updateUserEvents(SparkStatefulRunnerWithBroadcast.scala:150)
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$$anonfun$2.apply(SparkStatefulRunnerWithBroadcast.scala:78)
at yuvalitzchakov.stateful.SparkStatefulRunnerWithBroadcast$$anonfun$2.apply(SparkStatefulRunnerWithBroadcast.scala:77)
at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:181)
at org.apache.spark.streaming.StateSpec$$anonfun$1.apply(StateSpec.scala:180)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:57)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$$anonfun$updateRecordWithData$1.apply(MapWithStateRDD.scala:55)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at org.apache.spark.streaming.rdd.MapWithStateRDDRecord$.updateRecordWithData(MapWithStateRDD.scala:55)
at org.apache.spark.streaming.rdd.MapWithStateRDD.compute(MapWithStateRDD.scala:159)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:336)
at org.apache.spark.rdd.RDD$$anonfun$8.apply(RDD.scala:334)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1005)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:996)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:936)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:996)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:700)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Basically, kafkaWriter is the broadcast variable and kafkaWriter.value should return the broadcasted object, but it is returning a SerializableConfiguration, which cannot be cast to the desired type.
Thanks in advance for any help!
A broadcast variable cannot be used with mapWithState (or transformation operations in general) if we need to recover from a checkpoint directory in Spark Streaming. In that case it can only be used inside output operations, because it requires the Spark context to lazily initialize the broadcast:
class JavaWordBlacklist {

    private static volatile Broadcast<List<String>> instance = null;

    public static Broadcast<List<String>> getInstance(JavaSparkContext jsc) {
        if (instance == null) {
            synchronized (JavaWordBlacklist.class) {
                if (instance == null) {
                    List<String> wordBlacklist = Arrays.asList("a", "b", "c");
                    instance = jsc.broadcast(wordBlacklist);
                }
            }
        }
        return instance;
    }
}

class JavaDroppedWordsCounter {

    private static volatile LongAccumulator instance = null;

    public static LongAccumulator getInstance(JavaSparkContext jsc) {
        if (instance == null) {
            synchronized (JavaDroppedWordsCounter.class) {
                if (instance == null) {
                    instance = jsc.sc().longAccumulator("WordsInBlacklistCounter");
                }
            }
        }
        return instance;
    }
}

wordCounts.foreachRDD((rdd, time) -> {
    // Get or register the blacklist Broadcast
    Broadcast<List<String>> blacklist = JavaWordBlacklist.getInstance(new JavaSparkContext(rdd.context()));
    // Get or register the droppedWordsCounter Accumulator
    LongAccumulator droppedWordsCounter = JavaDroppedWordsCounter.getInstance(new JavaSparkContext(rdd.context()));
    // Use blacklist to drop words and use droppedWordsCounter to count them
    String counts = rdd.filter(wordCount -> {
        if (blacklist.value().contains(wordCount._1())) {
            droppedWordsCounter.add(wordCount._2());
            return false;
        } else {
            return true;
        }
    }).collect().toString();
    String output = "Counts at time " + time + " " + counts;
});

How to check the number of entries on a local member

my prime member
public static void main(String[] args) throws InterruptedException {
    Config config = new Config();
    config.setProperty(GroupProperty.ENABLE_JMX, "true");
    config.setProperty(GroupProperty.BACKPRESSURE_ENABLED, "true");
    config.setProperty(GroupProperty.SLOW_OPERATION_DETECTOR_ENABLED, "true");
    config.getSerializationConfig().addPortableFactory(1, new MyPortableFactory());

    HazelcastInstance hz = Hazelcast.newHazelcastInstance(config);
    IMap<Integer, Rule> ruleMap = hz.getMap("ruleMap");

    // TODO generate rule map data ; more than 100,000 entries
    generateRuleMapData(ruleMap);
    logger.info("generate rule finised!");
    // TODO rule map index

    // health check
    PartitionService partitionService = hz.getPartitionService();
    LocalMapStats mapStatistics = ruleMap.getLocalMapStats();
    while (true) {
        logger.info("isClusterSafe:{},isLocalMemberSafe:{},number of entries owned on this node = {}",
                partitionService.isClusterSafe(), partitionService.isLocalMemberSafe(),
                mapStatistics.getOwnedEntryCount());
        Thread.sleep(1000);
    }
}
logs
2016-06-28 13:53:05,048 INFO [main] b.PrimeMember (PrimeMember.java:41) - isClusterSafe:true,isLocalMemberSafe:true,number of entries owned on this node = 997465
2016-06-28 13:53:06,049 INFO [main] b.PrimeMember (PrimeMember.java:41) - isClusterSafe:true,isLocalMemberSafe:true,number of entries owned on this node = 997465
2016-06-28 13:53:07,050 INFO [main] b.PrimeMember (PrimeMember.java:41) - isClusterSafe:true,isLocalMemberSafe:true,number of entries owned on this node = 997465
my slave member
public static void main(String[] args) throws InterruptedException {
    Config config = new Config();
    config.setProperty(GroupProperty.ENABLE_JMX, "true");
    config.setProperty(GroupProperty.BACKPRESSURE_ENABLED, "true");
    config.setProperty(GroupProperty.SLOW_OPERATION_DETECTOR_ENABLED, "true");

    HazelcastInstance hz = Hazelcast.newHazelcastInstance(config);
    IMap<Integer, Rule> ruleMap = hz.getMap("ruleMap");

    PartitionService partitionService = hz.getPartitionService();
    LocalMapStats mapStatistics = ruleMap.getLocalMapStats();
    while (true) {
        logger.info("isClusterSafe:{},isLocalMemberSafe:{},number of entries owned on this node = {}",
                partitionService.isClusterSafe(), partitionService.isLocalMemberSafe(),
                mapStatistics.getOwnedEntryCount());
        Thread.sleep(1000);
    }
}
logs
2016-06-28 14:05:53,543 INFO [main] b.SlaveMember (SlaveMember.java:31) - isClusterSafe:false,isLocalMemberSafe:false,number of entries owned on this node = 412441
2016-06-28 14:05:54,556 INFO [main] b.SlaveMember (SlaveMember.java:31) - isClusterSafe:false,isLocalMemberSafe:false,number of entries owned on this node = 412441
2016-06-28 14:05:55,563 INFO [main] b.SlaveMember (SlaveMember.java:31) - isClusterSafe:false,isLocalMemberSafe:false,number of entries owned on this node = 412441
2016-06-28 14:05:56,578 INFO [main] b.SlaveMember (SlaveMember.java:31) - isClusterSafe:false,isLocalMemberSafe:false,number of entries owned on this node = 412441
My question is: why does the number of entries owned on the prime member not change after the cluster adds a slave member?
You should fetch the statistics every second, i.e. call getLocalMapStats() inside the loop:
while (true) {
    LocalMapStats mapStatistics = ruleMap.getLocalMapStats();
    logger.info(
            "isClusterSafe:{},isLocalMemberSafe:{},rulemap.size:{}, number of entries owned on this node = {}",
            partitionService.isClusterSafe(), partitionService.isLocalMemberSafe(), ruleMap.size(),
            mapStatistics.getOwnedEntryCount());
    Thread.sleep(1000);
}
Another option is to use localKeySet(), which returns the locally owned set of keys:
ruleMap.localKeySet().size()
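For example, reusing the hz instance, ruleMap and logger from the members above, a small polling sketch:

IMap<Integer, Rule> ruleMap = hz.getMap("ruleMap");
while (true) {
    // localKeySet() is evaluated on each call, so the count follows partition migrations
    int locallyOwned = ruleMap.localKeySet().size();
    logger.info("number of locally owned keys = {}", locallyOwned);
    Thread.sleep(1000);
}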

Apache Thrift RejectedExecutionException

My application consists of a PHP client and a Java backend wired together by Apache Thrift, and I've run into a problem with the Thrift thread pool.
When I run PHP tests that send requests via Thrift to the backend, I get an exception:
TSocket: timed out reading 4 bytes from localhost:34567
And at the same time I see the following in logs on Java side:
java.util.concurrent.RejectedExecutionException: Task org.apache.thrift.server.TThreadPoolServer$WorkerProcess@17a0d6f rejected from java.util.concurrent.ThreadPoolExecutor@80080[Running, pool size = 50, active threads = 50, queued tasks = 0, completed tasks = 0]
So, from my point of view, this means the single-threaded, synchronous PHP application is using up all of the configured threads (50), which is strange.
I use Apache Thrift 0.9.0, PHP 5.4.13 and Java 1.7.0_45.
Java server code:
public class ThriftServer implements TransportServer {

    private static Logger logger = LoggerFactory.getLogger(ThriftServer.class);

    private final TServer server;
    private final SprootService service;
    private final int port;

    public ThriftServer(Integer thriftPort) throws IOException, ThriftException, ConfigurationException, ConfigValidationException {
        this.service = new SprootService();
        this.port = thriftPort;
        try {
            TProcessor processor = new TSprootService.Processor<SprootService>(service);
            TServerSocket socket = new TServerSocket(port);
            TBinaryProtocol.Factory protocolFactory = new TBinaryProtocol.Factory(true, true);
            ThreadPoolServerArgs targs = new ThreadPoolServerArgs(socket, processor, protocolFactory, 5, 50);
            this.server = new TThreadPoolServer(targs);
        } catch (TTransportException tte) {
            throw new ThriftException("Exception has occured during attempt to create Thrift server on port " + port, tte);
        }
    }

    public void start() {
        logger.info("Thrift server is started on port " + port);
        System.out.println("Thrift server is started on port " + port);
        server.serve();
    }
}
PHP client code:
class SprootClient {

    private $thriftClient = null;
    private $mappersRegistry = null;
    private $tSocket = null;

    public function __construct($host, $port) {
        $this->thriftClient = $this->initThriftClient($host, $port);
        $this->mappersRegistry = new MappersRegistry();
    }

    public function __destruct() {
        $this->tSocket->close();
    }

    private function initThriftClient($host, $port) {
        settype($port, 'integer');
        $this->tSocket = new TSocket($host, $port);
        $this->tSocket->setRecvTimeout(3000);
        try {
            $this->tSocket->open();
            $protocol = new TBinaryProtocol($this->tSocket);
            return new TSprootServiceClient($protocol);
        } catch (Exception $e) {
            throw new SprootClientException($e->getMessage() . " " . $e->getTraceAsString());
        }
    }

    private function getThriftClient() {
        return $this->thriftClient;
    }

    public function put($cacheName, $key, $domainObject) {
        if ($domainObject == null || $key == null) {
            throw new SprootClientException("Both key and value cannot be null or empty");
        }
        try {
            $mapper = $this->mappersRegistry->getDomainMapper($cacheName);
            $dataObject = $mapper->toTDataObject($domainObject);
            $this->getThriftClient()->put($cacheName, $key, $dataObject);
        } catch (Exception $e) {
            throw new SprootClientException($e->getMessage() . " " . $e->getTraceAsString());
        }
    }
}
