Node kafka consumer to process messages in sequence - node.js

I have a kafka topic that I want to consume with a node app. The node app must process the messages from the topic in sequence, one by one, not many at the same time.
I tried the following code, but it does not do what I want. When there are messages waiting in the topic and this code is started, the 'message' event fires immediately for all of them. The first message acquires the mutex lock first, but the rest are processed in arbitrary order.
var mutex = require('node-mutex')();
var crypto = require('crypto');
var mutexToken = crypto.randomBytes(64).toString('hex');
var kafka = require('kafka-node');
var Consumer = kafka.Consumer;
var client = new kafka.Client('localhost:2181');
var consumer = new Consumer(
  client,
  [
    { topic: 'my_topic' }
  ]
);
consumer.on('message', function(message) {
  console.log("new message");
  mutex
    .lock(mutexToken)
    .then(function(unlock) {
      console.log(message);
      unlock();
    });
});
Is it possible to consume the messages one by one, synchronously? Maybe with some other library?

I believe you can control the message offset directly by explicitly disabling the autoCommit feature.
Here is the link to the consumer documentation:
https://www.npmjs.com/package/kafka-node#highlevelconsumer
Here is a link to an example with autoCommit set off:
https://github.com/SOHU-Co/kafka-node/blob/master/example/consumer.js
https://github.com/SOHU-Co/kafka-node/blob/master/example/offset.js
I have never needed to disable the autoCommit feature, so I can't speak to the implementation.
From the test code it looks like it should be something along these lines:
// (from kafka-node's test suite; noop, offsetOutOfRange, done and
// EXISTS_TOPIC_2 are defined elsewhere in the tests)
var Offset = kafka.Offset;
var offset = new Offset(client);
var topics = [ { topic: EXISTS_TOPIC_2 } ];
var options = { autoCommit: false, groupId: '_groupId_1_test' };
var consumer = new Consumer(client, topics, options);
var count = 0;
consumer.on('error', noop);
consumer.on('offsetOutOfRange', function (topic) {
  offsetOutOfRange(topic, this);
});
consumer.on('message', function (message) {
  message.topic.should.equal(EXISTS_TOPIC_2);
  message.value.should.equal('hello kafka');
  message.partition.should.equal(0);
  offset.commit('_groupId_1_test', [message], function (err) {
    if (count++ === 0) done(err);
  });
});
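Adapted to your question, committing offsets manually might look something like this (a sketch, not tested; my_group and processMessage are illustrative names, not from the library). Note this only controls when offsets are recorded; on its own it does not stop further messages from being emitted:
var offset = new kafka.Offset(client);
var options = { autoCommit: false, groupId: 'my_group' };
var consumer = new Consumer(client, [ { topic: 'my_topic' } ], options);
consumer.on('message', function (message) {
  processMessage(message, function (err) { // your own async handler (placeholder)
    if (err) return console.error(err);
    // commit the offset only once the message is fully processed
    offset.commit('my_group', [message], function (commitErr) {
      if (commitErr) console.error(commitErr);
    });
  });
});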
OK, I looked at the API a little more and thought this might be an angle worth investigating for you:
Consumer.prototype.pauseTopics = function (topics) {
  if (!this.pausedPayloads) this.pausedPayloads = [];
  pauseOrResume(this.payloads, this.pausedPayloads, topics);
};

Consumer.prototype.resumeTopics = function (topics) {
  if (!this.pausedPayloads) this.pausedPayloads = [];
  var reFetch = !this.payloads.length;
  pauseOrResume(this.pausedPayloads, this.payloads, topics);
  reFetch = reFetch && this.payloads.length;
  if (reFetch) this.fetch();
};
From the documentation:
pause()
Pause the consumer. Calling pause does not automatically stop messages
from being emitted. This is because pause just stops the kafka
consumer fetch loop. Each iteration of the fetch loop can obtain a
batch of messages (limited by fetchMaxBytes).
So if each fetch returns only one message (perhaps because fetchMaxBytes is small enough that a fetch yields a single message), then pause will stop the next fetch from happening. But if a fetch returned multiple messages, pause will not prevent more than one message being emitted.
I think to be 100% certain you would need to write the logic to handle the messages synchronously. Perhaps emit messages into a queue, and process off of the queue?
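For example, a minimal sketch of that queue idea (untested; handleMessage stands in for your own async processing):
var queue = [];
var working = false;

consumer.on('message', function (message) {
  queue.push(message);
  processNext();
});

function processNext() {
  if (working || queue.length === 0) return;
  working = true;
  var message = queue.shift();
  handleMessage(message, function () { // your handler calls back when done
    working = false;
    processNext(); // strictly one at a time, in arrival order
  });
}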

Related

Node.js application listens to message queue and add messages to redis asynchronously

I am developing a node.js component that listens to a message queue (ActiveMQ) and adds the received messages to redis in batches (must be 20 per batch).
There is no problem when number of messages received from ActiveMQ is 10 per second or less.
My problem is that messages arrive roughly every 4 milliseconds, which sometimes causes more than 20 records to be added to a batch.
const stompit = require('stompit');
var uuid = require('node-uuid');
var Redis = require('ioredis');

var redis = new Redis();
var pipeline = redis.pipeline();
var batchCounter = 0;

stompit.connect({ host: 'localhost', port: 61613 }, function(err1, client) {
  client.subscribe({ destination: 'MyQueue' }, function(err2, msg) {
    msg.readString('UTF-8', function(err3, body) {
      if (batchCounter >= 20) {
        pipeline.exec(function(err4, results) {
          pipeline = redis.pipeline();
          batchCounter = 0;
          console.log(results);
        });
      }
      batchCounter++;
      pipeline.set(uuid.v1(), JSON.stringify(body));
      //client.disconnect();
    });
  });
});
How can I fix this problem?
Thank you.
Try resetting the pipeline before calling the .exec method, which I assume is asynchronous. Because .exec completes at some point in the future, the increment and pipeline.set can run before it finishes.
The following code saves the current pipeline and creates a new one synchronously before the .exec
if (batchCounter >= 20) {
  let fullpipeline = pipeline;
  pipeline = redis.pipeline();
  batchCounter = 0;
  fullpipeline.exec(function(err4, results) {
    console.log(err4, results);
  });
}
New messages should then only be appended to the new pipeline.
I ended up controlling the number of records per batch using flags. I believe there is another, probably more efficient, fix, which is controlling the rate of reading from ActiveMQ, but in this situation flags did the job for me. The complete code using flags is at the following link:
https://github.com/TamerB/ToActiveMQToRedis/blob/master/consumer/app.js
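The general shape of that flag approach might be something like this (an illustrative sketch only, not the code from the linked repo; addToBatch is a made-up helper name):
var flushing = false; // flag: a flush is already in progress

function addToBatch(key, value) {
  pipeline.set(key, value);
  batchCounter++;
  if (batchCounter >= 20 && !flushing) {
    flushing = true;
    var full = pipeline; // swap in a fresh pipeline synchronously
    pipeline = redis.pipeline();
    batchCounter = 0;
    full.exec(function(err, results) {
      flushing = false;
      console.log(err, results);
    });
  }
}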

How to work around amqplib's Channel#consume odd signature?

I am writing a worker that uses amqplib's Channel#consume method. I want this worker to wait for jobs and process them as soon as they appear in the queue.
I wrote my own module to abstract away amqplib; here are the relevant functions for getting a connection, setting up the queue and consuming a message:
const getConnection = function(host) {
  return amqp.connect(host);
};

const createChannel = function(conn) {
  connection = conn;
  return conn.createConfirmChannel();
};

const assertQueue = function(channel, queue) {
  return channel.assertQueue(queue);
};

const consume = Promise.method(function(channel, queue, processor) {
  processor = processor || function(msg) { if (msg) Promise.resolve(msg); };
  return channel.consume(queue, processor);
});

const setupQueue = Promise.method(function setupQueue(queue) {
  const amqp_host = 'amqp://' + ((host || process.env.AMQP_HOST) || 'localhost');
  return getConnection(amqp_host)
    .then(conn => createChannel(conn)) // -> returns a `Channel` object
    .tap(channel => assertQueue(channel, queue));
});

// (defined as a method on the exported queueManager object)
consumeJob: Promise.method(function consumeJob(queue) {
  return setupQueue(queue)
    .then(channel => consume(channel, queue));
})
My problem is with Channel#consume's odd signature. From http://www.squaremobius.net/amqp.node/channel_api.html#channel_consume:
#consume(queue, function(msg) {...}, [options, [function(err, ok) {...}]])
The final callback is not where the magic happens; the message processing actually goes in the second argument, and that breaks the flow of promises.
This is how I planned on using it:
return queueManager.consumeJob(queue)
.then(msg => {
// do some processing
});
But it doesn't work. If there are no messages in the queue, the promise is rejected, and then nothing happens when a message is later dropped into the queue. If there is a message, only one message is processed and then the worker stalls, because it has exited the "processor" function from the Channel#consume call.
How should I go about it? I want to keep the queueManager abstraction so my code is easier to reason about but I don't know how to do it... Any pointers?
As @idbehold said, Promises can only be resolved once. If you want to process messages as they come in, there is no other way than to use this function. Channel#get will only check the queue once and then return; it wouldn't work for a scenario where you need a worker.
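If you want to keep the queueManager abstraction, one option is to accept the per-message processing as a handler instead of returning a promise for a single message. A rough sketch of consumeJob along those lines (untested; it assumes amqplib's channel.consume/channel.ack/channel.nack and reuses the question's setupQueue):
consumeJob: Promise.method(function consumeJob(queue, processor) {
  return setupQueue(queue)
    .then(channel => channel.consume(queue, function(msg) {
      if (msg === null) return; // consumer was cancelled
      // run the caller's processor on every delivery; ack/nack when it settles
      Promise.resolve(processor(msg))
        .then(() => channel.ack(msg))
        .catch(() => channel.nack(msg));
    }));
})
The worker then stays alive and handles each delivery as it arrives, e.g. queueManager.consumeJob(queue, msg => { /* do some processing */ }).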
Just as an option: you can present your application as a stream of messages (or events). There is a library for this: http://highlandjs.org/#examples
Your code should look like this (it isn't a finished sample, but I hope it illustrates the idea):
let messageStream = _((push, next) => {
  consume(queue, (msg) => {
    push(null, msg);
  });
});
// now you can operate with your stream in functional style
messageStream.map((msg) => msg + 'some value').each((msg) => { /* do something with msg */ });
This approach provides you a lot of primitives for synchronization and transformation

How to consume just one message from rabbit mq on nodejs

I'm using the amqp.node library to integrate RabbitMQ into my system.
In the consumer, I want to process just one message at a time: process it, acknowledge it, then consume the next message from the queue.
The current code is:
// Consumer
open.then(function(conn) {
  var ok = conn.createChannel();
  ok = ok.then(function(ch) {
    ch.assertQueue(q);
    ch.consume(q, function(msg) {
      if (msg !== null) {
        othermodule.processMessage(msg, function(error, response) {
          console.log(msg.content.toString());
          ch.ack(msg);
        });
      }
    });
  });
  return ok;
}).then(null, console.warn);
ch.consume dispatches all the messages in the queue at once, and the othermodule.processMessage callbacks do not run one after another.
I want to wait for the othermodule function to finish before consuming the next message from the queue.
At this moment (2018), I think the RabbitMQ team has an option for this:
https://www.rabbitmq.com/tutorials/tutorial-two-javascript.html
ch.prefetch(1);
In order to defeat that we can use the prefetch method with the value
of 1. This tells RabbitMQ not to give more than one message to a
worker at a time. Or, in other words, don't dispatch a new message to
a worker until it has processed and acknowledged the previous one.
Instead, it will dispatch it to the next worker that is not still
busy.
Following the example here:
https://www.npmjs.com/package/amqplib
// Consumer
function consumer(conn) {
  var ok = conn.createChannel(on_open);
  function on_open(err, ch) {
    if (err != null) bail(err);
    ch.assertQueue(q);
    // IMPORTANT
    ch.prefetch(1);
    ch.consume(q, function(msg) {
      if (msg !== null) {
        console.log(msg.content.toString());
        ch.ack(msg);
      }
    });
  }
}
Refs: http://www.squaremobius.net/amqp.node/channel_api.html#channel_prefetch
You need to set a prefetch value as shown in this example:
https://github.com/squaremo/amqp.node/blob/master/examples/tutorials/rpc_server.js#L22
When you create the model, you need to set the QoS on it. Here is how we would do it in C#:
var _model = rabbitConnection.CreateModel();
// Configure the quality of service for the model. Here is what each setting means:
// BasicQos(0 = "Don't send me a new message until I've finished",
//          _fetchSize = "Send me N messages at a time",
//          false = "Apply to this model only")
_model.BasicQos(0, _fetchSize, false);
var consumerTag = _model.BasicConsume(rabbitQueue.QueueName, false, _consumerName, queueingConsumer);
You have to set QoS = 1.
ch = ...
ch.qos(1);
ch.consume(q, msg => { ... });
(javascript)

How do I output a stream of tuples from a Storm spout with emit() and sync()?

(xpost github issue)
I'm new to Storm. I found the helpful node-storm library and I have successfully submitted topologies, but I can't get my spout to emit a stream of tuples.
node-storm's wordcount example works fine.
I want a spout that subscribes to a websocket and outputs any messages as tuples.
Here's my attempt so far. I think I have some misconfiguration, because I know my wsEmitter is emitting 'future' events, but my Storm UI shows zero spout emits.
I suspect that maybe I shouldn't be binding the listener inside the spout function?
Does this function get invoked multiple times? (looks like it... see https://github.com/RallySoftware/node-storm/blob/master/lib/spout.js#L4 )
What does sync actually do and when should I use it?
var storm = require('node-storm');
var wsEmitter = require('./wsEmitter.js')();
wsEmitter.init(); // subscribe to websocket

var futuresSpout = storm.spout(function(sync) {
  var self = this;
  console.log('subscribing to ws');
  wsEmitter.on('future', function(data) { // websocket data arrived
    self.emit([data]);
    sync();
  });
})
.declareOutputFields(["a"]);
Turns out I had two problems. First, my topology wasn't executing because one of my bolts (not shown) failed to call .declareOutputFields().
Second, I needed to delay the emits from the spout until the supervisor asks for one emit with nextTick(). I did that by buffering any incoming messages until the supervisor calls the spout:
module.exports = (function() {
  var storm = require('node-storm');
  var wsEmitter = require('./wsEmitter.js')();
  wsEmitter.init();

  var queue = [];
  var queueEmpty = true;

  wsEmitter.on('thing', function(data) {
    var trade = JSON.parse(data);
    trade.timeReported = new Date().valueOf();
    queue.push(trade);
    queueEmpty = false;
  });

  return storm.spout(function(sync) {
    var self = this;
    setTimeout(function() {
      if (!queueEmpty) {
        self.emit([queue.shift()]);
        queueEmpty = (queue.length === 0);
      }
      sync();
    }, 100);
  })
  .declareOutputFields(['trade']);
})();

Balancing slow I/O in a fast read stream

In node.js I have a read stream that I wish to reformat and write to a database. As the read is fast and the write is slow, the queue of pending writes could overwhelm the process (assume the stream is GBs of data). How do I force the read to wait for the write part of the code, so this does not happen, without blocking?
var request = http.get({
  host: 'api.geonames.org',
  port: 80,
  path: '/children?' + qs.stringify({
    geonameId: geonameId,
    username: "demo"
  })
}).on('response', function(response) {
  response.setEncoding('utf8');
  var xml = new XmlStream(response, 'utf8');
  xml.on('endElement: geoname ', function(input) {
    console.log('geoname');
    var output = new Object();
    output.Name = input.name;
    output.lat = input.lat;
    output.lng = input.lng;
    output._key = input.geonameId;
    data.db.document.create(output, data.doc, function(callback) {
      // this is really slow.
    });
    // i do not want to return from here and receive more data
    // until the 'create' above has completed
  });
});
I just ran into this problem last night, and in my hackathon-induced, sleep-deprived state, here is how I solved it:
I incremented a counter whenever I sent a job out to be processed, and decremented it when the operation completed. To keep the outbound traffic from overwhelming the other service, I paused the stream when there was a certain number of pending outbound requests. The code is very similar to the following.
var instream = fs.createReadStream('./combined.csv');
var outstream = new stream;
var inProcess = 0;
var paused = false;
var rl = readline.createInterface(instream, outstream);

rl.on('line', function(line) {
  inProcess++;
  if (inProcess > 100) {
    console.log('pausing input to clear queue');
    rl.pause();
    paused = true;
  }
  someService.doSomethingSlow(line, function(err) {
    inProcess--;
    if (paused && inProcess < 10) {
      console.log('resuming stream');
      paused = false;
      rl.resume();
    }
    if (err) throw err;
  });
});

rl.on('end', function() {
  rl.close();
});
Not the most elegant solution, but it worked and allowed me to process the million+ lines without running out of memory or throttling the other service.
My solution simply extends an empty stream.Writable and is fundamentally identical to @Timothy's, but uses events and doesn't rely on Streams1 .pause() and .resume() (which didn't seem to have any effect on my data pipeline, anyway).
var stream = require("stream");

var liveRequests = 0;
var maxLiveRequests = 100;
var streamPaused = false;

var requestClient = new stream.Writable();

function requestCompleted() {
  liveRequests--;
  if (streamPaused && liveRequests < maxLiveRequests) {
    streamPaused = false;
    requestClient.emit("resumeStream");
  }
}

requestClient._write = function(data, enc, next) {
  makeRequest(data, requestCompleted);
  liveRequests++;
  if (liveRequests >= maxLiveRequests) {
    streamPaused = true;
    requestClient.once("resumeStream", function resume() {
      next();
    });
  } else {
    next();
  }
};
A counter, liveRequests, keeps track of the number of concurrent requests; it is incremented whenever makeRequest() is called and decremented when a request completes (i.e., when requestCompleted() is called). If a request has just been made and liveRequests exceeds maxLiveRequests, we pause the stream by setting streamPaused. If a request completes while the stream is paused and liveRequests is now less than maxLiveRequests, we can resume the stream. Since subsequent data items are read by _write() when its next() callback is called, we can simply defer the latter with an event listener on our custom "resumeStream" event, which mimics pausing/resuming.
Now, simply readStream.pipe(requestClient).
Edit: I abstracted this solution, along with automatic batching of input data, in a package.
