How do I output a stream of tuples from a Storm spout with emit() and sync()?

(xpost github issue)
I'm new to Storm. I found the helpful node-storm library and I have successfully submitted topologies, but I can't get my spout to emit a stream of tuples.
node-storm's wordcount example works fine.
I want a spout that subscribes to a websocket and outputs any messages as tuples.
Here's my attempt so far. I think I have some misconfiguration, because I know my wsEmitter is emitting 'future' events, but my Storm UI shows zero spout emits.
I suspect that maybe I shouldn't be binding the listener inside the spout function?
Does this function get invoked multiple times? (looks like it... see https://github.com/RallySoftware/node-storm/blob/master/lib/spout.js#L4 )
What does sync actually do and when should I use it?
var storm = require('node-storm');
var wsEmitter = require('./wsEmitter.js')();
wsEmitter.init(); // subscribe to websocket

var futuresSpout = storm.spout(function(sync) {
    var self = this;
    console.log('subscribing to ws');
    wsEmitter.on('future', function(data) { // websocket data arrived
        self.emit([data]);
        sync();
    });
})
.declareOutputFields(["a"]);

Turns out I had two problems. First, my topology wasn't executing because one of my bolts (not shown) failed to set .declareOutputFields().
Second, I needed to delay the emits from the spout until the supervisor asks for the next tuple. I did that by buffering any incoming messages until the supervisor calls the spout function:
module.exports = (function() {
    var storm = require('node-storm');
    var wsEmitter = require('./wsEmitter.js')();
    wsEmitter.init();

    var queue = [];
    var queueEmpty = true;

    wsEmitter.on('thing', function(data) {
        var trade = JSON.parse(data);
        trade.timeReported = new Date().valueOf();
        queue.push(trade);
        queueEmpty = false;
    });

    return storm.spout(function(sync) {
        var self = this;
        setTimeout(function() {
            if (!queueEmpty) {
                self.emit([queue.shift()]);
                queueEmpty = (queue.length === 0);
            }
            sync();
        }, 100);
    })
    .declareOutputFields(['trade']);
})();

Related

Node kafka consumer to process messages in sequence

I have a kafka topic that I want to consume with a node app. The node app must process the messages from the topic in sequence, one by one, not many at the same time.
I tried this kind of code, but it is not doing what I want. When there are messages waiting in the topic and this code is started, the 'message' event fires immediately for all of them. The first message gets the mutex lock first, but the rest of the messages are processed in random order.
var mutex = require('node-mutex')();
var crypto = require('crypto');
var mutexToken = crypto.randomBytes(64).toString('hex');

var kafka = require('kafka-node');
var Consumer = kafka.Consumer;
var client = new kafka.Client('localhost:2181');
var consumer = new Consumer(
    client,
    [
        { topic: 'my_topic' }
    ]
);

consumer.on('message', function(message) {
    console.log("new message");
    mutex
        .lock(mutexToken)
        .then(function(unlock) {
            console.log(message);
            unlock();
        });
});
Is it possible to consume the messages one by one, synchronously? Maybe with some other library?
I believe you can control the message offset directly by explicitly disabling the autoCommit feature.
Here is the link to the consumer documentation:
https://www.npmjs.com/package/kafka-node#highlevelconsumer
Here is a link to an example with autoCommit set off:
https://github.com/SOHU-Co/kafka-node/blob/master/example/consumer.js
https://github.com/SOHU-Co/kafka-node/blob/master/example/offset.js
I have never needed to disable the autoCommit feature, so I can't speak to the implementation.
From the test code it looks like it should be something along these lines:
var Offset = kafka.Offset;
var offset = new Offset(client);
var topics = [ { topic: EXISTS_TOPIC_2 } ];
var options = { autoCommit: false, groupId: '_groupId_1_test' };
var consumer = new Consumer(client, topics, options);
var count = 0;

consumer.on('error', noop);
consumer.on('offsetOutOfRange', function (topic) {
    offsetOutOfRange(topic, this);
});
consumer.on('message', function (message) {
    message.topic.should.equal(EXISTS_TOPIC_2);
    message.value.should.equal('hello kafka');
    message.partition.should.equal(0);
    offset.commit('_groupId_1_test', [message], function (err) {
        if (count++ === 0) done(err);
    });
});
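Adapted for ordinary (non-test) use, that might look roughly like the sketch below; the topic name and group id are placeholders for your own values, and I haven't run this against a live broker:
var kafka = require('kafka-node');
var client = new kafka.Client('localhost:2181');
var offset = new kafka.Offset(client);
var consumer = new kafka.Consumer(
    client,
    [ { topic: 'my_topic' } ],                  // placeholder topic
    { autoCommit: false, groupId: 'my_group' }  // placeholder group id
);

consumer.on('message', function (message) {
    // process the message here, then commit its offset explicitly
    offset.commit('my_group', [message], function (err) {
        if (err) console.error('offset commit failed', err);
    });
});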
OK, I looked at the API a little more and thought this might be an angle worth investigating for you:
Consumer.prototype.pauseTopics = function (topics) {
    if (!this.pausedPayloads) this.pausedPayloads = [];
    pauseOrResume(this.payloads, this.pausedPayloads, topics);
};

Consumer.prototype.resumeTopics = function (topics) {
    if (!this.pausedPayloads) this.pausedPayloads = [];
    var reFetch = !this.payloads.length;
    pauseOrResume(this.pausedPayloads, this.payloads, topics);
    reFetch = reFetch && this.payloads.length;
    if (reFetch) this.fetch();
};
from the documentation:
pause()
Pause the consumer. Calling pause does not automatically stop messages from being emitted. This is because pause just stops the kafka consumer fetch loop. Each iteration of the fetch loop can obtain a batch of messages (limited by fetchMaxBytes).
So if you only fetch one message (perhaps your bytes are small enough that a max fetch is only 1), then pause will stop the next fetch from happening. But if you fetched multiple messages, pause will not prevent more than one message being emitted.
I think to be 100% certain you would need to write the logic to handle the messages synchronously. Perhaps emit messages into a queue, and process off of the queue?
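A rough sketch of that idea, assuming kafka-node's Consumer pause()/resume() methods; handleMessage stands in for your own async handler and is not a real API:
var kafka = require('kafka-node');
var client = new kafka.Client('localhost:2181');
var consumer = new kafka.Consumer(client, [ { topic: 'my_topic' } ], { autoCommit: false });

var queue = [];
var busy = false;

function processNext() {
    if (busy || queue.length === 0) return;
    busy = true;
    var message = queue.shift();
    handleMessage(message, function () {            // handleMessage: your own async processing (placeholder)
        busy = false;
        if (queue.length === 0) consumer.resume();  // buffer drained, allow more fetches
        processNext();
    });
}

consumer.on('message', function (message) {
    queue.push(message);
    consumer.pause();   // stop further fetches while the buffer drains
    processNext();
});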

ExpressJS and matching delayed responses with original requests

I am looking for a design pattern that can match a stream of answers with the original request and response objects.
Suppose I receive a web request for all dog pictures. I want to submit that request to a message queue so that a worker process can eventually handle it. When a worker machine grabs the dog picture request, it performs the work and submits the response to an answer queue which is being monitored by Express. As I process the incoming queue, I want to match up the dog picture response with the original request and response objects so I can return the dog list or process it further.
Two solutions occur to me, but each seems inelegant. I could keep a global reference to the original context, find it, then delete it from the global list.
Or I could create a subscription to the response queue and look for my answer among all the answers. This would work, but it is brutally inefficient and its complexity grows quadratically (10x10, 100x100, 1000x1000).
var express = require('express');
var app = express();

app.get('/doglist.txt', function(req, res) {
    putReqIntoQueue(req, res, "dogs");
});

var theRequests = {};
var i = 0;
var giveUpSecs = 60;

var putReqIntoQueue = function(req, res, payload) {
    var index = 'index_' + i;
    i++;
    var obj = {req: req, res: res, payload: payload, index: index};
    theRequests[index] = obj;
    // each request gets a timeout so the client is not left hanging forever
    var timeoutId = setTimeout(function() {
        theRequests[index].res.send('timeout error');
        delete theRequests[index];
    }, giveUpSecs * 1000);
    // insertIntoQueue(index,payload,timeoutId)
};

var onNewQueueResponse = function(index, payload, answer, timeoutId) {
    clearTimeout(timeoutId);
    if (index in theRequests) {
        var obj = theRequests[index];
        obj.res.send(payload);
        delete theRequests[index];
    } else {
        // must have already timed out
    }
};

// Queue("onNewMessage",onNewQueueResponse)

app.listen(3000);
console.log('Listening on port 3000');
This answer assumes some kind of queue that accepts work (insertIntoQueue) and then returns data when it is done through an "onNewMessage" event. It times out after 60 seconds.
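For local testing, a minimal in-process stand-in for that queue might look like the sketch below; the EventEmitter-based fakeQueue and its pretend worker are assumptions, purely to exercise the matching logic, and a real deployment would use an external broker:
var EventEmitter = require('events').EventEmitter;
var fakeQueue = new EventEmitter();

// stands in for insertIntoQueue(index, payload, timeoutId) above
var insertIntoQueue = function(index, payload, timeoutId) {
    fakeQueue.emit('work', { index: index, payload: payload, timeoutId: timeoutId });
};

// a pretend worker: waits a bit, then answers
fakeQueue.on('work', function(job) {
    setTimeout(function() {
        // onNewQueueResponse responds with its second argument,
        // so pass the worker's result there
        fakeQueue.emit('done', job.index, 'list of dogs...', 'list of dogs...', job.timeoutId);
    }, 500);
});

// stands in for Queue("onNewMessage", onNewQueueResponse)
fakeQueue.on('done', onNewQueueResponse);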

Balancing slow I/O in a fast read stream

In node.js I have a read stream that I wish to reformat and write to a database. As the read stream is fast and the write is slow, the node.js queue could be overwhelmed as the queue of writes builds up (assume the stream is GBs of data). How do I force the read to wait for the write part of the code so this does not happen, without blocking?
var request = http.get({
    host: 'api.geonames.org',
    port: 80,
    path: '/children?' + qs.stringify({
        geonameId: geonameId,
        username: "demo"
    })
}).on('response', function(response) {
    response.setEncoding('utf8');
    var xml = new XmlStream(response, 'utf8');
    xml.on('endElement: geoname ', function(input) {
        console.log('geoname');
        var output = new Object();
        output.Name = input.name;
        output.lat = input.lat;
        output.lng = input.lng;
        output._key = input.geonameId;
        data.db.document.create(output, data.doc, function(callback) {
            // this is really slow.
        });
        // i do not want to return from here and receive more data
        // until the 'create' above has completed
    });
});
I just ran into this problem last night, and in my hackathon-induced, sleep-deprived state, here is how I solved it:
I would increment a counter whenever I sent a job out to be processed, and decremented the counter when the operation completed. To keep the outbound traffic from overwhelming the other service, I would pause the stream when there was a certain number of pending outbound requests. The code is very similar to the following.
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');

var instream = fs.createReadStream('./combined.csv');
var outstream = new stream;
var inProcess = 0;
var paused = false;
var rl = readline.createInterface(instream, outstream);

rl.on('line', function(line) {
    inProcess++;
    if (inProcess > 100) {
        console.log('pausing input to clear queue');
        rl.pause();
        paused = true;
    }
    someService.doSomethingSlow(line, function(err) {
        if (err) throw err;
        inProcess--;
        if (paused && inProcess < 10) {
            console.log('resuming stream');
            paused = false;
            rl.resume();
        }
    });
});

// readline signals the end of input with 'close'
rl.on('close', function() {
    console.log('finished reading input');
});
Not the most elegant solution, but it worked and allowed me to process the million+ lines without running out of memory or throttling the other service.
My solution simply extends an empty stream.Writable and is fundamentally identical to #Timothy's, but uses events and doesn't rely on Streams1 .pause() and .resume() (which didn't seem to be having any effect on my data pipeline, anyway).
var stream = require("stream");
var liveRequests = 0;
var maxLiveRequests = 100;
var streamPaused = false;
var requestClient = new stream.Writable();
function requestCompleted(){
liveRequests--;
if(streamPaused && liveRequests < maxLiveRequests){
streamPaused = false;
requestClient.emit("resumeStream");
}
}
requestClient._write = function (data, enc, next){
makeRequest(data, requestCompleted);
liveRequests++;
if(liveRequests >= maxLiveRequests){
streamPaused = true;
requestClient.once("resumeStream", function resume(){
next();
});
}
else {
next();
}
};
A counter, liveRequests, keeps track of the number of concurrent requests; it is incremented whenever makeRequest() is called and decremented when it completes (i.e., when requestCompleted() is called). If a request has just been made and liveRequests exceeds maxLiveRequests, we pause the stream with streamPaused. If a request completes, the stream is paused, and liveRequests is now less than maxLiveRequests, we can resume the stream. Since subsequent data items are read by _write() when its next() callback is called, we can simply defer the latter with an event listener on our custom "resumeStream" event, which mimics pausing/resuming.
Now, simply readStream.pipe(requestClient).
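A small end-to-end usage sketch; the CSV file and the makeRequest stub are assumptions, and any Readable source plus any async operation would do:
var fs = require("fs");

// stub for the real request: calls back after a short delay
function makeRequest(chunk, done) {
    setTimeout(function() {
        console.log("processed", chunk.length, "bytes");
        done();
    }, 50);
}

fs.createReadStream("./combined.csv").pipe(requestClient);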
Edit: I abstracted this solution, along with automatic batching of input data, in a package.

What should I be using? Socket.io rooms or Redis pub-sub?

Pretty simple question. I am building a realtime game using nodejs as my backend and I am wondering if there is any information available on which one is more reliable and which one is more efficient?
I am heavily using both Redis and Socket.io throughout my code. So I want to know whether I should be utilizing Socket.io's Rooms or I would be better off using redis' pub-sub ?
Update:
Just realized there is a very important reason why you may want to use redis pub/sub over socket.io rooms. With Socket.io rooms, when you publish to listeners, the (browser) clients receive the message; with redis it is actually the (on-server) redis clients who receive messages. For this reason, if you want to inform all (server) clients of information specific to each client, and maybe do some processing before passing it on to browser clients, you are better off using redis. Using redis you can just fire off an event to generate each user's individual data, whereas with socket.io you have to generate all the users' unique data at once, then loop through them and send each their individual data, which almost defeats the purpose of rooms, at least for me.
Unfortunately for my purposes I am stuck with redis for now.
Update 2: Ended up developing a plugin to use only 2 redis connections but still allow for individual client processing, see answer below....
Redis pub/sub is great in case all clients have direct access to redis. If you have multiple node servers, one can push a message to the others.
But if you also have clients in the browser, you need something else to push data from a server to a client, and in this case, socket.io is great.
Now, if you use socket.io with the Redis store, socket.io will use Redis pub/sub under the hood to propagate messages between servers, and servers will propagate messages to clients.
So using socket.io rooms with socket.io configured with the Redis store is probably the simplest for you.
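Depending on your socket.io version, wiring that up might look something like the sketch below; this assumes the socket.io-redis adapter (socket.io 1.x) and a local Redis on the default port, so adjust for your own setup:
var io = require('socket.io')(3000);
var redisAdapter = require('socket.io-redis');

// every server sharing this Redis instance will relay room broadcasts to the others
io.adapter(redisAdapter({ host: 'localhost', port: 6379 }));

io.on('connection', function(socket) {
    socket.join('game-42');                      // hypothetical room name
    io.to('game-42').emit('message', 'hello');   // reaches clients on every server
});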
I ended up writing a node plugin to allow for many pub-sub clients but only require 2 redis connections instead of a new one on every single socketio connection, it should work in general, figured someone else may find use for it.
This code assumes you have socket.io running and set up. Basically, any number of socket.io clients can connect and it will always still only use 2 redis connections, but all clients can subscribe to their own channels. In this example, all clients get a message 'sweet message!' after 10 seconds.
Example with socket.io (utilizing redis pub-sub):
var RPubSubFactory = require('rpss.js');

var redOne = redis.createClient(port, host),
    redTwo = redis.createClient(port, host);

var pSCFactory = new RPubSubFactory(redOne);

io.sockets.on('connection', function(socket) {
    var cps = pSCFactory.createClient();
    cps.onMessage(function(channel, message) {
        socket.emit('message', message);
    });
    socket.on('disconnect', function() {
        // Don't actually need to unsub, because end() will clean up all subs,
        // but if you need to sometime during the connection lifetime, you can.
        cps.unsubscribe('cool_channel');
        cps.end();
    });
    cps.subscribe('cool_channel');
});

setTimeout(function() {
    redTwo.publish('cool_channel', 'sweet message!');
}, 10000);
Actual plugin code:
var RPubSubFactory = function() {

    var len, indx, tarr;

    var dbcom = false,
        rPubSubIdCounter = 1,
        clientLookup = {},
        globalSubscriptions = {};

    // public
    this.createClient = function() {
        return new RPubSupClient();
    };

    // private
    var constructor = function(tdbcom) {
        dbcom = tdbcom;
        dbcom.on("message", incommingMessage);
    };

    var incommingMessage = function(rawchannel, strMessage) {
        len = globalSubscriptions[rawchannel].length;
        for (var i = 0; i < len; i++) {
            //console.log(globalSubscriptions[rawchannel][i]+' incomming on channel '+rawchannel);
            clientLookup[globalSubscriptions[rawchannel][i]]._incommingMessage(rawchannel, strMessage);
        }
    };

    // class
    var RPubSupClient = function() {

        var id = -1,
            localSubscriptions = [];

        this.id = -1;
        this._incommingMessage = function() {};

        this.subscribe = function(channel) {
            //console.log('client '+id+' subscribing to '+channel);
            if (!(channel in globalSubscriptions)) {
                globalSubscriptions[channel] = [id];
                dbcom.subscribe(channel);
            }
            else if (globalSubscriptions[channel].indexOf(id) == -1) {
                globalSubscriptions[channel].push(id);
            }
            if (localSubscriptions.indexOf(channel) == -1) {
                localSubscriptions.push(channel);
            }
        };

        this.unsubscribe = function(channel) {
            //console.log('client '+id+' unsubscribing to '+channel);
            if (channel in globalSubscriptions) {
                indx = globalSubscriptions[channel].indexOf(id);
                if (indx != -1) {
                    globalSubscriptions[channel].splice(indx, 1);
                    if (globalSubscriptions[channel].length == 0) {
                        delete globalSubscriptions[channel];
                        dbcom.unsubscribe(channel);
                    }
                }
            }
            indx = localSubscriptions.indexOf(channel);
            if (indx != -1) {
                localSubscriptions.splice(indx, 1);
            }
        };

        this.onMessage = function(msgFn) {
            this._incommingMessage = msgFn;
        };

        this.end = function() {
            //console.log('end client id = '+id+' closing subscriptions='+localSubscriptions.join(','));
            tarr = localSubscriptions.slice(0);
            len = tarr.length;
            for (var i = 0; i < len; i++) {
                this.unsubscribe(tarr[i]);
            }
            localSubscriptions = [];
            delete clientLookup[id];
        };

        var constructor = function() {
            this.id = id = rPubSubIdCounter++;
            clientLookup[id] = this;
            //console.log('new client id = '+id);
        };

        constructor.apply(this, arguments);
    };

    constructor.apply(this, arguments);
};
module.exports = RPubSubFactory;
I mucked around and tried to improve the efficiency as much as I could, but after doing some different speed tests, I concluded this was the fastest I could get it.
For up-to-date version: https://github.com/Jezternz/node-redis-pubsub

flapjax get last element from eventstream

I am trying to implement a small chat service using flapjax. I use an eventStream to get all the clients that connect to the server, and when broadcasting a message (in the 'message' handler) I map over this eventStream with a function that emits the message to the current client.
// Event stream yielding received clients
var clientReceiverE = receiverE();
// Event stream collecting all the clients
var clientsE = clientReceiverE.collectE([], function (client, clients) { return clients.concat([client]); });

socket.on('connection', function(client) {
    clientReceiverE.sendEvent(client);
    for (i = 0; i < chatMessages.length; i++) {
        client.emit('message', chatMessages[i]);
    }
    client.on('message', function(message) {
        chatMessages.push(message);
        //for (i = 0; i < clients.length; i++) {
        //    clients[i].emit('message', message);
        //}
        mapE(clientReceiverE, function(client) { console.log(client); client.emit('message', message); return client; });
    });
    client.on('nickname', function(name) {
    });
});
The registering of the clients on the event stream succeeds with this code, but the mapE doesn't result in a loop over all the clients. Does anybody know what is wrong here?
If you still haven't guessed it :) I think it's because mapE doesn't produce any action by itself; mapE only creates and returns another EventStream, which behaves like the given source but with its values modified by the given function.
You should not be using mapE like that. In your code you are attempting to recreate the mapE event bindings with each client.on('message', ...).
This problem is solved using a receiverE. This function is used to translate external event streams into flapjax EventStreams.
// Event stream yielding received clients
var clientReceiverE = receiverE();
// Event stream collecting all the clients
var clientsE = clientReceiverE.collectE([], function (client, clients) { return clients.concat([client]); });
var clientsB = clientsE.startsWith(undefined); // Turn the event stream into a behaviour (event + value)

var messagesE = receiverE();
messagesE.mapE(function(messagePacket) {
    var clients = clientsB.valueNow(); // Grab current value of client list behaviour
    if (clients == undefined) {
        return;
    }
    var from = messagePacket.client;
    for (var index in clients) {
        clients[index].emit('message', messagePacket.message);
        console.log(messagePacket.message);
    }
});

socket.on('connection', function(client) {
    clientReceiverE.sendEvent(client);
    client.on('message', function(message) {
        messagesE.sendEvent({ client: client, message: message });
    });
});
The difference is this: the flapjax tree is isolated from the WebSocket event code, and there is no shared state between them.

Resources