EventEmitter in the middle of a chain of Promises - node.js

I am doing something that involves running a sequence of child_process.spawn() calls in order (to do some setup, then run the actual meaty command that the caller is interested in, then do some cleanup).
Something like:
doAllTheThings()
  .then(function(exitStatus){
    // all the things were done
    // and we've returned the exitStatus of
    // a command in the middle of a chain
  });
Where doAllTheThings() is something like:
function doAllTheThings() {
  return runSetupCommand()
    .then(function(){
      return runInterestingCommand();
    })
    .then(function(exitStatus){
      return runTearDownCommand(exitStatus); // pass exitStatus along to return to caller
    });
}
Internally I'm using child_process.spawn(), which returns an EventEmitter and I'm effectively returning the result of the close event from runInterestingCommand() back to the caller.
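For context, runInterestingCommand() is roughly a wrapper like the following sketch: spawn a child and resolve with the exit status delivered by its 'close' event (the command name and arguments here are placeholders):

var Promise = require('bluebird');
var spawn = require('child_process').spawn;

// Hypothetical sketch of the wrapper described above.
function runInterestingCommand() {
  return new Promise(function(resolve, reject) {
    var child = spawn('interesting-command', ['--flag']); // placeholder command
    child.on('error', reject);                  // e.g. the command could not be spawned
    child.on('close', function(exitStatus) {    // stdio closed, child exited
      resolve(exitStatus);
    });
  });
}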
Now I need to also send data events from stdout and stderr to the caller, which are also from EventEmitters. Is there a way to make this work with (Bluebird) Promises, or are they just getting in the way of EventEmitters that emit more than one event?
Ideally I'd like to be able to write:
doAllTheThings()
  .on('stdout', function(data){
    // process a chunk of received stdout data
  })
  .on('stderr', function(data){
    // process a chunk of received stderr data
  })
  .then(function(exitStatus){
    // all the things were done
    // and we've returned the exitStatus of
    // a command in the middle of a chain
  });
The only way I can think of to make my program work is to rewrite it to remove the promise chain and just use a raw EventEmitter inside something that wraps the setup/teardown, something like:
withTemporaryState(function(done){
  var cmd = runInterestingCommand();
  cmd.on('stdout', function(data){
    // process a chunk of received stdout data
  });
  cmd.on('stderr', function(data){
    // process a chunk of received stderr data
  });
  cmd.on('close', function(exitStatus){
    // process the exitStatus
    done();
  });
});
But then since EventEmitters are so common throughout Node.js, I can't help but think I should be able to make them work in Promise chains. Any clues?
Actually, one of the reasons I want to keep using Bluebird is that I want to use its cancellation features to allow the running command to be cancelled from the outside.

There are two approaches: one provides the syntax you originally asked for, the other takes delegates.
function doAllTheThings(){
  var com = runInterestingCommand();
  var p = new Promise(function(resolve, reject){
    com.on("close", resolve);
    com.on("error", reject);
  });
  p.on = function(){ com.on.apply(com, arguments); return p; };
  return p;
}
Which would let you use your desired syntax:
doAllTheThings()
  .on('stdout', function(data){
    // process a chunk of received stdout data
  })
  .on('stderr', function(data){
    // process a chunk of received stderr data
  })
  .then(function(exitStatus){
    // all the things were done
    // and we've returned the exitStatus of
    // a command in the middle of a chain
  });
However, IMO this is somewhat misleading and it might be desirable to pass the delegates in:
function doAllTheThings(onData, onErr){
  var com = runInterestingCommand();
  var p = new Promise(function(resolve, reject){
    com.on("close", resolve);
    com.on("error", reject);
  });
  com.on("stdout", onData).on("stderr", onErr);
  return p;
}
Which would let you do:
doAllTheThings(function(data){
  // process a chunk of received stdout data
}, function(data){
  // process a chunk of received stderr data
})
.then(function(exitStatus){
  // all the things were done
  // and we've returned the exitStatus of
  // a command in the middle of a chain
});
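If you also want the cancellation behaviour mentioned in the question, a minimal sketch along these lines could tie cancelling the promise to killing the child. This assumes Bluebird 3 (where cancellation is opt-in) and that runInterestingCommand() returns the ChildProcess itself, which has a kill() method:

var Promise = require('bluebird');
Promise.config({ cancellation: true }); // cancellation must be enabled explicitly in Bluebird 3

function doAllTheThings() {
  var com = runInterestingCommand();
  return new Promise(function(resolve, reject, onCancel) {
    com.on('close', resolve);
    com.on('error', reject);
    onCancel(function() {
      com.kill(); // stop the underlying child process when the promise is cancelled
    });
  });
}

// usage sketch:
// var p = doAllTheThings();
// ... later, from the outside ...
// p.cancel();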

Related

Can an event-based read function ever run out of order?

Given a situation where I use the nodejs readline library to iterate over each line in the STDIN stream, do some processing on it and write it back out to STDOUT as in the following example:
var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false
});

function my_function(line) {
  var output = ...(line); // '...' stands for the per-line processing
  process.stdout.write(output);
}
rl.on('line', my_function);
I'm concerned that the processing I'm doing will take very different amounts of time depending on the line content, so some lines will return very quickly while others take some time to sort out. Is it possible that my_function() will ever run out of order and hence cause the output stream to be scrambled? Should I be looking into using a synchronous loop of some kind instead of this asynchronous event handler?
The JavaScript execution itself is single-threaded, so as long as you're only performing synchronous operations inside the event handler, there is no problem.
If you are performing asynchronous operations inside the event handler, then it is possible that another 'line' event could be emitted before your asynchronous operation(s) are complete. In that case, you would need to rl.pause() first and then rl.resume() once you are finished with your asynchronous operations. However, this isn't foolproof since 'line' events could still be emitted after a rl.pause() if the current chunk of data read from the input stream had multiple line breaks.
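For illustration, a minimal sketch of that pause/resume pattern (doSomethingAsync is a hypothetical placeholder for your per-line work):

rl.on('line', function(line) {
  rl.pause();                                    // hold further 'line' events while we work
  doSomethingAsync(line, function(err, output) { // hypothetical async operation
    if (err) throw err;
    process.stdout.write(output);
    rl.resume();                                 // let the next line through
  });
});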
So if you are performing asynchronous operations inside the event handler, you are probably better off just reading from the stream yourself so that you have more control over the parsing behavior. This is actually pretty easy to do, for example:
function parseStream(stream, callback) {
  // Assuming all stream data is text and not binary ...
  var buffer = '';
  var RE_EOL = /\r?\n/;

  stream.on('data', function(data) {
    buffer += data;
    processBuffer();
  });
  stream.on('end', callback);
  stream.on('error', callback);

  function processBuffer() {
    var match = RE_EOL.exec(buffer);
    if (match) {
      // Found a line ending
      var line = buffer.slice(0, match.index);
      buffer = buffer.slice(match.index + match[0].length);
      stream.pause();
      callback(null, line, processBuffer);
    } else {
      stream.resume();
    }
  }
}
// ...
parseStream(process.stdin, function(err, line, done) {
  if (err) throw err;
  if (line === undefined) {
    // No more data will be available (stream ended)
    console.log('(Stream ended!)');
    return;
  }
  // Do something with `line`
  console.log(line);
  // Call `done()` whenever your async operation(s) are all finished
  done();
});

EventEmitter memory leak detected: Proper way to pass CSV data to multiple modules?

I am dipping my toe into using different npm modules my own way, whereas before I just executed already-created gulpfiles. The npm module penthouse loads a webpage and determines the above-the-fold CSS for that page. I am trying to take that module and use it with a site crawler so I can get the above-the-fold CSS for all pages, and store that CSS in a table.
So essentially I am:
Crawling a site to get all the URLs
Capturing the page id from each URL
Storing pages and their ids in a CSV
Loading the CSV and passing each URL to penthouse
Taking the penthouse output and storing it in a table
So I am fine up until the last two steps. When I am reading the CSV, I get the error "possible EventEmitter memory leak detected. 11 exit listeners added. Use emitter.setMaxListeners() to increase limit."
The stack trace points to line 134 of penthouse's source. After reading about the error, it makes sense, because I see a bunch of event listeners being added there, but I don't see penthouse ever really executing and closing the event listeners.
It works just fine standalone as expected (running penthouse against a single page and then exiting). But when I execute the code below to try to loop through all URLs in a CSV, it spits out the memory leak error twice and just hangs. None of my console.log statements in the following script are executed.
However, I added a console.log to the end of the penthouse index.js file, and it is executed multiple times (where it adds event listeners), but it never times out or exits.
So it's clear I am not integrating this properly, but I'm not sure how to proceed. What would be the best way to force it to read one line in the CSV at a time, process the URL, then take the output and store it in the DB before moving on to the next line?
const fs = require('fs');
var csv = require('fast-csv');
var penthouse = require('penthouse'),
    path = require('path');

var readUrlCsv = function() {
  var stream = fs.createReadStream("/home/vagrant/urls.csv");
  var csvStream = csv()
    //returns single line from CSV
    .on("data", function(data) {
      // data[0]: table id, data[1]: page type, data[2]: url
      penthouse({
        url : data[2],
        css : './dist/styles/main.css'
      }, function(err, criticalCss) {
        if (err) {
          console.log(err);
        }
        console.log('do we ever get here?'); //answer is no
        if (data[1] === 'post') {
          wp.posts().id( data[0] ).post({
            inline_css: criticalCss
          }).then(function( response ) {
            console.log('saved to db');
          });
        } else {
          wp.pages().id( data[0] ).page({
            inline_css: criticalCss
          }).then(function( response ) {
            console.log('saved to db');
          });
        }
      });
    })
    .on("end", function(){
      console.log("done");
    });
  return stream.pipe(csvStream);
};
UPDATE
I changed my method to look like the one below so it processes all rows first, but it still throws the same error. It writes "done" to the console, and then immediately spits out the memory warning twice.
var readUrlCsv = function() {
  var stream = fs.createReadStream("/home/vagrant/urls.csv");
  var urls = [];
  var csvStream = csv()
    .on("data", function(data) {
      // data[0]: table id, data[1]: page type, data[2]: url
      urls.push(data);
    })
    .on("end", function(){
      console.log("done");
      buildCriticalCss(urls);
    });
  return stream.pipe(csvStream);
};
var buildCriticalCss = function(urls) {
  //console.log(urls);
  urls.forEach(function(data, idx) {
    //console.log(data);
    penthouse({
      url : data[2],
      css : './dist/styles/main.css',
      // OPTIONAL params
      width : 1300,                  // viewport width
      height : 900,                  // viewport height
      timeout: 30000,                // ms; abort critical css generation after this timeout
      strict: false,                 // set to true to throw on css errors (will run faster if no errors)
      maxEmbeddedBase64Length: 1000  // characters; strip out inline base64 encoded resources larger than this
    }, function(err, criticalCss) {
      if (err) {
        console.log(err);
      }
      console.log('do we ever finish one?');
      if (data[1] === 'post') {
        console.log('saving post ' + data[0]);
        wp.posts().id( data[0] ).post({
          inline_css: criticalCss
        }).then(function( response ) {
          console.log('saved post to db');
        });
      } else {
        console.log('saving page ' + data[0]);
        wp.pages().id( data[0] ).page({
          inline_css: criticalCss
        }).then(function( response ) {
          console.log('saved page to db');
        });
      }
    });
  });
};
Update 2
I took the simple approach of controlling the number of concurrent processes spawned.
var readUrlCsv = function() {
  var stream = fs.createReadStream("/home/vagrant/urls.csv");
  var urls = [];
  var csvStream = csv()
    .on("data", function(data) {
      // data[0]: table id, data[1]: page type, data[2]: url
      urls.push(data);
    })
    .on("end", function(){
      console.log("done");
      //console.log(urls);
      buildCriticalCss(urls);
    });
  return stream.pipe(csvStream);
};
function buildCriticalCss(data) {
  var row = data.shift();
  console.log(row);
  penthouse({
    url : row[2],
    css : './dist/styles/main.css',
    // OPTIONAL params
    width : 1300,                  // viewport width
    height : 900,                  // viewport height
    timeout: 30000,                // ms; abort critical css generation after this timeout
    strict: false,                 // set to true to throw on css errors (will run faster if no errors)
    maxEmbeddedBase64Length: 1000  // characters; strip out inline base64 encoded resources larger than this
  }, function(err, criticalCss) {
    if (err) {
      console.log(err);
    }
    // handle your criticalCSS
    console.log('finished');
    console.log(row[2]);
    // now start next job, if we have more urls
    if (data.length !== 0) {
      buildCriticalCss(data);
    }
  });
}
The error message you're seeing is a default warning printed to the console by Node's events library when more than the allowed number of event listeners are defined for an instance of EventEmitter. It does not indicate an actual memory leak. Rather, it is displayed to make sure you're aware of the possibility of a leak.
You can see this by checking the events.EventEmitter source code at lines 20 and 244.
To stop EventEmitter from displaying this message, and since penthouse does not expose its specific EventEmitter, you'll need to set the default allowed number of event emitter listeners to something larger than its default value of 10 using:
var EventEmitter = require('events').EventEmitter;
EventEmitter.defaultMaxListeners = 20;
Note that according to Node's documentation for EventEmitter.defaultMaxListeners, this will change the maximum number of listeners for all instances of EventEmitter, including those that have already been defined previous to the change.
Or you could simply ignore the message.
Further to the hanging of your code, I'd advise gathering all the results from the parsing of your CSV into an array, and then processing the array contents separately from the parsing process.
This would accomplish two things: it would allow you to
be assured the entire CSV file was valid before you started processing, and
instrument debugging messages while processing each element, giving you deeper insight into how each element of the array was processed.
UPDATE
As noted below, depending on how many URLs you're processing, you're probably overwhelming Node's ability to handle all of your requests in parallel.
One easy way to proceed would be to use eventing to marshal your processing so your URLs are processed sequentially, as in:
var assert = require('assert'),
    events = require('events'),
    fs = require('fs'),
    csv = require('fast-csv'),
    penthouse = require('penthouse');

var emitter = new events.EventEmitter();

/** Container for URL records read from CSV file.
 *
 * @type {Array}
 */
var urls = [];

/** Reads urls from file and triggers processing
 *
 * @emits processUrl
 */
var readUrlCsv = function() {
  var stream = fs.createReadStream("/home/vagrant/urls.csv");
  stream.on('error', function(e){ // always handle errors!!
    console.error('failed to createReadStream: %s', e);
    process.exit(-1);
  });
  var csvStream = csv()
    .on("data", function(data) {
      // data[0]: table id, data[1]: page type, data[2]: url
      urls.push(data);
    })
    .on("end", function(){
      console.log("done reading csv");
      //console.log(urls);
      emitter.emit('processUrl'); // start processing URLs
    })
    .on('error', function(e){
      console.error('failed to parse CSV: %s', e);
      process.exit(-1);
    });
  // no return required since we don't do anything with the result
  stream.pipe(csvStream);
};

/** Event handler to process a single URL
 *
 * @emits processUrl
 */
var onProcessUrl = function(){
  // always check your assumptions
  assert(Array.isArray(urls), 'urls must be an array');
  var urlRecord = urls.shift();
  if (urlRecord) {
    assert(Array.isArray(urlRecord), 'urlRecord must be an array');
    assert(urlRecord.length > 2, 'urlRecord must have at least three elements');
    penthouse(
      {
        // ...
      },
      function(e, criticalCss){
        if (e) {
          console.error('failed to process record %s: %s', urlRecord, e);
          return; // IMPORTANT! do not drop through to rest of func!
        }
        // do what you need with the result here
        if (urls.length === 0) { // ok, we're done
          console.log('completed processing URLs');
          return;
        }
        emitter.emit('processUrl');
      }
    );
  }
};

/**
 * processUrl event - triggers processing of next URL
 *
 * @event processUrl
 */
emitter.on('processUrl', onProcessUrl); // assign handler

// start everything going...
readUrlCsv();
The benefit of using events here rather than your solution is the lack of recursion, which can easily overwhelm your stack.
Hint: You can use events to handle all program flow issues normally addressed by Promises or modules like async.
And since events are at the very heart of Node (the "event loop"), it's really the best, most efficient way to solve such problems.
It's both elegant and "The Node Way"!
Here is a gist that illustrates the technique, without relying on streams or penthouse, the output of which is:
url: url1
RESULT: RESULT FOR url1
url: url2
RESULT: RESULT FOR url2
url: url3
RESULT: RESULT FOR url3
completed processing URLs
Besides using console.logs which usually is enough, you can also use the built in debugger: https://nodejs.org/api/debugger.html
Another thing you can do is go into the node_modules/penthouse directory and add your console.logs or debugger statement into the code for that module. That way you can debug your program there rather than the module just being a black box.
Also make sure there isn't some kind of race condition where, for example, the CSV isn't always fully written before you try to read it in.
I think that the memory leak issue is probably a red herring as far as making your code function.
From your comment it sounds like you want to do something like the following with async.mapSeries: http://promise-nuggets.github.io/articles/15-map-in-series.html You could also use promises as it shows, or, once you have promises set up, use async/await with a regular for loop after compiling with Babel. In the long run I recommend doing that sort of thing with async/await and Babel, but that might be overkill just to get this working.
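For reference, a rough sketch of the async.mapSeries approach mentioned above (assuming the async module and the same urls array built from the CSV; the DB call is omitted):

var async = require('async');
var penthouse = require('penthouse');

// Sketch: run penthouse against one URL at a time, in CSV order.
function buildCriticalCss(urls, done) {
  async.mapSeries(urls, function(row, next) {
    penthouse({
      url: row[2],
      css: './dist/styles/main.css'
    }, function(err, criticalCss) {
      if (err) return next(err);
      // store criticalCss in the DB here, then continue with the next row
      next(null, criticalCss);
    });
  }, done); // done(err, allResults)
}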

Promise resolving to child stream stdout and rejecting child stream stderr

I'd like to build a promise that spawns a child process using require('child_process').spawn. The process streams its output to stdout and its errors to stderr.
I would like the promise to:
reject(child.stderr stream (or its data)) if child.stderr emits any data.
resolve(child.stdout stream) only if no error is emitted.
I'm doing this because I want to chain the promise to:
a then that processes the child.stdout stream (upload the stream to an S3 bucket).
a catch that can process the child.stderr stream, allowing me to properly handle errors.
Is it feasible to combine promises and process streams like this ?
I was thinking of working around stderr, but I'm unsure about what happens in between to stdout if a lot of data is coming into it and I don't process it fast enough.
As I see it, the issue is that you don't know whether you ever got data on stderr until the entire process is done as it could put data there at any time.
So, you have to wait for the entire process to be done before calling resolve() or reject(). And, if you then want the entire data to be sent to either one of those, you'd have to buffer them. You could call reject() as soon as you got data on stderr, but you aren't guaranteed to have all the data yet because it's a stream.
So, if you don't want to buffer, you're better off just letting the caller see the streams directly.
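For completeness, a sketch of that non-buffering shape (this is an assumption about the API surface, not part of the answer): return the streams alongside a promise for the exit code, so the caller can attach its own handlers:

const spawn = require('child_process').spawn;

// Sketch: expose the child's streams directly, plus a promise for completion.
function runStreaming(cmd, args) {
  const child = spawn(cmd, args);
  const done = new Promise(function(resolve, reject) {
    child.on('error', reject);   // failed to spawn
    child.on('close', resolve);  // resolves with the exit code
  });
  return { stdout: child.stdout, stderr: child.stderr, done: done };
}

// usage sketch:
// const run = runStreaming('ls', ['-lh', '/usr']);
// run.stdout.pipe(uploadStream);              // e.g. stream stdout straight to S3
// run.done.then(code => { /* finished */ });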
If you are OK with buffering the data, you can buffer it yourself like this:
Based on the spawn example in the node.js doc, you could add promise support to it like this:
const spawn = require('child_process').spawn;

function runIt(cmd, args) {
  return new Promise(function(resolve, reject) {
    const ls = spawn(cmd, args);
    // Edit thomas.g: My child process generates binary data so I use buffers instead, see my comments inside the code
    // Edit thomas.g: let stdoutData = new Buffer(0)
    let stdoutData = "";
    let stderrData = "";
    ls.stdout.on('data', (data) => {
      // Edit thomas.g: stdoutData = Buffer.concat([stdoutData, data]);
      stdoutData += data;
    });
    ls.stderr.on('data', (data) => {
      stderrData += data;
    });
    ls.on('close', (code) => {
      if (stderrData) {
        reject(stderrData);
      } else {
        resolve(stdoutData);
      }
    });
    ls.on('error', (err) => {
      reject(err);
    });
  });
}

//usage
runIt('ls', ['-lh', '/usr']).then(function(stdoutData) {
  // process stdout data here
}, function(err) {
  // process stderr data here or error object (if some other type of error)
});

In this code, why use a closure?

I don't get why a closure is being used in the code below:
function writeData(socket, data){
  var success = !socket.write(data);
  if(!success){
    (function(socket, data){
      socket.once('drain', function(){
        writeData(socket, data);
      });
    })(socket, data)
  }
}
And why use var success = !socket.write(data); instead of using the return value directly?
Maybe socket.write() doesn't return a boolean?
The IIFE is unnecessary; you can rewrite the code to this:
function writeData(socket, data){
  var success = !socket.write(data);
  if (!success) {
    socket.once('drain', function() {
      writeData(socket, data);
    });
  }
}
Or even this:
function writeData(socket, data){
  var success = !socket.write(data);
  if (!success) {
    socket.once('drain', writeData.bind(this, socket, data));
  }
}
According to the documentation for socket.write(), the method
Sends data on the socket. The second parameter specifies the encoding
in the case of a string--it defaults to UTF8 encoding.
Returns true if the entire data was flushed successfully to the kernel
buffer. Returns false if all or part of the data was queued in user
memory. 'drain' will be emitted when the buffer is again free.
The optional callback parameter will be executed when the data is
finally written out - this may not be immediately.
In the code, if the first socket.write() is not able to flush all the data in one go, the closure waits for the socket's 'drain' event, at which point it calls writeData again. This is an ingenious way of creating an asynchronous recursive function, which keeps getting called until success is true.
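For illustration, a hypothetical way writeData might be used (the server, port, and payload here are made up):

var net = require('net');

// Hypothetical usage: send a large payload per connection; writeData retries on 'drain'
// whenever socket.write() reports that the kernel buffer is full.
var bigPayload = new Array(1000000).join('x');

net.createServer(function(socket) {
  writeData(socket, bigPayload);
}).listen(8000);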

How to correctly calculate the number of bytes of a node.js stream that have been processed?

I have a stream I'm sending over the wire and takes a bit of time to fully send, so I want to display how far along it is on the fly. I know you can listen on the 'data' event for streams, but in newer versions of node, it also puts the stream into "flowing mode". I want to make sure i'm doing this correctly.
Currently I have the following stuff:
deploymentPackageStream.pause() // to prevent it from entering "flowing mode"
var bytesSent = 0
deploymentPackageStream.on('data', function(data) {
  bytesSent += data.length
  process.stdout.write('\r ')
  process.stdout.write('\r' + (bytesSent/1000) + 'kb sent')
})
deploymentPackageStream.resume()
// copy over the deployment package
execute(conn, 'cat > deploymentPackage.sh', deploymentPackageStream).wait()
This gives me the right bytesSent output, but the resulting package seems to be missing some data off the front. If I put the 'resume' line after executing the copy line (the last line), it doesn't copy anything. If I don't resume, it also doesn't copy anything. What's going on and how do I do this properly without disrupting the stream and without entering flowing mode (I want back pressure)?
I should mention, I'm still using node v0.10.x.
Alright, I made something that essentially is a passthrough, but calls a callback with data as it comes in:
var Readable = require('stream').Readable // needed for the subclassing below
var util = require('util')

// creates a stream that can view all the data in a stream and passes the data through
// parameters:
//   stream - the stream to peek at
//   callback - called when there's data sent from the passed stream
var StreamPeeker = exports.StreamPeeker = function(stream, callback) {
  Readable.call(this)
  this.stream = stream
  stream.on('readable', function() {
    var data = stream.read()
    if(data !== null) {
      if(!this.push(data)) stream.pause()
      callback(data)
    }
  }.bind(this))
  stream.on('end', function() {
    this.push(null)
  }.bind(this))
}
util.inherits(StreamPeeker, Readable)

StreamPeeker.prototype._read = function() {
  this.stream.resume()
}
If I understand streams properly, this should appropriately handle backpressure.
Using this, I can just count up data.length in the callback like this:
var peeker = new StreamPeeker(stream, function(data) {
  // use data.length
})
peeker.pipe(destination)
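An alternative sketch (my own, not from the answer above): the same byte counting can be done with a Transform stream, which also preserves backpressure because chunks only move when the downstream destination reads them:

var Transform = require('stream').Transform
var util = require('util')

// Count bytes as they pass through, then forward each chunk unchanged.
function ByteCounter(onTotal) {
  Transform.call(this)
  this.total = 0
  this.onTotal = onTotal
}
util.inherits(ByteCounter, Transform)

ByteCounter.prototype._transform = function(chunk, encoding, done) {
  this.total += chunk.length
  this.onTotal(this.total)
  this.push(chunk)
  done()
}

// usage:
// var counter = new ByteCounter(function(total) {
//   process.stdout.write('\r' + (total / 1000) + 'kb sent')
// })
// stream.pipe(counter).pipe(destination)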
