Consider the following scenario. I have two Node Transform streams:
Transform stream 1
function T1(options) {
if (! (this instanceof T1)) {
return new T1(options);
}
Transform.call(this, options);
}
util.inherits(T1, Transform);
T1.prototype._transform = function(chunk, encoding, done) {
console.log("### Transforming in t1");
this.push(chunk);
done();
};
T1.prototype._flush = function(done) {
console.log("### Done in t1");
done();
};
Transform stream 2
function T2(options) {
if (! (this instanceof T2)) {
return new T2(options);
}
Transform.call(this, options);
}
util.inherits(T2, Transform);
T2.prototype._transform = function(chunk, encoding, done) {
console.log("### Transforming in t2");
this.push(chunk);
done();
};
T2.prototype._flush = function(done) {
console.log("### Done in t2");
done();
};
And, I'm wanting to apply these transform streams before returning a response. I have a simple HTTP server, and on each request, I fetch a resource and would like these transformations to be applied to this fetched resource and then send the result of the second transformation to the original response:
var options = require('url').parse('http://localhost:1234/data.json');
options.method = 'GET';
http.createServer(function(req, res) {
var req = http.request(options, function(httpRes) {
var t1 = new T1({});
var t2 = new T2({});
httpRes
.pipe(t1)
.pipe(t2)
.on('finish', function() {
// Do other stuff in here before sending request back
t2.pipe(res, { end : true });
});
});
req.end();
}).listen(3001);
Ultimately, the finish event never gets called, and the request hangs and times out because the response is never resolved. I've noticed that if I just pipe t2 into res, it seems to work fine:
.pipe(t1)
.pipe(t2)
.pipe(res, { end : true });
But, this scenario doesn't seem feasible because I need to do some extra work before returning the response.
This happens because you need to let node know that the stream is being consumed somewhere, otherwise the last stream will just fill up the buffer and considering your data is longer than the highwaterMark option (usually 16) and then halt waiting for the data to be consumed.
There are three ways of consuming a stream in full:
piping to a readable stream (what you did in the second part of your question)
reading consecutive chunks by calling the read method of a stream
listening on "data" events (essentially stream.on("data", someFunc)).
The last option is the quickest, but will result in consuming the stream without looking at memory usage.
I'd also note that using the "finish" event might be a little misleading, since it is called when the last data is read, but not necessarily emitted. On a Transform stream, since it's a readable as well it's much better to use the "end" event.
Related
I have a data stream (via Node EventEmitter) emitting data in JSON format and would like to save the stream into Cassandra as it gets emitted. Is there an elegant way to implement this functionality?
The driver that i'm using is nodejs-dse-driver and the Cassandra version is 3.11.1. Please suggest if there are any recommended plugins that i can leverage to accomplish the above task.
This is a good use case for a Transform Stream.
If you have a true Readable stream then you can pipe any Transform stream into the Readable stream. I don't think an event emitter is a readable stream though, so you may need to change your original data fetching implementation.
See the NodeJS documentation for implementation details.
https://nodejs.org/api/stream.html#stream_new_stream_transform_options
Something like this depending on your version of NodeJS.
const myTransformStream = new Transform({
objectMode: true,
transform(row, encoding, callback) {
// insert into Cassandra code here
cassandra.execute(query, row, {prepare: true}, () => {
// after the execute is done, callback to process more
callback(null, row);
});
}
});
originalStream.pipe(myTransformStream);
You can read the data in chunks from your source and send it in parallel, for example (using the async library):
const limit = 10;
stream.on('readable', () => {
let r;
const rows = [];
async.whilst(function condition() {
while ((r = csv.read()) != null && rows.length < limit) {
rows.push(r);
}
return rows.length > 0;
}, function eachGroup(next) {
// we have a group of 10 rows or less to save
// we can do it in a batch
// or we can do it in parallel with async.each()
async.each(rows, (r, eachCallback) {
// Adapt the row to parameters
// For example: sample
const params = r.split(',);
client.execute(query, params, { prepare: true}, eachCallback);
}, next);
}, function groupFinished(err) {
if (err) {
// something happened when saving
// TODO: do something with err
return;
}
// This chunk of rows emitted by stream where saved
});
}).on('end', () => {
// no more data from source
});
I have Node App that collects vote submissions and stores them in Cassandra. The votes are stored as base64 encoded encrypted strings. The API has an endpoint called /export that should get all of these votes strings (possibly > 1 million), convert them to binary and append them one after the other in a votes.egd file. That file should then be zipped and sent to the client. My idea is to stream the rows from Cassandra, converting each vote string to binary and writing to a WriteStream.
I want to wrap this functionality in a Promise for easy use. I have the following:
streamVotesToFile(query, validVotesFileBasename) {
return new Promise((resolve, reject) => {
const writeStream = fs.createWriteStream(`${validVotesFileBasename}.egd`);
writeStream.on('error', (err) => {
logger.error(`Writestream ${validVotesFileBasename}.egd error`);
reject(err);
});
writeStream.on('drain', () => {
logger.info(`Writestream ${validVotesFileBasename}.egd error`);
})
db.client.stream(query)
.on('readable', function() {
let row = this.read();
while (row) {
const envelope = new Buffer(row.vote, 'base64');
if(!writeStream.write(envelope + '\n')) {
logger.error(`Couldn't write vote`);
}
row = this.read()
}
})
.on('end', () => { // No more rows from Cassandra
writeStream.end();
writeStream.on('finish', () => {
logger.info(`Stream done writing`);
resolve();
});
})
.on('error', (err) => { // err is a response error from Cassandra
reject(err);
});
});
}
When I run this it is appending all the votes to a file and downloading fine. But there are a bunch of problems/questions I have:
If I make a req to the /export endpoint and this function runs, while it's running all other requests to the app are extremely slow or just don't finish before the export request is done. I'm guessing because the event loop being hogged by all of these events from the Cassandra stream (thousands per second) ?
All the votes seem to write to the file fine yet I get false for almost every writeStream.write() call and see the corresponding logged message (see code) ?
I understand that I need to consider backpressure and the 'drain' event for the WritableStream so ideally I would use pipe() and pipe the votes to a file because that has built in backpressure support (right?) but since I need to process each row (convert to binary and possible add other data from other row fields in the future), how would I do that with pipe?
This the perfect use case for a TransformStream:
const myTransform = new Transform({
readableObjectMode: true,
transform(row, encoding, callback) {
// Transform the row into something else
const item = new Buffer(row['vote'], 'base64');
callback(null, item);
}
});
client.stream(query, params, { prepare: true })
.pipe(myTransform)
.pipe(fileStream);
See more information on how to implement a TransformStream in the Node.js API Docs.
I have a nodejs proxy for calling a service. On the response, the request is piped to the service url (I guess that's the right way to do it if you want to parse the response before returning it). The problem is that the parser sometimes fails on JSON.parse(data) because it Unexpected end of input. From what I saw while debugging the issue is that the data being parsed is not complete (even though the service returns it properly).
I don't have too much experience with pipe and stream so I'm not sure why this is failing sometimes.
//Request setup
r.on('response', function(resp) {
if (resp.statusCode === 200) {
r.pipe(responseParser(config.get('service:url'))).pipe(res);
} else {
r.pipe(res);
}
});
//Parser module
var _ = require('lodash'),
stream = require('stream');
module.exports = function responseParser(url) {
var data = '',
parser = new stream.Transform({
objectMode: true
});
parser._transform = function (chunk, encoding, done) {
data += chunk.toString();
done();
};
parser._flush = function (done) {
if (data) {
var obj = mapValues(JSON.parse(data));
this.push(JSON.stringify(obj));
}
done();
};
function mapValues(data){
...
}
return parser;
}
I still don't know why sometimes the flush gets called before all the chunks of data are returned but what I did in order to avoid that is just to parse the chunks as they arrived, by making sure that in a chunk I don't get partial data on the values I needed to map. If a chunk contains only partial information for the targeted value, I remove it, and add it at the beginning of the next chunk. This way the data is parsed as it comes in so I don't have to rely on the fact that flush is called only when all the data has returned.
I would disable objectMode as it's not necessary in this case. Also, you'll want to wrap the JSON parsing in a try-catch in case of malformed input:
module.exports = function responseParser(url) {
var data = '';
var parser = new stream.Transform();
parser._transform = function(chunk, encoding, done) {
data += chunk;
done();
};
parser._flush = function(done) {
var err;
if (data) {
try {
var obj = mapValues(JSON.parse(data));
this.push(JSON.stringify(obj));
this.push(null);
} catch (ex) {
err = ex;
}
}
done(err);
};
function mapValues(data){
// ...
}
return parser;
};
You may also want to check that resp.headers['content-type'] contains application/json first before trying to parse it as such and you may want to make a custom Transform subclass and instantiate that instead of creating new _transform() and _flush() functions every time.
Rather than writing this yourself, why don't you use a streaming JSON parser that knows how to parse a stream? JSONStream for example.
The other option to make your life easier would be to use stream-to-promise to just convert the read stream into a Promise that will resolve to a Buffer of the JSON, which you can then parse.
Also, why is your proxy parsing the JSON?
How to convert stream into buffer in nodejs? Here is my code to parse a file in post request in express.
app.post('/upload', express.multipart({
defer: true
}), function(req, res) {
req.form.on('part', function(part) {
//Here I want to convert the streaming part into a buffer.
//do something buffer-specific task
var out = fs.createWriteStream('image/' + part.filename);
part.pipe(out);
});
req.form.on('close', function() {
res.send('uploaded!');
});
});
Instead of piping, you can attach readable and end event handlers to the part stream to read it:
var buffers = [];
part.on('readable', function(buffer) {
for (;;) {
let buffer = part.read();
if (!buffer) { break; }
buffers.push(buffer);
}
});
part.on('end', function() {
var buffer = Buffer.concat(buffers);
...do your stuff...
// write to file:
fs.writeFile('image/' + part.filename, buffer, function(err) {
// handle error, return response, etc...
});
});
Note: If you instead use data, it will read the entire upload into memory.
You could also create a custom transform stream to transform the incoming data, but that might not be trivial.
You can use the stream-to module, which can convert a readable stream's data into an array or a buffer:
var streamTo = require('stream-to');
req.form.on('part', function (part) {
streamTo.buffer(part, function (err, buffer) {
// Insert your business logic here
});
});
If you want a better understanding of what's happening behind the scenes, you can implement the logic yourself, using a Writable stream. As a writable stream implementor, you only have to define one function: the _write method, that will be called every time some data is written to the stream. When the input stream is finished emitting data, the end event will be emitted: we'll then create a buffer using the Buffer.concat method.
var stream = require('stream');
var converter = new stream.Writable();
// We'll store all the data inside this array
converter.data = [];
converter._write = function (chunk) {
converter.data.push(chunk);
};
// Will be emitted when the input stream has ended,
// i.e. no more data will be provided
converter.on('finish', function() {
// Create a buffer from all the received chunks
var b = Buffer.concat(this.data);
// Insert your business logic here
});
I'm relatively new to Node and Express.js. I'm trying to create a websocket server to push CSV data in irregular intervals stored in the file itself, line after line.
The CSV structure is something like this:
[timeout [ms], data1, data2, data3 ...]
I've successfully created a websocket server which communicates with the client.
I'm looking for a best solution to effectively do something like this:
1. Read a line of the CSV file
2. Send a line with WebSockets
3. Pause the reading for a period of time stored in the first value of the row
4. Resume the reading after the interval has passed, and back to step 1.
So far, I got this far (please feel free to trash my code completely as it might be very wrong - as I said, I'm new to it. It seems like the pause() doesn't do anything.
var $ = require('jquery')
,csv = require('csv');
exports.index = function(server){
var io = require('socket.io').listen(server);
io.sockets.on('connection', function (socket) {
socket.on('startTransmission', function(msg) {
csv()
.from.path('C:/dev/node_express/csv/test.csv', { delimiter: ',', escape: '"' })
.on('record', function(row,index){
var rowArray = $.parseJSON(JSON.stringify(row));
var json = {},
that = this;
$.each(rowArray, function(i,value){
json[keys[i]] = value;
});
socket.emit('transmitDataData', json);
//this.pause(); //I guess around here is where I'd like to pause
// setTimeout(function(){
// that.resume(); //and resume here after the timeout, stored in the first value (rowArray[0])
// }, rowArray[0]);
});
});
});
};
The commented out code unfortunately does not work - All data is sent immediately, row after row, the function doesn't pause
I ran into the same sort of thing with another use case. The issue is that calling pause() on the stream pauses the underlying stream reading but not the csv record parsing, so the record event can get called with the remainder of the records that made up the last read stream chunk. I synchronized them, in my case, like this:
var rows=0, actions=0;
stream.on('record', function(row, index){
rows++;
// pause here, but expect more record events until the raw read stream is exhausted
stream.pause();
runner.do(row, function(err, result) {
// when actions have caught up to rows read, read more rows.
if (actions==rows) {
stream.resume();
}
});
});
In your case, I'd buffer the rows and release them with the timer. Here's an untested re-factoring just to give you an idea of what I mean:
var $ = require('jquery'),
csv = require('csv');
exports.index = function(server){
var io = require('socket.io').listen(server);
io.sockets.on('connection', function (socket) {
socket.on('startTransmission', function(msg) {
var timer=null, buffered=[], stream=csv().from.path('C:/dev/node_express/csv/test.csv', { delimiter: ',', escape: '"' });
function transmit(row) {
socket.emit('transmitDataData', row);
}
function drain(timeout) {
if (!timer) {
timer = setTimeout(function() {
timer = null;
if (buffered.length<=1) { // get more rows ahead of time so we don't run out. otherwise, we could skip a beat.
stream.resume(); // get more rows
} else {
var row = buffered.shift();
transmit(row);
drain(row[0]);
}
}, timeout);
}
}
stream.on('record', function(row,index){
stream.pause();
if (index == 0) {
transmit(row);
} else {
buffered.push(row);
}
drain(row[0]); // assuming row[0] contains a timeout value.
});
stream.on('end', function() {
// no more rows. wait for buffer to empty, then cleanup.
});
stream.on('error', function() {
// handle error.
});
});
};