Parsing piped response sometimes gets chunked data - node.js

I have a Node.js proxy for calling a service. On the response, the request is piped to the service URL (I guess that's the right way to do it if you want to parse the response before returning it). The problem is that the parser sometimes fails on JSON.parse(data) with "Unexpected end of input". From what I saw while debugging, the data being parsed is sometimes incomplete (even though the service returns it properly).
I don't have much experience with pipes and streams, so I'm not sure why this fails intermittently.
// Request setup
r.on('response', function (resp) {
  if (resp.statusCode === 200) {
    r.pipe(responseParser(config.get('service:url'))).pipe(res);
  } else {
    r.pipe(res);
  }
});
// Parser module
var _ = require('lodash'),
    stream = require('stream');

module.exports = function responseParser(url) {
  var data = '',
      parser = new stream.Transform({
        objectMode: true
      });

  parser._transform = function (chunk, encoding, done) {
    data += chunk.toString();
    done();
  };

  parser._flush = function (done) {
    if (data) {
      var obj = mapValues(JSON.parse(data));
      this.push(JSON.stringify(obj));
    }
    done();
  };

  function mapValues(data) {
    // ...
  }

  return parser;
};
I still don't know why _flush sometimes gets called before all the chunks have arrived, but to work around it I now parse the chunks as they arrive, making sure a chunk never ends with partial data for the values I need to map. If a chunk contains only partial information for a targeted value, I strip that part off and prepend it to the next chunk. This way the data is parsed as it comes in, so I don't have to rely on _flush being called only after all the data has been received.
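A rough sketch of that carry-over idea, with a made-up cut point and a hypothetical mapPartial() in place of the real value-mapping logic:
var stream = require('stream');

module.exports = function responseParser(url) {
  var pending = '';
  var parser = new stream.Transform();

  function mapPartial(text) {
    return text; // hypothetical: replace with the real per-chunk value mapping
  }

  parser._transform = function (chunk, encoding, done) {
    var text = pending + chunk.toString();
    var cut = text.lastIndexOf('},');        // illustrative "safe to split here" marker
    if (cut === -1) {
      pending = text;                        // nothing complete yet, keep it all
    } else {
      this.push(mapPartial(text.slice(0, cut + 2)));
      pending = text.slice(cut + 2);         // partial tail is prepended to the next chunk
    }
    done();
  };

  parser._flush = function (done) {
    if (pending) this.push(mapPartial(pending));
    done();
  };

  return parser;
};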

I would disable objectMode as it's not necessary in this case. Also, you'll want to wrap the JSON parsing in a try-catch in case of malformed input:
module.exports = function responseParser(url) {
  var data = '';
  var parser = new stream.Transform();

  parser._transform = function (chunk, encoding, done) {
    data += chunk;
    done();
  };

  parser._flush = function (done) {
    var err;
    if (data) {
      try {
        var obj = mapValues(JSON.parse(data));
        this.push(JSON.stringify(obj));
        this.push(null);
      } catch (ex) {
        err = ex;
      }
    }
    done(err);
  };

  function mapValues(data) {
    // ...
  }

  return parser;
};
You may also want to check that resp.headers['content-type'] contains application/json before trying to parse the body as such. You may also want to define a custom Transform subclass and instantiate that, instead of creating new _transform() and _flush() functions on every request.
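A minimal sketch of that subclass idea, reusing the mapValues() helper from the question and adding the content-type check:
var stream = require('stream');
var util = require('util');

function ResponseParser(options) {
  if (!(this instanceof ResponseParser)) return new ResponseParser(options);
  stream.Transform.call(this, options);
  this.data = '';
}
util.inherits(ResponseParser, stream.Transform);

ResponseParser.prototype._transform = function (chunk, encoding, done) {
  this.data += chunk;
  done();
};

ResponseParser.prototype._flush = function (done) {
  var err;
  if (this.data) {
    try {
      // mapValues() as defined in the question
      this.push(JSON.stringify(mapValues(JSON.parse(this.data))));
    } catch (ex) {
      err = ex;
    }
  }
  done(err);
};

// Usage, with the content-type check:
r.on('response', function (resp) {
  var isJson = /application\/json/.test(resp.headers['content-type'] || '');
  if (resp.statusCode === 200 && isJson) {
    r.pipe(new ResponseParser()).pipe(res);
  } else {
    r.pipe(res);
  }
});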

Rather than writing this yourself, why not use a streaming JSON parser that already knows how to handle chunked input? JSONStream, for example.
The other option to make your life easier would be to use stream-to-promise to just convert the read stream into a Promise that will resolve to a Buffer of the JSON, which you can then parse.
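For example, a rough sketch with stream-to-promise, assuming mapValues() from the question is available where the proxy handles the response:
var streamToPromise = require('stream-to-promise');

r.on('response', function (resp) {
  if (resp.statusCode === 200) {
    // resp is the readable response stream; the promise resolves with its full body.
    streamToPromise(resp).then(function (body) {
      var obj = mapValues(JSON.parse(body.toString()));
      res.end(JSON.stringify(obj));
    }).catch(function (err) {
      console.log(err);
      res.statusCode = 502;
      res.end();
    });
  } else {
    r.pipe(res);
  }
});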
Also, why is your proxy parsing the JSON?

Related

How to pipe data modified from a gunzip stream into a gzip stream?

I need to trigger, through an HTTP request, a process where I download some data from S3, gunzip it, modify the stream, gzip it, and send it to another bucket in S3.
So far I was able to either:
Download
Gunzip
Modify (filter) the data
Return the data
Or:
Download
Gunzip
Gzip
Upload the unmodified data and retrieve the url of the object
My first attempt consisted of using the on('data') event from the gunzip stream to modify the data; then, when the 'end' event fires, I can return it to the browser that made the request.
var accumulator = [];

gunzip.on('data', chunk => {
  var lines = chunk.toString('utf-8').split('\n');
  lines.forEach(line => {
    if (shouldBeFiltered(line)) {
      accumulator.push(line);
    }
  });
});

gunzip.on('end', () => {
  res.send(accumulator);
});

getS3.pipe(gunzip);
If, instead of returning the result (res.send), I try to pipe gunzip into gzip, the filter is ignored. That makes sense, as I have an accumulator array that I return (in the previous case) when the end event fires.
Then, after some digging, I found a reference suggesting that the data should be pushed back onto the stream, and I tried the following, which did not work:
gunzip.on('data', chunk => {
  var lines = chunk.toString('utf-8').split('\n');
  lines.forEach(line => {
    if (shouldBeFiltered(line)) {
      gunzip.push(line);
    }
  });
});

// the end event no longer mattered
// gunzip.on('end', () => {
//   res.send(accumulator);
// });

getS3.pipe(gunzip).pipe(gzip).pipe(putS3(putS3param.Key, putS3param.Bucket));
Then I tried to create a transform stream (this is extremely simplified, as I was just trying the concept), but I got an internal error:
const stream = require('stream');
const Transform = stream.Transform;

function filter(pipeline) {
  var the_filter = new Transform({
    transform(chunk, encoding, next) {
      console.log();
      chunk += Buffer('Modified', 'utf-8');
      this.push(chunk);
      next();
    }
  });
  pipeline.pipe(the_filter);
}
Other than writing the data to a file, gzipping that, and uploading it, I have no more ideas.
Thanks for any help!
After much digging around, I finally found the answer on this page.
It seems that what was missing was setting the Transform to objectMode; other than that, I see nothing relevant.
var stream = require('stream');
var liner = new stream.Transform({ objectMode: true });

liner._transform = function (chunk, encoding, done) {
  var data = chunk.toString();
  if (this._lastLineData) data = this._lastLineData + data;

  var lines = data.split('\n');
  this._lastLineData = lines.splice(lines.length - 1, 1)[0];

  lines.forEach(this.push.bind(this));
  done();
};

liner._flush = function (done) {
  if (this._lastLineData) this.push(this._lastLineData);
  this._lastLineData = null;
  done();
};

module.exports = liner;
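For reference, a rough sketch of how the liner could be wired into the pipeline, reusing getS3, gunzip, gzip, putS3, putS3param, and shouldBeFiltered() from the question:
var stream = require('stream');
var liner = require('./liner');

// Keep only the lines that pass the filter, and re-add the newline liner strips.
var filter = new stream.Transform({ objectMode: true });
filter._transform = function (line, encoding, done) {
  if (shouldBeFiltered(line)) {
    this.push(line + '\n');
  }
  done();
};

getS3
  .pipe(gunzip)
  .pipe(liner)
  .pipe(filter)
  .pipe(gzip)
  .pipe(putS3(putS3param.Key, putS3param.Bucket));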

Piping transformed read stream into request.post()

I'd like to parse a log file and POST what is read to a request endpoint. I've managed to build a solution that generates a request for every log line read. However, it doesn't create any back pressure, so it just floods the server, and I'd like to slow it down.
This led me to investigate using stream pipes to see if I could route data from a file directly into request.post(). I can't get the post call to post a body object, though.
var stream = require('stream');
var request = require('request');

var liner = new stream.Transform({ objectMode: true });

liner._transform = function (chunk, encoding, done) {
  var data = chunk.toString();
  if (this._lastLineData) data = this._lastLineData + data;

  var lines = data.split('\n');
  this._lastLineData = lines.splice(lines.length - 1, 1)[0];

  var that = this;
  lines.forEach(function(line) {
    var line_obj = JSON.parse(line);
    if (line_obj.url === "/api/usages" && line_obj.method === 'POST') {
      var req_body = line_obj.body.body;
      that.push.bind(req_body);
    }
  });
  done();
};
var file_name = process.argv[2];
console.log('Reading from ' + file_name);

var fs = require('fs');
var liner = require('./liner');

var source = fs.createReadStream(file_name);
source.pipe(liner).pipe(request
  .post("http://localhost:8081/api/usages")
  .on('response', function(response) {
    console.log(response.statusCode); // 200
  })
  .on('error', function(err) {
    console.log(err);
  }));
The push call in the transform function is working correctly, but it's not posting that object via the body in request.post().
What am I missing?
Will this provide the back pressure I'm looking for to throttle the POST calls before all of the file reads are completed?
I've discovered that you cannot simply pipe a stream into an HTTP request body, because the Content-Length would need to be known beforehand (as per the spec). The less pleasant alternative is a multipart upload: as chunks are read from your transform, they are marshalled as parts to the receiving API. This also means the receiving API needs to accept multipart uploads and reassemble the whole file after all parts have been received and confirmed. AWS S3 has multipart uploads and might be a nice example: http://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html
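As a sketch of that S3 route, the aws-sdk's s3.upload() helper accepts a stream Body and performs the multipart upload for large bodies under the hood (the bucket and key names here are made up):
var AWS = require('aws-sdk');
var fs = require('fs');

var s3 = new AWS.S3();
// Stream the file straight into a managed (multipart) upload.
var body = fs.createReadStream(process.argv[2]);

s3.upload({ Bucket: 'my-upload-bucket', Key: 'usages.log', Body: body }, function (err, data) {
  if (err) return console.error('upload failed:', err);
  console.log('uploaded to', data.Location);
});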
I wanted to pipe my transform data to another API that I manage but it seems the effort is not likely worth it considering my files really aren't that big. I'll update this answer if I change my mind :)
Although I wasn't able to find a solution to the streaming question, I found a simple solution to the back pressure question.
I used async.queue to push work into a simple task queue.
// build the send queue
var pool = new http.Agent({
  keepAlive: true,
  keepAliveMsecs: 10000,
  maxSockets: Math.floor(send_queue_concurrency * 1.5)
});

var q = async.queue(function(task, callback) {
  request({
    url: 'http://localhost:8081/xxxxxx',
    method: 'POST',
    json: task.req_body,
    gzip: true,
    pool: pool,
    timeout: 30000
  }, function(error, response, body) {
    if (error) {
      console.log('request error : ' + error);
      post_status.fail++;
    } else {
      if (response.statusCode === 400) {
        console.dir(body);
      }
    }
    callback();
  });
}, send_queue_concurrency);

q.drain = done;
send_queue_concurrency is the primary lever for controlling request pressure.
I'm pushing work into the queue with a file-parsing routine:
rl.on('line', function(line) {
  line_count++;
  try {
    var line_object = JSON.parse(line);
    var req_body = line_object.body.body;
    q.push({req_body: req_body, line_object: line_object}, function(err) {
      if (err) {
        console.log('queue error! ' + JSON.stringify(err));
      }
    });
  } catch (e) {
    console.dir(e);
  }
});

var done = function() {
  // print out some reporting stats...
  // console.log('xxxxxx');
  console.log('\ndone.');
  process.exit(0);
};
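rl isn't shown above; it is presumably a readline interface over the log file, along these lines:
var fs = require('fs');
var readline = require('readline');

// file_name as in the question's script arguments
var rl = readline.createInterface({
  input: fs.createReadStream(file_name)
});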

How can I resume after an error event in a piped stream in Node.js?

After I emit an error event in MyWritableStream, data transmission stops. What do I need to do to resume the data transfer?
var readable = fs.createReadStream('test.txt');
var writable = new MyWritableStream();

writable.on('error', function(error) {
  console.log('error', error);
  // How can I resume?
});

writable.on('finish', function() {
  console.log('finished');
});

readable.pipe(writable);
I know this question is old, but you might wanna check out https://github.com/miraclx/xresilient
I built this for this exact same reason (works best with seekable streams).
You define a function that returns a readable stream, and the library tracks the number of bytes that have passed through before an error occurs.
Once the readable stream encounters an error event, it calls the defined function again with the number of bytes read so far, so you can index into the stream source.
Example:
const fs = require('fs');
const xresilient = require('xresilient');

const readable = xresilient(({bytesRead}) => {
  return generateSeekableStreamSomehow({start: bytesRead});
}, {retries: 5});

const writable = fs.createWriteStream('file.test');
readable.pipe(writable);
File streams are indexable with the start option of the fs.createReadStream() function.
HTTP Requests are indexable with the Range HTTP Header.
Check it out.
https://www.npmjs.com/package/xresilient
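For the HTTP case, a rough sketch using the request module and the Range header (the URL is just a placeholder, and the server has to honor Range requests):
const fs = require('fs');
const request = require('request');
const xresilient = require('xresilient');

const readable = xresilient(({bytesRead}) => {
  // Ask the server to resume from the last byte that made it through.
  return request.get('https://example.com/big.file', {
    headers: { Range: 'bytes=' + bytesRead + '-' }
  });
}, {retries: 5});

readable.pipe(fs.createWriteStream('big.file'));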
I am not sure if this is normal practice, but I can't see another solution for now, and it works for me. If you can suggest a more accurate solution, please do.
We can track the readable stream instance using the pipe event on the writable one:
function WriteableStream(options) {
  Writable.call(this, options);
  this.source = null;
  var instance = this;
  this.on('pipe', function(source) {
    instance.source = source;
  });
}
util.inherits(WriteableStream, Writable);
So when we emit an error event and the readable stream is unpiped automatically, we can re-pipe it ourselves:
WriteableStream.prototype._write = function(chunk, encoding, done) {
  this.emit('error', new Error('test')); // unpipes readable
  done();
};

WriteableStream.prototype.resume = function() {
  this.source.pipe(this); // re-pipes readable
};
Finally, we will use it the following way:
var readable = fs.createReadStream(file);
var writeable = new WriteableStream();

writeable.on('error', function(error) {
  console.log('error', error);
  writeable.resume();
});

readable.pipe(writeable);

Stream read returning empty in nodejs

I am using Express to create a web service that reads string data from a stream and responds to an HTTP POST request with that value. Here is the code for the S3Store.js file that defines the readFileFromS3() function:
S3Store.js
S3Store.prototype.readFileFromS3 = function(filename, callback) {
  var readConfig = {
    'Bucket': 'shubham-test',
    'Key': filename
  };
  var readStream = this.s3.getObject(readConfig).createReadStream();
  var allData = '';

  readStream.on('data', function(data) {
    //data = Buffer.concat([allData, data]);
    data = allData + data;
    console.log("data: " + data);
  });

  readStream.on('error', function(err) {
    callback(err, null);
  });
};
Now, if I call this method from a terminal like this:
s3Instance.readFileFromS3('123.json', function(err, data) {
  console.log(data);
});
I see the appropriate string for data logged to the console. However, when I call the same method from inside one of the Express routes for HTTP POST requests, the service responds with data set to an empty string. Code for the POST request:
router.post('/resolve', function(req, res) {
  var commandJson = req.body;
  var appId = commandJson['appId'];
  var command = commandJson['text'];

  if (appId == undefined || command == undefined) {
    res.status(400).send("Malformed Request: appId: " + appId + ", command: " + command);
  }

  s3Store.readFileFromS3('123.json', function(err, data) {
    res.send(data);
  });
});
Why does it return an empty string when readFileFromS3() is called from the HTTP POST route, but not when I run the same method directly from the node console?
You're logging the data but you're not passing anything to the completion callback (see below for some more explanation):
S3Store.prototype.readFileFromS3 = function(filename, callback) {
  var readConfig = {
    'Bucket': 'shubham-test',
    'Key': filename
  };
  var readStream = this.s3.getObject(readConfig).createReadStream();
  var allData = [];

  // Keep collecting data.
  readStream.on('data', function(data) {
    allData.push(data);
  });

  // Done reading, concatenate and pass to the completion callback.
  readStream.on('end', function() {
    callback(null, Buffer.concat(allData));
  });

  // Handle any stream errors.
  readStream.on('error', function(err) {
    callback(err, null);
  });
};
I took the liberty of rewriting the data collection to use Buffers instead of strings, but this obviously isn't a requirement.
The callback argument is a completion function, meant to be called when either reading the S3 stream is done, or when it has thrown an error. The error handling was already in place, but not the part where you would call back when all the data from the stream was read, which is why I added the end handler.
At that point, the readStream is exhausted (everything from it has been read into allData), and you call the completion callback with the collected data as the second argument.
The common idiom throughout Node is that completion callbacks take (at least) two arguments: the first is either an error, or null when there aren't errors, and the second is the data you want to pass back to the caller (in your case, the anonymous function in your route handler that calls res.send()).
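With that in place, the route handler from the question can forward errors and stringify the Buffer; a small sketch (the status codes are just an example):
s3Store.readFileFromS3('123.json', function(err, data) {
  if (err) {
    return res.status(500).send(err.message);
  }
  // data is a Buffer here because of Buffer.concat() above.
  res.type('application/json').send(data.toString());
});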

How to force Node.js Transform stream to finish?

Consider the following scenario. I have two Node Transform streams:
Transform stream 1
function T1(options) {
  if (!(this instanceof T1)) {
    return new T1(options);
  }
  Transform.call(this, options);
}
util.inherits(T1, Transform);

T1.prototype._transform = function(chunk, encoding, done) {
  console.log("### Transforming in t1");
  this.push(chunk);
  done();
};

T1.prototype._flush = function(done) {
  console.log("### Done in t1");
  done();
};
Transform stream 2
function T2(options) {
  if (!(this instanceof T2)) {
    return new T2(options);
  }
  Transform.call(this, options);
}
util.inherits(T2, Transform);

T2.prototype._transform = function(chunk, encoding, done) {
  console.log("### Transforming in t2");
  this.push(chunk);
  done();
};

T2.prototype._flush = function(done) {
  console.log("### Done in t2");
  done();
};
I want to apply these transform streams before returning a response. I have a simple HTTP server; on each request, I fetch a resource, apply both transformations to it, and then send the result of the second transformation back in the original response:
var options = require('url').parse('http://localhost:1234/data.json');
options.method = 'GET';

http.createServer(function(req, res) {
  var req = http.request(options, function(httpRes) {
    var t1 = new T1({});
    var t2 = new T2({});

    httpRes
      .pipe(t1)
      .pipe(t2)
      .on('finish', function() {
        // Do other stuff in here before sending request back
        t2.pipe(res, { end: true });
      });
  });
  req.end();
}).listen(3001);
Ultimately, the finish event never fires, and the request hangs and times out because the response is never sent. I've noticed that if I just pipe t2 into res, it seems to work fine:
.pipe(t1)
.pipe(t2)
.pipe(res, { end : true });
But, this scenario doesn't seem feasible because I need to do some extra work before returning the response.
This happens because you need to let Node know that the stream is being consumed somewhere; otherwise the last stream will just fill up its internal buffer and, once your data exceeds the highWaterMark option (16 KB by default, or 16 objects in object mode), halt and wait for the data to be consumed.
There are three ways of consuming a stream in full:
piping it into a writable stream (what you did in the second part of your question)
reading consecutive chunks by calling the stream's read() method
listening for "data" events (essentially stream.on("data", someFunc)).
The last option is the quickest, but consumes the stream without regard to memory usage.
I'd also note that using the "finish" event can be misleading, since it fires when the last data has been written into the transform, but not necessarily emitted out of it. On a Transform stream, which is readable as well, it's better to use the "end" event.
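Applied to the original example, one way is to consume t2 yourself, buffer its output, do the extra work, and only then answer the request; a rough sketch:
http.createServer(function(req, res) {
  var proxyReq = http.request(options, function(httpRes) {
    var t1 = new T1({});
    var t2 = new T2({});
    var chunks = [];

    httpRes.pipe(t1).pipe(t2);

    // Attaching a 'data' listener consumes t2, so 'end' will actually fire.
    t2.on('data', function(chunk) {
      chunks.push(chunk);
    });
    t2.on('end', function() {
      // Do other stuff in here before sending the response back
      res.end(Buffer.concat(chunks));
    });
  });
  proxyReq.end();
}).listen(3001);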
