I'd like parse a log file and POST what is read to a request endpoint. I've managed to build a solution that generates a request for every log line read. However, it doesn't create any back pressure so it just flogs the server and I'd like to slow it down.
This lead me to investigate using stream pipes to see if I could route data from a file directly into request.post(). I can't get the post call to post a body object though.
var stream = require('stream');
var request = require('request');
var liner = new stream.Transform( { objectMode: true } );
liner._transform = function (chunk, encoding, done) {
var data = chunk.toString()
if (this._lastLineData) data = this._lastLineData + data
var lines = data.split('\n')
this._lastLineData = lines.splice(lines.length-1,1)[0]
var that = this;
lines.forEach(function(line) {
var line_obj = JSON.parse(line);
if( line_obj.url === "/api/usages" && line_obj.method === 'POST' ) {
var req_body = line_obj.body.body;
that.push.bind(req_body);
}
});
done();
}
var file_name = process.argv[2];
console.log('Reading from ' + file_name);
var fs = require('fs')
var liner = require('./liner')
var source = fs.createReadStream(file_name)
source.pipe(liner).pipe(request
.post("http://localhost:8081/api/usages")
.on('response', function(response) {
console.log(response.statusCode) // 200
})
.on('error', function(err) {
console.log(err);
}));
The push call in the transform function is working correctly, but it's not posting that object via the body in request.post().
What am I missing?
Will this provide the back pressure I'm looking for to throttle the POST calls before all of the file reads are completed?
I've discovered that you cannot pipe a stream to an HTTP request because you would need the Content-Length known before hand (as per spec). The less pleasant alternative is to multipart the upload - as chunks are read from your transform they would marshal parts to the receiving API. This also means the receiving API needs to be able to receive multipart uploads and reassemble the whole file after all parts have been received and confirmed. AWS S3 has multipart uploads and it might be a nice example: http://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html
I wanted to pipe my transform data to another API that I manage but it seems the effort is not likely worth it considering my files really aren't that big. I'll update this answer if I change my mind :)
Although I wasn't able to find a solution to the streaming question, I found a simple solution to the back pressure question.
I used async.queue to push work into a simple task queue.
// build the send queue
var pool = new http.Agent({keepAlive: true, keepAliveMsecs: 10000, maxSockets: Math.floor(send_queue_concurrency*1.5)});
var q = async.queue(function(task, callback){
request({
url : 'http://localhost:8081/xxxxxx',
method : 'POST',
json : task.req_body,
gzip : true,
pool : pool,
timeout: 30000
}, function(error, response, body){
if(error) {
console.log('request error : ' + error);
post_status.fail++;
} else {
if( response.statusCode === 400 ) {
console.dir(body);
}
}
callback();
});
}, send_queue_concurrency);
q.drain = done;
send_queue_concurrency is the primary lever for controlling request pressure.
i'm pushing work into the queue with a file parsing routine :
rl.on('line', function(line) {
line_count++;
try {
var line_object = JSON.parse(line);
var req_body = line_object.body.body;
q.push({req_body:req_body, line_object:line_object}, function(err){
if (err){
console.log('queue error! '+JSON.stringify(err));
}
});
} catch( e ) {
console.dir(e);
}
});
var done = function() {
// print out some reporting stats...
// console.log('xxxxxx');
console.log('\ndone.');
process.exit(0);
};
Related
I need to trigger, through an http request, a process where I download some data from S3, gunzip it, modify the stream, gzip it and send to another bucket in S3.
So far I was able to either:
Download
Gunzip
Modify (filter) the data
return the data
Or:
Download
Gunzip
Gzip
Upload the unmodified data and retrieve the url of the object
My first attempt at this consisted in using the on('data') event from the gunzip stream to modify the data; then when the 'end' event is thrown, I can return it to the browser making the request.
var accumulator = [];
gunzip.on('data', chunk=>{
var lines = chunk.toString('utf-8').split(\n);
lines.forEach(line=>{
if(shouldBeFiltered(line)){
accumulator.push(line);
}
})
})
gunzip.on('end', ()=>{
res.send(accumulator);
})
getS3.pipe(gunzip)
If instead of returning the result (res.send) I try to pipe gunzip to gzip, the filter is ignored. It makes sense as I have an accumulator array that I return (in the previous case) when the end event is thrown.
Then after some digging, I found a reference suggesting that the data should be pushed to, and I tried the following, which did not work:
gunzip.on('data', chunk=>{
var lines = chunk.toString('utf-8').split(\n);
lines.forEach(line=>{
if(shouldBeFiltered(line)){
gunzip.push(line);
}
})
})
// the end event no longer mattered
// gunzip.on('end', ()=>{
// res.send(accumulator);
// })
getS3.pipe(gunzip).pipe(gzip).pipe(putS3(putS3param.Key, putS3param.Bucket));
Then I tried to create a transform stream (this is extremely simplified as I was trying the concept), but then I had an internal error:
const stream = require('stream');
const Transform = stream.Transform;
function filter(pipeline) {
var the_filter = new Transform({
transform(chunk, encoding, next) {
console.log();
chunk += Buffer('Modified', 'utf-8');
this.push(chunk);
next();
}
});
pipeline.pipe(the_filter);
}
Other than creating a file and gziping it and uploading I have no more ideas.
Thanks for any help!
After much digging around, I finally found the answer in this page
It seems that what was missing as setting Transform as objectMode, other than that, I see nothing relevant.
var stream = require('stream')
var liner = new stream.Transform( { objectMode: true } )
liner._transform = function (chunk, encoding, done) {
var data = chunk.toString()
if (this._lastLineData) data = this._lastLineData + data
var lines = data.split('\n')
this._lastLineData = lines.splice(lines.length-1,1)[0]
lines.forEach(this.push.bind(this))
done()
}
liner._flush = function (done) {
if (this._lastLineData) this.push(this._lastLineData)
this._lastLineData = null
done()
}
module.exports = liner
I'm trying to fetch json from an api but only half of the response is received. So how to get the full response?
var request = require('request');
var url_to_check = 'http://example.com/api/test';
request.get(url_to_check).on('data', function(data) {
// Only half of the data is printed (8192). Remaining bytes are lost.
console.log(data.toString());
})
Your code is correct the only mistake you made is that you are streaming request data so you won't get whole data on event 'data' if a response is large. You will have to collect chunk and consolidate on event 'end'. check this code snippet
var request = require('request');
var url = 'https://reqres.in/api/users';
var req = request.get(url)
var data = []
req.on('data',function(chunk){
data.push(chunk))
})
req.on('end',function(){
console.log(Buffer.concat(data).toString())
})
And If you don't want to stream and pipe data and also response size is small then you can try this:
request.get(url, function(err, response, responseBody) {
if (!err) {
// var jsonRes = JSON.parse(JSON.stringify(response))
// responseBody = jsonRes.body
console.log(responseBody)
} else {
// handle error here
}
})
I am using express to create a webservice that will read string data from a stream, and respond to the HTTP POST request with that value. Here is the code for the S3Store.js file that defines the readFileFromS3(.) function:
S3Store.js
S3Store.prototype.readFileFromS3 = function(filename, callback) {
var readConfig = {
'Bucket': 'shubham-test',
'Key': filename
};
var readStream = this.s3.getObject(readConfig).createReadStream();
var allData = '';
readStream.on('data', function(data) {
//data = Buffer.concat([allData, data]);
data = allData + data;
console.log("data: " + data);
});
readStream.on('error', function(err) {
callback(err, null);
});
Now, if I call this method from a terminal like this:
s3Instance.readFileFromS3('123.json', function(err, data) {
console.log(data);
});
I see the appropriate string for data logged to the console. However, when I call the same method from inside one of the routes in express for HTTP POST requests, the service responds with a value of data set to empty string. Code for the POST request:
router.post('/resolve', function(req, res) {
var commandJson = req.body;
var appId = commandJson['appId'];
var command = commandJson['text'];
if (appId == undefined || command == undefined) {
res.status(400).send("Malformed Request: appId: " + appId + ", command: " + command);
};
s3Store.readFileFromS3('123.json', function(err, data) {
res.send(data);
});
});
Why does it return an empty string when calling the readFileFromS3(.) from the HTTP POST method and not when I ran the same method directly from the node console?
You're logging the data but you're not passing anything to the completion callback (see below for some more explanation):
S3Store.prototype.readFileFromS3 = function(filename, callback) {
var readConfig = {
'Bucket': 'shubham-test',
'Key': filename
};
var readStream = this.s3.getObject(readConfig).createReadStream();
var allData = [];
// Keep collecting data.
readStream.on('data', function(data) {
allData.push(data);
});
// Done reading, concatenate and pass to completion callback.
readStream.on('end', function() {
callback(null, Buffer.concat(allData));
});
// Handle any stream errors.
readStream.on('error', function(err) {
callback(err, null);
});
};
I took the liberty to rewrite the data collection to use Buffer's instead of strings, but this obviously isn't a requirement.
The callback argument is a completion function, meant to be called when either reading the S3 stream is done, or when it has thrown an error. The error handling was already in place, but not the part where you would call back when all the data from the stream was read, which is why I added the end handler.
At that point, the readStream is exhausted (everything from it has been read into allData), and you call the completion callback when the collected data as second argument.
The common idiom throughout Node is that completion callbacks take (at least) two arguments: the first is either an error, or null when there aren't errors, and the second is the data you want to pass back to the caller (in your case, the anonymous function in your route handler that calls res.send()).
I'm using the Request module to download files, but I'm not quite sure how to pipe the response to an output stream when the filename must come from the 'Content-Disposition' header. So basically, I need to read the response until the header is found, and then pipe the rest to that filename.
The examples show something like:
request('http://google.com/doodle.png').pipe(fs.createWriteStream('doodle.png'));
Where I want to do (pseudocode):
var req = request('http://example.com/download_latest_version?token=XXX');
var filename = req.response.headers['Content-Disposition'];
req.pipe(fs.createWriteStream(filename));
I could get the filename using the Request callback:
request(url, function(err, res, body) {
// get res headers here
});
But wouldn't that negate the benefits of using pipe and not loading the downloaded file into memory?
I'm reqesting a image from yahoo and it isn't using the content-disposition header but I am extracting the date and content-type headers to construct a filename. This seems close enough to what you're trying to do...
var request = require('request'),
fs = require('fs');
var url2 = 'http://l4.yimg.com/nn/fp/rsz/112113/images/smush/aaroncarter_635x250_1385060042.jpg';
var r = request(url2);
r.on('response', function (res) {
res.pipe(fs.createWriteStream('./' + res.headers.date + '.' + res.headers['content-type'].split('/')[1]));
});
Ignore my image choice please :)
Question has been around a while, but I today faced the same problem and solved it differently:
var Request = require( 'request' ),
Fs = require( 'fs' );
// RegExp to extract the filename from Content-Disposition
var regexp = /filename=\"(.*)\"/gi;
// initiate the download
var req = Request.get( 'url.to/somewhere' )
.on( 'response', function( res ){
// extract filename
var filename = regexp.exec( res.headers['content-disposition'] )[1];
// create file write stream
var fws = Fs.createWriteStream( '/some/path/' + filename );
// setup piping
res.pipe( fws );
res.on( 'end', function(){
// go on with processing
});
});
Here's my solution:
var fs = require('fs');
var request = require('request');
var through2 = require('through2');
var req = request(url);
req.on('error', function (e) {
// Handle connection errors
console.log(e);
});
var bufferedResponse = req.pipe(through2(function (chunk, enc, callback) {
this.push(chunk);
callback()
}));
req.on('response', function (res) {
if (res.statusCode === 200) {
try {
var contentDisposition = res.headers['content-disposition'];
var match = contentDisposition && contentDisposition.match(/(filename=|filename\*='')(.*)$/);
var filename = match && match[2] || 'default-filename.out';
var dest = fs.createWriteStream(filename);
dest.on('error', function (e) {
// Handle write errors
console.log(e);
});
dest.on('finish', function () {
// The file has been downloaded
console.log('Downloaded ' + filename);
});
bufferedResponse.pipe(dest);
} catch (e) {
// Handle request errors
console.log(e);
}
}
else {
// Handle HTTP server errors
console.log(res.statusCode);
}
});
The other solutions posted here use res.pipe, which can fail if the content is transferred using gzip encoding, because the response stream contains the raw (compressed) HTTP data. To avoid this problem you have to use request.pipe instead. (See the second example at https://github.com/request/request#examples.)
When using request.pipe I was getting an error: "You cannot pipe after data has been emitted from the response.", because I was doing some async stuff before actually piping (creating a directory to hold the downloaded file). I also had some problems where the file was being written with no content, which might have been due to request reading the HTTP response and buffering it.
So I ended up creating an intermediate buffering stream with through2, so that I could pipe the request to it before the response handler fires, then later piping from the buffering stream into the file stream once the filename is known.
Finally, I'm parsing the content disposition header whether the filename is encoded in plain form or in UTF-8 form using the filename*=''file.txt syntax.
I hope this helps someone else who experiences the same issues that I had.
I need to stream a file in base64 to an http endpoint using something like request or superagent. What is the best way to figure out what percentage of the file has been uploaded?
I assume I can create the read stream using something like:
fs.createReadStream('/tmp/cats.jpg', {encoding: 'base64'})
Any examples using one out of above libraries would be greatly appreciated.
I think you can use progress-stream.
Here is an example from the package:
var progress = require('progress-stream');
var fs = require('fs');
var stat = fs.statSync(filename);
var str = progress({
length: stat.size,
time: 100 /* ms */
});
str.on('progress', function(progress) {
console.log(progress);
/*
{
percentage: 9.05,
transferred: 949624,
length: 10485760,
remaining: 9536136,
eta: 42,
runtime: 3,
delta: 295396,
speed: 949624
}
*/
});
fs.createReadStream(filename)
.pipe(str)
.pipe(fs.createWriteStream(output));
I was looking for an answer to a similar issue and thanks to Alberto Zaccagni's answer, I was able to get some code working.
So for the people who don't want to piece the puzzle themselves, here is the code (edited for Stackoverflow):
var zipfile = "my_large_archive.zip";
// Get the size of the file
fs.stat(zipfile, function (err, stats) {
var zipSize = stats.size;
var uploadedSize = 0; // Incremented by on('data') to keep track of the amount of data we've uploaded
// Create a new read stream so we can plug events on it, and get the upload progress
var zipReadStream = fs.createReadStream(zipfile);
zipReadStream.on('data', function(buffer) {
var segmentLength = buffer.length;
// Increment the uploaded data counter
uploadedSize += segmentLength;
// Display the upload percentage
console.log("Progress:\t",((uploadedSize/zipSize*100).toFixed(2)+"%"));
});
// Some other events you might want for your code
zipReadStream.on('end', function() {
console.log("Event: end");
});
zipReadStream.on('close', function() {
console.log("Event: close");
});
var formData = require('form-data');
var form = new formData();
form.append('apikey', 'f4sd5f4sdf6ds456'); // Just some post parameters I need to send to the upload endpoint
form.append('file', zipReadStream); // The zip file, passed as a fs.createReadStream instance
// Submit the form and the file
form.submit('http://www.someserver.com/upload', function(err, res) {
if (err) {
console.log("Oups! We encountered an error :(\n\n", err);
return false;
}
console.log("Your file has been uploaded.");
res.resume(); // Fix is you use that code for a CLI, so that the execution will stop and let users enter new commands
});
});
In nodejs we have the Readable stream, it emits the data event when it receives a chunk of data, by knowing the file size you could easily keep track of how much data passes through the data event receiver and then update the percentage.
Get the file dimension with
require('fs').watchFile('yourfile', function () {
fs.stat('yourfile', function (err, stats) {
console.log(stats.size);
});
});