Node.js Download File Using Content-Disposition as Filename

I'm using the Request module to download files, but I'm not quite sure how to pipe the response to an output stream when the filename must come from the 'Content-Disposition' header. So basically, I need to read the response until the header is found, and then pipe the rest to that filename.
The examples show something like:
request('http://google.com/doodle.png').pipe(fs.createWriteStream('doodle.png'));
Where I want to do (pseudocode):
var req = request('http://example.com/download_latest_version?token=XXX');
var filename = req.response.headers['Content-Disposition'];
req.pipe(fs.createWriteStream(filename));
I could get the filename using the Request callback:
request(url, function(err, res, body) {
// get res headers here
});
But wouldn't that negate the benefits of using pipe and not loading the downloaded file into memory?

I'm requesting an image from Yahoo and it isn't using the Content-Disposition header, but I am extracting the date and content-type headers to construct a filename. This seems close enough to what you're trying to do...
var request = require('request'),
    fs = require('fs');

var url2 = 'http://l4.yimg.com/nn/fp/rsz/112113/images/smush/aaroncarter_635x250_1385060042.jpg';
var r = request(url2);

r.on('response', function (res) {
  res.pipe(fs.createWriteStream('./' + res.headers.date + '.' + res.headers['content-type'].split('/')[1]));
});
Ignore my image choice please :)

This question has been around a while, but I faced the same problem today and solved it differently:
var Request = require('request'),
    Fs = require('fs');

// RegExp to extract the filename from Content-Disposition
var regexp = /filename=\"(.*)\"/gi;

// initiate the download
var req = Request.get('url.to/somewhere')
  .on('response', function (res) {
    // extract filename
    var filename = regexp.exec(res.headers['content-disposition'])[1];
    // create file write stream
    var fws = Fs.createWriteStream('/some/path/' + filename);
    // setup piping
    res.pipe(fws);
    res.on('end', function () {
      // go on with processing
    });
  });
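For reference, a typical Content-Disposition value looks like attachment; filename="report.pdf", so the capture group above grabs whatever sits between the quotes. A tiny illustration with a made-up header value:
var regexp = /filename="(.*)"/i;
var header = 'attachment; filename="report.pdf"'; // hypothetical example value
console.log(regexp.exec(header)[1]); // -> report.pdf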

Here's my solution:
var fs = require('fs');
var request = require('request');
var through2 = require('through2');

var req = request(url);

req.on('error', function (e) {
  // Handle connection errors
  console.log(e);
});

var bufferedResponse = req.pipe(through2(function (chunk, enc, callback) {
  this.push(chunk);
  callback();
}));

req.on('response', function (res) {
  if (res.statusCode === 200) {
    try {
      var contentDisposition = res.headers['content-disposition'];
      var match = contentDisposition && contentDisposition.match(/(filename=|filename\*='')(.*)$/);
      var filename = match && match[2] || 'default-filename.out';
      var dest = fs.createWriteStream(filename);
      dest.on('error', function (e) {
        // Handle write errors
        console.log(e);
      });
      dest.on('finish', function () {
        // The file has been downloaded
        console.log('Downloaded ' + filename);
      });
      bufferedResponse.pipe(dest);
    } catch (e) {
      // Handle request errors
      console.log(e);
    }
  } else {
    // Handle HTTP server errors
    console.log(res.statusCode);
  }
});
The other solutions posted here use res.pipe, which can fail if the content is transferred using gzip encoding, because the response stream contains the raw (compressed) HTTP data. To avoid this problem you have to use request.pipe instead. (See the second example at https://github.com/request/request#examples.)
When using request.pipe I was getting an error: "You cannot pipe after data has been emitted from the response.", because I was doing some async stuff before actually piping (creating a directory to hold the downloaded file). I also had some problems where the file was being written with no content, which might have been due to request reading the HTTP response and buffering it.
So I ended up creating an intermediate buffering stream with through2, so that I could pipe the request to it before the response handler fires, then later piping from the buffering stream into the file stream once the filename is known.
Finally, I'm parsing the Content-Disposition header whether the filename is encoded in plain form or in the UTF-8 form that uses the filename*=''file.txt syntax.
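As a side note, the same buffering idea also works with Node's built-in stream.PassThrough, so through2 isn't strictly required. A minimal sketch along those lines, using a placeholder URL and the same header parsing as above:
var fs = require('fs');
var stream = require('stream');
var request = require('request');

var req = request('http://example.com/download_latest_version?token=XXX');
// pipe immediately, so request.pipe handles gzip and no data is lost
var buffered = req.pipe(new stream.PassThrough());

req.on('response', function (res) {
  var cd = res.headers['content-disposition'] || '';
  var match = cd.match(/(filename=|filename\*='')(.*)$/);
  var filename = (match && match[2]) || 'default-filename.out';
  buffered.pipe(fs.createWriteStream(filename));
});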
I hope this helps someone else who experiences the same issues that I had.

Related

How to pipe data modified from a gunzip stream into a gzip stream?

I need to trigger, through an HTTP request, a process where I download some data from S3, gunzip it, modify the stream, gzip it and send it to another bucket in S3.
So far I was able to either:
Download
Gunzip
Modify (filter) the data
Return the data
Or:
Download
Gunzip
Gzip
Upload the unmodified data and retrieve the URL of the object
My first attempt at this consisted of using the on('data') event from the gunzip stream to modify the data; then, when the 'end' event is emitted, I can return it to the browser making the request.
var accumulator = [];

gunzip.on('data', chunk => {
  var lines = chunk.toString('utf-8').split('\n');
  lines.forEach(line => {
    if (shouldBeFiltered(line)) {
      accumulator.push(line);
    }
  });
});

gunzip.on('end', () => {
  res.send(accumulator);
});

getS3.pipe(gunzip);
If, instead of returning the result (res.send), I try to pipe gunzip to gzip, the filter is ignored. That makes sense, as I have an accumulator array that I return (in the previous case) when the end event is emitted.
Then, after some digging, I found a reference suggesting that the data should be pushed back into the stream, and I tried the following, which did not work:
gunzip.on('data', chunk => {
  var lines = chunk.toString('utf-8').split('\n');
  lines.forEach(line => {
    if (shouldBeFiltered(line)) {
      gunzip.push(line);
    }
  });
});
// the end event no longer mattered
// gunzip.on('end', ()=>{
// res.send(accumulator);
// })
getS3.pipe(gunzip).pipe(gzip).pipe(putS3(putS3param.Key, putS3param.Bucket));
Then I tried to create a transform stream (this is extremely simplified as I was trying the concept), but then I had an internal error:
const stream = require('stream');
const Transform = stream.Transform;

function filter(pipeline) {
  var the_filter = new Transform({
    transform(chunk, encoding, next) {
      console.log();
      chunk += Buffer('Modified', 'utf-8');
      this.push(chunk);
      next();
    }
  });
  pipeline.pipe(the_filter);
}
Other than creating a file, gzipping it, and uploading it, I have no more ideas.
Thanks for any help!
After much digging around, I finally found the answer on this page.
It seems that what was missing was setting the Transform to objectMode; other than that, I see nothing else relevant.
var stream = require('stream')
var liner = new stream.Transform({ objectMode: true })

liner._transform = function (chunk, encoding, done) {
  var data = chunk.toString()
  if (this._lastLineData) data = this._lastLineData + data
  var lines = data.split('\n')
  this._lastLineData = lines.splice(lines.length - 1, 1)[0]
  lines.forEach(this.push.bind(this))
  done()
}

liner._flush = function (done) {
  if (this._lastLineData) this.push(this._lastLineData)
  this._lastLineData = null
  done()
}

module.exports = liner
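For completeness, here is a rough sketch of how that liner module might slot into the original pipeline; getS3, putS3, putS3param and shouldBeFiltered are placeholders carried over from the question, not real APIs:
var zlib = require('zlib');
var stream = require('stream');
var liner = require('./liner');

// objectMode transform that receives one line at a time from liner
var filter = new stream.Transform({ objectMode: true });
filter._transform = function (line, encoding, done) {
  if (shouldBeFiltered(line)) this.push(line + '\n'); // keep only matching lines
  done();
};

getS3
  .pipe(zlib.createGunzip())
  .pipe(liner)   // split the decompressed stream into lines
  .pipe(filter)  // drop the lines we don't want
  .pipe(zlib.createGzip())
  .pipe(putS3(putS3param.Key, putS3param.Bucket));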

Error: SyntaxError: Unexpected end of JSON input

I am trying to fetch a JSON file from an API website using the code below, but I am getting an error saying "Unexpected end of JSON input" when I run it:
var express = require("express");
var app = express();
var body = require("body-parser");
var https = require("https");
app.get("/results", function (req, res) {
https.get("https://www.omdbapi.com/?apikey=d49698c3&s=harry", function (response) {
response.on("data", function (data) {
var got = JSON.parse(data);
res.send(got.Title);
})
})
});
There are a couple of wrong assumptions in your code.
1 - Looking at the data, you can see that the property Title is inside every single object of the Search property, which is an array. So even when the JSON gets correctly parsed, you'll have an issue with got.Title.
2 - The data event of the response object can fire multiple times before you have all the data; every time it fires, you receive a chunk of the data.
Once all the data has been sent, the end event fires, and that is the place to parse the JSON.
Your code is trying to parse an incomplete JSON string, just the first chunk.
The most common approach is to declare an array outside the functions that handle the data and end events. On every data event you push the chunk into that array, and on the end event you concatenate it.
Check this out:
var express = require('express')
var app = express()
var body = require('body-parser')
var https = require('https')

app.get('/results', function (req, res) {
  https.get('https://www.omdbapi.com/?apikey=d49698c3&s=harry', function (response) {
    const chunks = []
    response.on('data', function (chunk) {
      chunks.push(chunk)
    })
    response.on('end', function () {
      const data = Buffer.concat(chunks)
      var got = JSON.parse(data)
      // Try this one out as well
      // res.json(got)
      res.send(got.Search[0].Title)
    })
  })
})

app.listen(3000)
I just tested the code above and it works like a charm.
In case of a large JSON object, you would want to handle the data in chunks: receive all the chunks on the data event, concatenate them as they arrive, and parse the combined JSON on the end event.
const express = require("express");
const app = express();
const body = require("body-parser");
const https = require("https");

app.get("/results", function (req, res) {
  https.get("https://www.omdbapi.com/?apikey=d49698c3&s=harry", function (response) {
    let finalData = '';
    response.on("data", function (data) {
      finalData += data.toString();
    });
    response.on("end", function () {
      const parsedData = JSON.parse(finalData);
      // Title lives inside the Search array (see the point above)
      res.send(parsedData.Search[0].Title);
    });
  });
});

app.listen(3000);
For everyone who's having the same issue, I found a solution:
As @Maestre San explained, you're parsing incomplete JSON data; that's why you're getting the error. I tried both suggested solutions but it still wasn't working for me. Hence, I researched and found out that, first of all, you need to store the data in an initially empty variable; once the data stream is done, you can then parse it by doing the following:
response.on("end", function () {
var jsonParse = JSON.parse(newsItems);
});
The full code is:
app.get("/", function (req, res) {
const queryString = "mamamoo";
const url = "https://newsapi.org/v2/everything?apiKey=<API_KEY>&qInTitle=" + queryString;
https.get(url, function (response) {
var newsItems = '';
response.on("data", function (data) {
newsItems += data;
});
response.on("end", function () {
var jsonParse = JSON.parse(newsItems);
console.log(jsonParse);
});
});
});
So as to explain what I did:
First, I make sure to catch any request made to my server. I then use the native Node https module to make a request to the API, capturing the response (which contains the status code and the response body).
However, I want to access the body data, so I use "response.on". First, I capture the data in chunks and store it in an initially empty variable.
Why store it in a variable? Because if I didn't, and parsed each chunk as it arrived, it would throw an error saying "Unexpected end of JSON input", meaning I'd be parsing incomplete JSON data.
The next step is to parse the data WHEN the data stream is done, by listening for the "end" event.
Hope this helped.

Corrupted file using request js

I am using request js to download a file.
function requ() {
  // rp is presumably request-promise, given the .then() usage below
  const options = {
    uri: `api/tasks/${id}/attachments/${attachmentId}`
  };
  return rp.get(options);
}
My question is:
why does piping to "res", like requ().pipe(res), work, while returning the result of the request above using "send", like
requ().then((result) => {
  // here result is the string representation of the file
  res.send(result);
})
doesn't?
const fs = require('fs');

requ().then((result) => {
  // here result is the string representation of the file
  const path = __dirname + '/tempFiles' + Date.now(); // a temporary file to send it
  fs.writeFile(path, result, function (err) {
    if (err) throw err;
    return res.sendFile(path);
  });
});
Read more about fs in the Node.js documentation.
My file was being corrupted because request was converting the response body to utf8. Using:
const options = {
  uri: `api/tasks/${id}/attachments/${attachmentId}`,
  encoding: null
}
fixed the problem.
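Putting it together, a minimal sketch of the fixed download under the same assumptions as above (rp being request-promise, res an Express response, and id/attachmentId coming from the surrounding route, none of which are shown here):
function requ() {
  return rp.get({
    uri: `api/tasks/${id}/attachments/${attachmentId}`,
    encoding: null // resolve with a raw Buffer instead of a utf8 string
  });
}

requ().then((buffer) => {
  // buffer is binary-safe now, so sending it no longer corrupts the file
  res.send(buffer);
});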

Piping transformed read stream into request.post()

I'd like to parse a log file and POST what is read to a request endpoint. I've managed to build a solution that generates a request for every log line read. However, it doesn't create any back pressure, so it just floods the server, and I'd like to slow it down.
This led me to investigate using stream pipes to see if I could route data from a file directly into request.post(). I can't get the post call to post a body object, though.
var stream = require('stream');
var request = require('request');

var liner = new stream.Transform({ objectMode: true });

liner._transform = function (chunk, encoding, done) {
  var data = chunk.toString();
  if (this._lastLineData) data = this._lastLineData + data;
  var lines = data.split('\n');
  this._lastLineData = lines.splice(lines.length - 1, 1)[0];
  var that = this;
  lines.forEach(function (line) {
    var line_obj = JSON.parse(line);
    if (line_obj.url === "/api/usages" && line_obj.method === 'POST') {
      var req_body = line_obj.body.body;
      that.push(req_body);
    }
  });
  done();
};
var file_name = process.argv[2];
console.log('Reading from ' + file_name);

var fs = require('fs');
var liner = require('./liner');
var source = fs.createReadStream(file_name);

source.pipe(liner).pipe(request
  .post("http://localhost:8081/api/usages")
  .on('response', function (response) {
    console.log(response.statusCode); // 200
  })
  .on('error', function (err) {
    console.log(err);
  }));
The push call in the transform function is working correctly, but it's not posting that object via the body in request.post().
What am I missing?
Will this provide the back pressure I'm looking for to throttle the POST calls before all of the file reads are completed?
I've discovered that you cannot pipe a stream to an HTTP request because you would need the Content-Length known beforehand (as per the spec). The less pleasant alternative is a multipart upload: as chunks are read from your transform, they would be marshalled into parts sent to the receiving API. This also means the receiving API needs to be able to receive multipart uploads and reassemble the whole file after all parts have been received and confirmed. AWS S3 has multipart uploads and it might be a nice example: http://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html
I wanted to pipe my transform data to another API that I manage but it seems the effort is not likely worth it considering my files really aren't that big. I'll update this answer if I change my mind :)
Although I wasn't able to find a solution to the streaming question, I found a simple solution to the back pressure question.
I used async.queue to push work into a simple task queue.
// required modules (http, async and request are used below)
var http = require('http');
var async = require('async');
var request = require('request');

// send_queue_concurrency and post_status are defined elsewhere in the script; done is shown further down

// build the send queue
var pool = new http.Agent({ keepAlive: true, keepAliveMsecs: 10000, maxSockets: Math.floor(send_queue_concurrency * 1.5) });

var q = async.queue(function (task, callback) {
  request({
    url: 'http://localhost:8081/xxxxxx',
    method: 'POST',
    json: task.req_body,
    gzip: true,
    pool: pool,
    timeout: 30000
  }, function (error, response, body) {
    if (error) {
      console.log('request error : ' + error);
      post_status.fail++;
    } else {
      if (response.statusCode === 400) {
        console.dir(body);
      }
    }
    callback();
  });
}, send_queue_concurrency);

q.drain = done;
send_queue_concurrency is the primary lever for controlling request pressure.
I'm pushing work into the queue with a file-parsing routine:
rl.on('line', function (line) {
  line_count++;
  try {
    var line_object = JSON.parse(line);
    var req_body = line_object.body.body;
    q.push({ req_body: req_body, line_object: line_object }, function (err) {
      if (err) {
        console.log('queue error! ' + JSON.stringify(err));
      }
    });
  } catch (e) {
    console.dir(e);
  }
});
var done = function () {
  // print out some reporting stats...
  // console.log('xxxxxx');
  console.log('\ndone.');
  process.exit(0);
};

Fetch 100 zip files in node

So I'm trying to fetch a bunch of files from a server. The current code is basically as follows.
var http = require('http');
var fs = require('fs');

var arr = [{ id: 'fileOne' }, { id: 'fileTwo' }, { id: 'fileThree' }, ...];

function fetchData() {
  for (var i = 0; i < arr.length; i++) {
    var file = fs.createWriteStream("../path/file.zip");
    var request = http.get("url/AFG_adm.zip", function (response) {
      response.pipe(file);
    });
  }
}
I don't think this is the best approach; I'm trying to figure out how to handle errors and how to make sure that a file finishes downloading before the next iteration... Any help is much appreciated.
You should use the async module for handling the async part; the request module will also save you a lot of effort.
You can handle this in many ways, using either async.cargo or async.map.
The idea is to group up a series of things and then act on each of them, but in an async way.
So a basic .map over an array of files to download would look like this:
// required modules
var fs = require('fs');
var async = require('async');
var request = require('request');

// array of urls
var URLs = ['hxxp://...ZipFile1.zip', 'hxxp://...ZipFile2.zip'];

// destination directory
var destinationDirectory = 'downloads';

// asyncDownload function
function asyncDownload(url, callback) {
  // get filename (everything after the last "/")
  var filename = url.substring(url.lastIndexOf("/") + 1);
  // create write stream
  var stream = fs.createWriteStream(destinationDirectory + "/" + filename);
  // listen for open event to start request and pipe
  stream.on('open', function () {
    request(url).pipe(stream);
  });
  // when finished, call callback
  stream.on('finish', function () {
    callback(null, destinationDirectory + "/" + filename);
  });
}

async.map(URLs, asyncDownload, function (err, results) {
  console.log(results);
});
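One gap worth noting, since the question also asks about error handling: asyncDownload above never reports request or file-system failures, so a single bad URL would either go unnoticed or crash on an unhandled 'error' event. A rough sketch of the same function with basic error reporting added, under the same placeholder URLs and directory:
function asyncDownloadSafe(url, callback) {
  var filename = url.substring(url.lastIndexOf("/") + 1);
  var stream = fs.createWriteStream(destinationDirectory + "/" + filename);
  var finished = false;

  // make sure the async.map callback fires exactly once per URL
  function finish(err, result) {
    if (finished) return;
    finished = true;
    callback(err, result);
  }

  request(url)
    .on('error', finish) // network / DNS errors
    .pipe(stream);

  stream.on('error', finish); // file-system errors
  stream.on('finish', function () {
    finish(null, destinationDirectory + "/" + filename);
  });
}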
