Fetch 100 zip files in node - node.js

So I'm trying to fetch a bunch of files from a server. The current code is basically as follows.
var http = require('http');
var fs = require('fs');

var arr = [{id: 'fileOne'}, {id: 'fileTwo'}, {id: 'fileThree'}, ....];

function fetchData() {
    for (var i = 0; i < arr.length; i++) {
        var file = fs.createWriteStream("../path/file.zip");
        var request = http.get("url/AFG_adm.zip", function (response) {
            response.pipe(file);
        });
    }
}
I don't think this is the best approach; I'm trying to figure out how to handle errors and how to make sure a file finishes loading before the next iteration... Any help is much appreciated.

You should use the async module to handle the asynchronous part; the request module will also save you a lot of effort.
You can handle this in many ways, using either async.cargo or async.map.
The idea is to group a thing or a series of things and then act on them however you want, but asynchronously.
So a basic .map over an array of files to download would look like this:
// required modules
var async = require('async');
var request = require('request');
var fs = require('fs');

// array of urls
var URLs = ['hxxp://...ZipFile1.zip', 'hxxp://...ZipFile2.zip'];

// destination directory
var destinationDirectory = 'downloads';

// asyncDownload function
function asyncDownload(url, callback) {
    // get filename (everything after the last "/")
    var filename = url.substring(url.lastIndexOf("/") + 1);

    // create write stream
    var stream = fs.createWriteStream(destinationDirectory + "/" + filename);

    // listen for the open event to start the request and pipe
    stream.on('open', function () {
        request(url).pipe(stream);
    });

    // when finished, call the callback
    stream.on('finish', function () {
        callback(null, destinationDirectory + "/" + filename);
    });
}

async.map(URLs, asyncDownload, function (err, results) {
    console.log(results);
});
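Since the original question involves on the order of 100 files, you may also want to cap how many downloads run at once. A minimal sketch (not from the original answer), reusing the same asyncDownload function with async.mapLimit; the concurrency limit of 5 is arbitrary:

// download at most 5 files at a time; adjust the limit to taste
async.mapLimit(URLs, 5, asyncDownload, function (err, results) {
    if (err) {
        return console.error('A download failed:', err);
    }
    console.log(results);
});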

Related

How to disconnect a socket after streaming data?

I am making use of "socket.io-client" and "socket.io-stream" to make a request and then stream some data. I have the following code that handles this logic.
Client Server Logic
router.get('/writeData', function(req, res) {
    var io = req.app.get('socketio');
    var nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
    var nameNodeData = {};

    async.waterfall([
        checkForDataNodes,
        readFileFromS3
    ], function(err, result) {
        if (err !== null) {
            res.json(err);
        } else {
            res.json("Finished Writing to DN's");
        }
    });

    function checkForDataNodes(cb) {
        nameNodeSocket.on('nameNodeData', function(data) {
            nameNodeData = data;
            console.log(nameNodeData);
            cb(null, nameNodeData);
        });
        if (nameNodeData.numDataNodes === 0) {
            cb("No datanodes found");
        }
    }

    function readFileFromS3(nameNodeData, cb) {
        for (var i in nameNodeData['blockToDataNodes']) {
            var IP = nameNodeData['blockToDataNodes'][i]['ipValue'];
            var dataNodeSocket = io.connect('http://' + IP + ":5000");
            var ss = require("socket.io-stream");
            var stream = ss.createStream();
            var byteStartRange = nameNodeData['blockToDataNodes'][i]['byteStart'];
            var byteStopRange = nameNodeData['blockToDataNodes'][i]['byteStop'];
            paramsWithRange['Range'] = "bytes=" + byteStartRange.toString() + "-" + byteStopRange.toString();
            //var file = require('fs').createWriteStream('testFile' + i + '.txt');
            var getFileName = nameNodeData['blockToDataNodes'][i]['key'].split('/');
            var fileData = {
                'mainFile': paramsWithRange['Key'].split('/')[1],
                'blockName': getFileName[1]
            };
            ss(dataNodeSocket).emit('sendData', stream, fileData);
            s3.getObject(paramsWithRange).createReadStream().pipe(stream);
            //dataNodeSocket.disconnect();
        }
        cb(null);
    }
});
Server Logic (that gets the data)
var dataNodeIO = require('socket.io')(server);
var ss = require("socket.io-stream");

dataNodeIO.on('connection', function(socket) {
    console.log("Successfully connected!");
    ss(socket).on('sendData', function(stream, data) {
        var IP = data['ipValue'];
        var blockName = data['blockName'];
        var mainFile = data['mainFile'];

        dataNode.makeDir(mainFile);
        dataNode.addToReport(mainFile, blockName);
        stream.pipe(fs.createWriteStream(mainFile + '/' + blockName));
    });
});
How can I properly disconnect the connections in the readFileFromS3 function? I have noticed that calling dataNodeSocket.disconnect() at the end does not work, as I then cannot verify that the data was received on the second server. But if I comment it out, I can see the data being streamed to the second server.
My objective is to close the connections on the client-server side.
It appears that the main problem with closing the socket is that you weren't waiting for the stream to finish writing before trying to close the socket. Because the writing is all asynchronous and finishes some time later, you were trying to close the socket before the data had been written.
Also, because you were putting asynchronous operations inside a for loop, you were running all your operations in parallel, which may not be exactly what you want, as it makes error handling more difficult and server load harder to manage.
Here's the code I would suggest that does the following:
1. Create a function streamFileFromS3() that streams a single file and returns a promise that will notify when it's done.
2. Use await in a for loop with that streamFileFromS3() to serialize the operations. You don't have to serialize them, but then you would have to change your error handling to figure out what to do if one errors while the others are already running, and you'd have to be more careful about concurrency issues.
3. Use try/catch to catch any errors from streamFileFromS3().
4. Add error handling on the stream.
5. Change all occurrences of data['propertyName'] to data.propertyName. The only time you need to use brackets is if the property name contains a character that is not allowed in a JavaScript identifier or if the property name is held in a variable; otherwise, dot notation is preferred (see the short example after this list).
6. Add socket.io connection error handling logic for both socket.io connections.
7. Set the returned status to 500 when there's an error processing the request.
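As a quick aside, here is a tiny illustration of the bracket-vs-dot rule (the object and property names are made up):

const data = { ipValue: '10.0.0.5', 'block-name': 'b1' };

console.log(data.ipValue);        // dot notation: the name is a valid identifier
console.log(data['block-name']);  // brackets required: the name contains a hyphen
const prop = 'ipValue';
console.log(data[prop]);          // brackets required: the name is held in a variable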
So, here's the code for that:
const ss = require("socket.io-stream");
router.get('/writeData', function(req, res) {
const io = req.app.get('socketio');
function streamFileFromS3(ip, data) {
return new Promise((resolve, reject) => {
const dataNodeSocket = io.connect(`http://${ip}:5000`);
dataNodeSocket.on('connect_error', reject);
dataNodeSocket.on('connect_timeout', () {
reject(new Error(`timeout connecting to http://${ip}:5000`));
});
dataNodeSocket.on('connection', () => {
// dataNodeSocket connected now
const stream = ss.createStream().on('error', reject);
paramsWithRange.Range = `bytes=${data.byteStart}-${data.byteStop}`;
const filename = data.key.split('/')[1];
const fileData = {
'mainFile': paramsWithRange.Key.split('/')[1],
'blockName': filename
};
ss(dataNodeSocket).emit('sendData', stream, fileData);
// get S3 data and pipe it to the socket.io stream
s3.getObject(paramsWithRange).createReadStream().on('error', reject).pipe(stream);
stream.on('close', () => {
dataNodeSocket.disconnect();
resolve();
});
});
});
}
function connectError(msg) {
res.status(500).send(`Error connecting to ${NAMENODE_ADDRESS}`);
}
const nameNodeSocket = io.connect(NAMENODE_ADDRESS, { reconnect: true });
nameNodeSocket.on('connect_error', connectError).on('connect_timeout', connectError);
nameNodeSocket.on('nameNodeData', async (nameNodeData) => {
try {
for (let item of nameNodeData.blockToDataNodes) {
await streamFileFromS3(item.ipValue, item);
}
res.json("Finished Writing to DN's");
} catch(e) {
res.status(500).json(e);
}
});
});
Other notes:
I don't know what paramsWithRange is, as it is not declared here. When you were doing everything in parallel, it was shared among all the connections, which is asking for a concurrency issue. In my serialized implementation it's probably safe to share, but the way it is used now is a concurrency issue waiting to happen.
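If paramsWithRange turns out to be a plain S3 getObject params object carrying Bucket and Key (that is an assumption, since it isn't declared in the posted code), one way to avoid sharing it at all would be to build a fresh copy per call. A rough sketch:

function buildS3Params(baseParams, data) {
    // copy the shared base params so each call works on its own object
    // (baseParams is assumed to hold Bucket and Key; adjust as needed)
    return Object.assign({}, baseParams, {
        Range: `bytes=${data.byteStart}-${data.byteStop}`
    });
}

// inside streamFileFromS3(), instead of mutating the shared object:
// const params = buildS3Params(paramsWithRange, data);
// s3.getObject(params).createReadStream().on('error', reject).pipe(stream);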

How to Synchronize the file writes in Node.Js

I am using EJS's compile() to create notification templates, and I would like to know how to write the files to the file system in parallel and send the notification once all the files are saved.
Please see the below code snippet which I used
var fs = require('fs');
var ejs = require('ejs');

var arrayOfData = [someData]; // Prepare data from database

// Iterate through the data
for (var i = 0; i < arrayOfData.length; i++) {
    generateFileFromTemplate(arrayOfData[i], function() {});
}

function generateFileFromTemplate(templateData, callback) {
    var outputFile = templateData.Id + ".html";
    var compiled = ejs.compile(fs.readFileSync('email-template.ejs', 'utf8'));
    var html = compiled(templateData);
    fs.writeFile(outputFile, html, callback);
}
Please help.
Use async.each for your use case
async.each(arrayOfData,
    function(ele, next) {
        // pass `next` through so async.each knows when this file has been written
        generateFileFromTemplate(ele, next);
    },
    function(err) {
        if (err) console.log('err', err);
        sendNotification();
    }
);
You can use a great utility library called Async, particularly its parallel method: https://github.com/caolan/async#parallel.
Here's an example:
var async = require('async');
/*-------------*/
var tasks = arrayOfData.map(function(data) {
    return function(cb) {
        // only call cb once the file has actually been written
        generateFileFromTemplate(data, cb);
    };
});

async.parallel(tasks, function(err) {
    console.log('My job is done');
});
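If you are on a Node version that has util.promisify (v8+), here is a sketch of the same idea with plain Promises, assuming generateFileFromTemplate keeps the (data, callback) signature from the question (its callback is fs.writeFile's error-first callback, so promisify applies):

const util = require('util');
const generateFileAsync = util.promisify(generateFileFromTemplate);

Promise.all(arrayOfData.map(function(data) {
    return generateFileAsync(data);
})).then(function() {
    // every file has been written
    sendNotification();
}).catch(function(err) {
    console.log('err', err);
});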

Node JS Promises not working on local server

I created "small" server that shoud use NodeJs and promises to copy and compres files of given type from given directory. It had to be separeted, each in promise - reading files and packing them. First I created just those promises to check if they would work, and they did:
function packFile(file) {
    var fileName = path.basename(file);
    console.log(' (+) Add to the archive : ' + fileName + ' [' + file + ']');
    archive.append(fs.createReadStream(file), { name: fileName });
}

Q.nfcall(recursive, pathGiven, filterGiven)
    .then(function(data) {
        var chain = Q.when();
        for (var i = 0; i < data.length; i++) {
            console.log(' > ' + data[i] + ' > adding to chain ...');
            chain = chain.then(packFile.bind(this, data[i]));
        }
        chain.then(function() {
            archive.finalize();
        });
    });
Of course, in front of the previously presented code I added:
fs = require('fs');
archiver = require('archiver');
Q = require('q');
recursive = require('recursive-readdir');
path = require('path');
All of those were installed and working fine; "archive" is also correctly initialized at the beginning.
Since all of that worked, it was time to get on with the server:
//Lets require/import the HTTP module
var http = require('http');
//Lets define a port we want to listen to
const PORT=8080;
//Create a server
var server = http.createServer(handleRequest);
//Lets start our server
server.listen(PORT, function(){
console.log("Server listening on: http://localhost:%s", PORT);
});
Function "handleRequest" is quite big, so to get just the gist of it i will place part where is shoud handle those promises:
switch (request.url) {
    case '/formhandler':
        if (request.method == 'POST') {
            // (...) Got my "Sour" (path to source directory) and "Mask"

            // Here I tried to place that promise chain, but the server just skipped it and continued
            Q.nfcall(recursive, Sour, Mask)
                .then(function(data) {
                    var chain = Q.when();
                    for (var i = 0; i < data.length; i++) {
                        console.log(' > ' + data[i] + ' > adding to chain ...');
                        chain = chain.then(packFile.bind(this, data[i]));
                    }
                    chain.then(function() {
                        console.log("whole chain resolved");
                        console.log('FINISHING');
                        archive.finalize();
                    });
                });

            // (...) Finishing this request
        }
        break;
}
Nothing I do seems to work as long as I use those promises (calling them like that, or inside some function). I have no idea why.
EDIT 1
What I mean by "skipped": I placed two console.log calls, one before and one after the promises in that handler. I should see:
LOG1
TEXT_FROM_PROMISES //Something those promises send to console.log
LOG2
But there was only:
LOG1
LOG2
And server was ready to get another request.
EDIT 2
Some people suggested in the comments that I change packFile so it would return a promise, so I did:
function packFile(file) {
    return new Promise(function(resolve, reject) {
        var fileName = path.basename(file);
        console.log(' (+) Adding to the archive : ' + fileName + ' [' + file + ']');
        archive.append(fs.createReadStream(file), { name: fileName });
    });
}
The rest of the code is unchanged. The result: now ONLY the FIRST file in the chain is processed (if I call it "alone" in a script, without the server), and it still won't work on the server.
EDIT 3
I changed 'packFile' again:
function packFile(file) {
    return new Promise(function(resolve, reject) {
        var fileName = path.basename(file);
        console.log(' (+) Adding to the archive : ' + fileName + ' [' + file + ']');
        archive.append(fs.createReadStream(file), { name: fileName });
        resolve("Success!");
    });
}
Now the whole chain works in the no-server version, but still nothing happens on the server.
EDIT 4
I changed
Q.nfcall(recursive, pathGiven, filterGiven)
    .then(function(data) {
        var chain = Q.when();
        for (var i = 0; i < data.length; i++) {
            console.log(' > ' + data[i] + ' > adding to chain ...');
            chain = chain.then(packFile.bind(this, data[i]));
        }
        chain.then(function() {
            console.log("whole chain resolved");
            archive.finalize();
        });
    }, function(reason) {
        console.log("ERROR:" + reason); // Error!
    });
and got some new error in console:
ERROR:TypeError: ignores.map is not a function
EDIT 5
Here is the code I use for the archive, just to create and open the .zip that my packFile adds files to:
archive = archiver.create('zip', {});
var output = fs.createWriteStream('./output.zip');
archive.pipe(output);
EDIT 6
Since there seems to be a need for more code, here is the function I call on my server whenever I receive a proper request (see the beginning of this post):
function handleWithPromises(pathGiven, filterGiven) {
    console.log(' > Promising ...');

    archive = archiver.create('zip', {});
    var output = fs.createWriteStream('./output.zip');
    archive.pipe(output);

    function packFile(file) {
        return new Promise(function(resolve, reject) {
            var fileName = path.basename(file);
            console.log(' (+) Adding to the archive : ' + fileName + ' [' + file + ']');
            archive.append(fs.createReadStream(file), { name: fileName });
            resolve("Success!");
        });
    }

    Q.nfcall(recursive, pathGiven, filterGiven)
        .then(function(data) {
            var chain = Q.when();
            for (var i = 0; i < data.length; i++) {
                console.log(' > ' + data[i] + ' > adding to chain ...');
                chain = chain.then(packFile.bind(this, data[i]));
            }
            chain.then(function() {
                console.log('FINISHING');
                archive.finalize();
            });
        });
}
and code that is placed at the very beginning of the script:
fs = require('fs');
archiver = require('archiver');
Q = require('q');
recursive = require('recursive-readdir');
path = require('path');
Go back to Edit 1 where you expected to see
LOG1
TEXT_FROM_PROMISES //Something those promises send to console.log
LOG2
but got
LOG1
LOG2
The explanation is that Promises/A+ compliant promises hand the functions supplied to then() to the event loop scheduler, so they run on their own turn of the event loop, after the code that called then() has run to completion.
Chained promises execute asynchronously to the code that defines them.
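A minimal, standalone illustration of that ordering (not your code, just the scheduling behaviour):

console.log('LOG1');

Promise.resolve().then(function() {
    // runs only after the currently executing code has finished
    console.log('TEXT_FROM_PROMISES');
});

console.log('LOG2');
// prints: LOG1, LOG2, TEXT_FROM_PROMISES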
On a very simplistic level, you will need to postpone ending the request until all files have been packed, by making sure response is in the scope of the last anonymous function and ending the response there:
chain.then(function() {
    console.log('FINISHING');
    archive.finalize();
    response.write(...) // are you sending the archive back?
    response.end();     // now end the request
});
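If the intent is to send the zip back to the client rather than only writing output.zip to disk, one option is to pipe the archiver output straight into the response. A rough sketch under that assumption (same archive object as above); note that pipe() ends the response for you when the archive finishes:

// set up once per request, before appending any files
response.writeHead(200, {
    'Content-Type': 'application/zip',
    'Content-Disposition': 'attachment; filename="output.zip"'
});
archive.pipe(response); // the response is ended automatically when the archive ends

// ... append files as before, then:
chain.then(function() {
    archive.finalize(); // tells archiver (and therefore the response) to finish
});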

Node.js Download File Using Content Disposition as Filename

I'm using the Request module to download files, but I'm not quite sure how to pipe the response to an output stream when the filename must come from the 'Content-Disposition' header. So basically, I need to read the response until the header is found, and then pipe the rest to that filename.
The examples show something like:
request('http://google.com/doodle.png').pipe(fs.createWriteStream('doodle.png'));
Whereas what I want to do is (pseudocode):
var req = request('http://example.com/download_latest_version?token=XXX');
var filename = req.response.headers['Content-Disposition'];
req.pipe(fs.createWriteStream(filename));
I could get the filename using the Request callback:
request(url, function(err, res, body) {
    // get res headers here
});
But wouldn't that negate the benefits of using pipe and not loading the downloaded file into memory?
I'm requesting an image from Yahoo, and it isn't using the content-disposition header, but I am extracting the date and content-type headers to construct a filename. This seems close enough to what you're trying to do...
var request = require('request'),
    fs = require('fs');

var url2 = 'http://l4.yimg.com/nn/fp/rsz/112113/images/smush/aaroncarter_635x250_1385060042.jpg';
var r = request(url2);

r.on('response', function (res) {
    res.pipe(fs.createWriteStream('./' + res.headers.date + '.' + res.headers['content-type'].split('/')[1]));
});
Ignore my image choice please :)
This question has been around a while, but today I faced the same problem and solved it differently:
var Request = require('request'),
    Fs = require('fs');

// RegExp to extract the filename from Content-Disposition
var regexp = /filename=\"(.*)\"/gi;

// initiate the download
var req = Request.get('url.to/somewhere')
    .on('response', function(res) {
        // extract filename
        var filename = regexp.exec(res.headers['content-disposition'])[1];

        // create file write stream
        var fws = Fs.createWriteStream('/some/path/' + filename);

        // setup piping
        res.pipe(fws);

        res.on('end', function() {
            // go on with processing
        });
    });
Here's my solution:
var fs = require('fs');
var request = require('request');
var through2 = require('through2');

var req = request(url);

req.on('error', function (e) {
    // Handle connection errors
    console.log(e);
});

var bufferedResponse = req.pipe(through2(function (chunk, enc, callback) {
    this.push(chunk);
    callback();
}));

req.on('response', function (res) {
    if (res.statusCode === 200) {
        try {
            var contentDisposition = res.headers['content-disposition'];
            var match = contentDisposition && contentDisposition.match(/(filename=|filename\*='')(.*)$/);
            var filename = match && match[2] || 'default-filename.out';
            var dest = fs.createWriteStream(filename);

            dest.on('error', function (e) {
                // Handle write errors
                console.log(e);
            });

            dest.on('finish', function () {
                // The file has been downloaded
                console.log('Downloaded ' + filename);
            });

            bufferedResponse.pipe(dest);
        } catch (e) {
            // Handle request errors
            console.log(e);
        }
    } else {
        // Handle HTTP server errors
        console.log(res.statusCode);
    }
});
The other solutions posted here use res.pipe, which can fail if the content is transferred using gzip encoding, because the response stream contains the raw (compressed) HTTP data. To avoid this problem you have to use request.pipe instead. (See the second example at https://github.com/request/request#examples.)
When using request.pipe I was getting an error: "You cannot pipe after data has been emitted from the response.", because I was doing some async stuff before actually piping (creating a directory to hold the downloaded file). I also had some problems where the file was being written with no content, which might have been due to request reading the HTTP response and buffering it.
So I ended up creating an intermediate buffering stream with through2, so that I could pipe the request to it before the response handler fires, then later piping from the buffering stream into the file stream once the filename is known.
Finally, I'm parsing the content disposition header whether the filename is encoded in plain form or in UTF-8 form using the filename*=''file.txt syntax.
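For reference, here is a quick standalone check of that parsing logic; the header values below are invented for illustration:

var parseFilename = function (contentDisposition) {
    var match = contentDisposition && contentDisposition.match(/(filename=|filename\*='')(.*)$/);
    return (match && match[2]) || 'default-filename.out';
};

console.log(parseFilename("attachment; filename=report.zip"));              // report.zip
console.log(parseFilename("attachment; filename*=''r%C3%A9sum%C3%A9.zip")); // r%C3%A9sum%C3%A9.zip (still percent-encoded)
console.log(parseFilename(undefined));                                      // default-filename.out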
I hope this helps someone else who experiences the same issues that I had.

Piping multiple file streams using Node.js

I want to stream multiple files, one after each other, to the browser. To illustrate, think of having multiple CSS files which shall be delivered concatenated as one.
The code I am using is:
var directory = path.join(__dirname, 'css');

fs.readdir(directory, function (err, files) {
    async.eachSeries(files, function (file, callback) {
        if (!endsWith(file, '.css')) { return callback(); } // (1)

        var currentFile = path.join(directory, file);
        fs.stat(currentFile, function (err, stats) {
            if (stats.isDirectory()) { return callback(); } // (2)

            var stream = fs.createReadStream(currentFile).on('end', function () {
                callback(); // (3)
            });
            stream.pipe(res, { end: false }); // (4)
        });
    }, function () {
        res.end(); // (5)
    });
});
The idea is that I
1. filter out all files that do not have the file extension .css,
2. filter out all directories,
3. proceed with the next file once a file has been read completely,
4. pipe each file to the response stream without closing it,
5. end the response stream once all files have been piped.
The problem is that only the first .css file gets piped, and all remaining files are missing. It's as if (3) would directly jump to (5) after the first (4).
The interesting thing is that if I replace line (4) with
stream.on('data', function (data) {
    console.log(data.toString('utf8'));
});
everything works as expected: I see multiple files. If I then change this code to
stream.on('data', function (data) {
    res.write(data.toString('utf8'));
});
all files except the first are missing again.
What am I doing wrong?
PS: The error happens using Node.js 0.8.7 as well as using 0.8.22.
UPDATE
Okay, it works if you change the code as follows:
var directory = path.join(__dirname, 'css');

fs.readdir(directory, function (err, files) {
    var concatenated = '';
    async.eachSeries(files, function (file, callback) {
        if (!endsWith(file, '.css')) { return callback(); }

        var currentFile = path.join(directory, file);
        fs.stat(currentFile, function (err, stats) {
            if (stats.isDirectory()) { return callback(); }

            var stream = fs.createReadStream(currentFile).on('end', function () {
                callback();
            }).on('data', function (data) { concatenated += data.toString('utf8'); });
        });
    }, function () {
        res.write(concatenated);
        res.end();
    });
});
But: Why? Why can't I call res.write multiple times instead of first summing up all the chunks, and then write them all at once?
Consider also using multistream, which allows you to combine and emit multiple streams one after another.
The code was perfectly fine; it was the unit test that was wrong ...
Fixed that, and now it works like a charm :-)
May help someone else:
const fs = require("fs");
const pth = require("path");
let readerStream1 = fs.createReadStream(pth.join(__dirname, "a.txt"));
let readerStream2 = fs.createReadStream(pth.join(__dirname, "b.txt"));
let writerStream = fs.createWriteStream(pth.join(__dirname, "c.txt"));
//only readable streams have "pipe" method
readerStream1.pipe(writerStream);
readerStream2.pipe(writerStream);
I also checked Rocco's answer and it's working like a charm:
//npm i --save multistream
const multi = require('multistream');
const fs = require('fs');
const pth = require("path");
let streams = [
fs.createReadStream(pth.join(__dirname, "a.txt")),
fs.createReadStream(pth.join(__dirname, "b.txt"))
];
let writerStream = fs.createWriteStream(pth.join(__dirname, "c.txt"));
//new multi(streams).pipe(process.stdout);
new multi(streams).pipe(writerStream);
and to send the results to the client:
const multi = require('multistream');
const fs = require('fs');
const pth = require("path");
const exp = require("express");

const app = exp();
app.listen(3000);

app.get("/stream", (q, r) => {
    new multi([
        fs.createReadStream(pth.join(__dirname, "a.txt")),
        fs.createReadStream(pth.join(__dirname, "b.txt"))
    ]).pipe(r);
});
