Node.js: download multiple files

I need to download ~26k images. The image names and URLs are stored in a CSV file. I'm reading the CSV file and trying to download the images while looping through the list.
If I use a small set (~1-2k) it works fine, but when I switch to the full set I get an EMFILE error:
Error: EMFILE, open 'S:\images_download\Images\189900008.jpg'
I've noticed that Node tries to create all the files at once, and this might be the issue, but I'm unable to force it to create them one by one. My understanding is that the code below should work like that, but obviously it doesn't.
(Just to mention that this code is executed on Windows.)
Code:
var csv = require("fast-csv");
var fs = require('fs');
var request = require('request');
var async = require('async');

fs.writeFile('errors.txt', '', function(){});

var downloaded = 0;
var totalImages = 0;
var files = [];

csv
  .fromPath("Device_Images_List.csv")
  .on("data", function(data){
    files.push({device: data[0], url: data[1]});
  })
  .on("end", function(){
    totalImages = files.length;
    async.each(files, function(file, callback) {
      var deviceId = file.device;
      var deviceUrl = file.url;
      if ( deviceId != 'DEVICE_TYPE_KEY' ) {
        try {
          writeStream = fs.createWriteStream('./Images/' + deviceId + '.jpg');
          proxiedRequest = request.defaults({proxy: "http://proxy:8080"});
          proxiedRequest(deviceUrl).pipe(writeStream);
          writeStream.on('open', function(fd) {
            var rem = proxiedRequest.get(deviceUrl);
            rem.on('data', function(chunk) {
              writeStream.write(chunk);
            });
            rem.on('end', function() {
              downloaded++;
              console.log('Downloaded: ' + deviceId + '; ' + (downloaded + 1) + ' of ' + totalImages);
              writeStream.end();
            });
          });
          writeStream.on('close', function(){
            callback();
          });
        } catch (ex) {
          fs.appendFile('errors.txt', deviceId + ' failed to download', function (err) {
            callback();
          });
        }
      }
    }, function(err){
      if( err ) {
        console.log(err);
      } else {
      }
    });
  });

As @slebetman commented, the issue can be solved by using async.eachSeries to process the files one by one, or async.eachLimit to limit the number of parallel downloads:
async.eachLimit(files, 5, function(file, callback) {
  // ... process at most 5 files at the same time
}, function(err){
});
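For illustration, here is a minimal sketch of the download loop rewritten with async.eachLimit, letting request stream straight into the write stream and calling the iteration callback only once the file has been fully written. The proxy URL and the limit of 5 are placeholders carried over from the question:
var proxiedRequest = request.defaults({ proxy: "http://proxy:8080" }); // placeholder proxy from the question

async.eachLimit(files, 5, function(file, callback) {
  if (file.device === 'DEVICE_TYPE_KEY') { // skip the CSV header row
    return callback();
  }
  var writeStream = fs.createWriteStream('./Images/' + file.device + '.jpg');
  writeStream.on('finish', function() {    // the file is fully flushed to disk
    downloaded++;
    console.log('Downloaded: ' + file.device + '; ' + downloaded + ' of ' + totalImages);
    callback();
  });
  proxiedRequest(file.url)
    .on('error', function() {              // network error: record it and move on
      fs.appendFile('errors.txt', file.device + ' failed to download\n', function() {
        callback();
      });
    })
    .pipe(writeStream);
}, function(err) {
  if (err) console.log(err);
});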

Related

Node.js function not running in order. Error: Unhandled stream error in pipe

I updated the function to create the CSV file but now I'm getting an error:
In upload function
internal/streams/legacy.js:57
throw er; // Unhandled stream error in pipe.
^
Error: ENOENT: no such file or directory, open 'C:\Users\shiv\WebstormProjects\slackAPIProject\billingData\CSV\1548963844106output.csv'
var csvFilePath = '';
var JSONFilePath = '';

function sendBillingData(){
  var message = '';
  axios.get(url, {
    params: {
      token: myToken
    }
  }).then(function (response) {
    message = response.data;
    fields = billingDataFields;
    // saveFiles(message, fields, 'billingData/');
    saveFilesNew(message, fields, 'billingData/');
    var file = fs.createReadStream(__dirname + '/' + csvFilePath); // <-- make sure this path is correct
    console.log(__dirname + '/' + csvFilePath);
    uploadFile(file);
  })
  .catch(function (error) {
    console.log(error);
  });
}
The saveFilesNew function is:
function saveFilesNew(message, options, folder){
  try {
    const passedData = message;
    var relevantData = '';
    if (folder == 'accessLogs/'){
      const loginsJSON = message.logins;
      relevantData = loginsJSON;
      console.log(loginsJSON);
    }
    if (folder == 'billingData/'){
      relevantData = passedData.members;
      const profile = passedData.members[0].profile;
    }
    // Save JSON to the output folder
    var date = Date.now();
    var directoryPath = folder + 'JSON/' + date + "output";
    JSONFilePath = directoryPath + '.json';
    fs.writeFileSync(JSONFilePath, JSON.stringify(message, null, 4), function(err) {
      if (err) {
        console.log(err);
      }
    });
    // Parse JSON onto the CSV
    const json2csvParser = new Json2csvParser({ fields });
    const csv = json2csvParser.parse(relevantData);
    // console.log(csv);
    // Function to process the CSV onto the file
    var directoryPath = folder + 'CSV/' + date + "output";
    csvFilePath = directoryPath + '.csv';
    let data = [];
    let columns = {
      real_name: 'real_name',
      display_name: 'display_name',
      email: 'email',
      account_type: 'account_type'
    };
    var id = passedData.members[0].real_name;
    console.log(id);
    console.log("messageLength is" + Object.keys(message.members).length);
    for (var i = 0; i < Object.keys(message.members).length; i++) {
      console.log("value of i is" + i);
      var display_name = passedData.members[i].profile.display_name;
      var real_name = passedData.members[i].profile.real_name_normalized;
      var email = passedData.members[i].profile.email;
      var account_type = 'undefined';
      console.log("name: " + real_name);
      if (passedData.members[i].is_owner){
        account_type = 'Org Owner';
      }
      else if (passedData.members[i].is_admin){
        account_type = 'Org Admin';
      }
      else if (passedData.members[i].is_bot){
        account_type = 'Bot';
      }
      else account_type = 'User';
      data.push([real_name, display_name, email, account_type]);
    }
    console.log(data);
    stringify(data, { header: true, columns: columns }, (err, output) => {
      if (err) throw err;
      fs.writeFileSync(csvFilePath, output, function(err) {
        console.log(output);
        if (err) {
          console.log(err);
        }
        console.log('my.csv saved.');
      });
    });
  } catch (err) {
    console.error(err);
  }
}
The upload file function is:
function uploadFile(file){
  console.log("In upload function");
  const form = new FormData();
  form.append('token', botToken);
  form.append('channels', 'testing');
  form.append('file', file);
  axios.post('https://slack.com/api/files.upload', form, {
    headers: form.getHeaders()
  }).then(function (response) {
    var serverMessage = response.data;
    console.log(serverMessage);
  });
}
So I think the error is caused because Node is trying to upload the file before it has been created. I feel like this has something to do with the asynchronous nature of Node.js, but I can't work out how to fix the code. Please let me know how to correct this, and mention any improvements to the code structure/design too.
Thanks!
You don't wait for the callback provided to stringify to be executed, and that's where you create the file. (Assuming this stringify function really does accept a callback.)
Using callbacks (you can make this cleaner with promises and the neat async/await syntax, but let's stick to callbacks here), it should look more like this:
function sendBillingData() {
  ...
  // this callback we'll use to know when the file writing is done, and to get the file path
  saveFilesNew(message, fields, 'billingData/', function(err, csvFilePathArgument) {
    // this we will execute when saveFilesNew calls it, not when saveFilesNew returns, see below
    uploadFile(fs.createReadStream(__dirname + '/' + csvFilePathArgument));
  });
}

// let's name this callback... "callback".
function saveFilesNew(message, options, folder, callback) {
  ...
  var csvFilePath = ...; // local variable only instead of your global
  ...
  stringify(data, { header: true, columns: columns }, (err, output) => {
    if (err) throw err; // or return callback(err);
    fs.writeFile(csvFilePath, output, function(err) { // NOT writeFileSync, or no callback needed
      console.log(output);
      if (err) {
        console.log(err);
        // callback(err); may be a useful approach for error-handling at a higher level
      }
      console.log('my.csv saved.'); // yes, NOW the CSV is saved, not before this executes! Hence:
      callback(null, csvFilePath); // no error, clean process, pass the file path
    });
  });
  console.log("This line is executed before stringify's callback is called!");
  return; // implicitly, yes, yet still synchronous and that's why your version crashes
}
Using callbacks that are called only when the expected events happen (a file is done writing, a buffer/string is done transforming, ...) allows JS to keep executing code in the meantime. And it does keep executing code, so when you need data from asynchronous code, you need to tell JS you need it done before executing the next piece.
Also, since you can pass data when calling back (it's just a function), here I could avoid relying on a global csvFilePath. Using higher-level variables makes things monolithic; for example, you could not move saveFilesNew to a dedicated file where you keep your toolkit of file-related functions.
Finally, if your global process is like:
function aDayAtTheOffice() {
sendBillingData();
getCoffee();
}
then you don't need to wait for the billing data to be processed before starting to make coffee. However, if your boss told you that you could NOT get a coffee until the billing data was settled, then your process would look like:
function aDayAtTheOffice() {
sendBillingData(function (err) {
// if (err) let's do nothing here: you wanted a coffee anyway, right?
getCoffee();
});
}
(Note that callbacks having potential error as first arg and data as second arg is a convention, nothing mandatory.)
IMHO you should read about scope (the argument callback could be accessed at a time when the call to saveFilesNew was already done and forgotten!), and about the asynchronous nature of No... JavaScript. ;) (Sorry, probably not the best links, but they contain the meaningful keywords, and then Google is your buddy, your friend, your Big Brother.)
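As hinted above, the same flow reads more cleanly with promises and async/await. A minimal sketch, assuming Node's fs.promises API and a promisified stringify (via util.promisify), and reusing the names from the question; the async function names are made up:
const util = require('util');
const stringifyAsync = util.promisify(stringify); // the callback-style stringify from the question

async function saveFilesNewAsync(message, options, folder) {
  // ... build `data` and `columns` exactly as in the question ...
  const csvFilePath = folder + 'CSV/' + Date.now() + 'output.csv';
  const output = await stringifyAsync(data, { header: true, columns: columns });
  await fs.promises.writeFile(csvFilePath, output); // resolves only once the CSV is on disk
  return csvFilePath;                               // no global needed
}

async function sendBillingDataAsync() {
  const response = await axios.get(url, { params: { token: myToken } });
  const csvFilePath = await saveFilesNewAsync(response.data, billingDataFields, 'billingData/');
  uploadFile(fs.createReadStream(__dirname + '/' + csvFilePath)); // the file exists at this point
}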

How to synchronize file writes in Node.js

I am using EJS's compile to create notification templates, and I would like to know how to write the files to the file system in parallel and send the notification once all the files are saved.
Please see the below code snippet which I used
var fs = require('fs');
var ejs = require('ejs');

var arrayOfData = [someData]; // prepare data from database

// Iterate through the data
for (var i = 0; i < arrayOfData.length; i++) {
  generateFileFromTemplate(arrayOfData[i], function(){});
}

function generateFileFromTemplate(templateData, callback) {
  var outputFile = templateData.Id + ".html";
  var compiled = ejs.compile(fs.readFileSync('email-template.ejs', 'utf8'));
  var html = compiled(templateData);
  fs.writeFile(outputFile, html, callback);
}
Please help.
Use async.each for your use case. Note that the iteratee has to signal completion, so pass next through to generateFileFromTemplate instead of an empty callback:
async.each(arrayOfData,
  function(ele, next){
    generateFileFromTemplate(ele, next);
  },
  function(err){
    if (err) console.log('err', err);
    sendNotification();
  }
);
You can use a great utility library called Async, particularly its parallel method: https://github.com/caolan/async#parallel.
Here's an example:
var async = require('async');
/*-------------*/
var tasks = arrayOfData.map(function(data) {
  return function(cb) {
    // cb is only called once the file has actually been written
    generateFileFromTemplate(data, cb);
  };
});

async.parallel(tasks, function(err) {
  console.log('My job is done');
});
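For completeness, a minimal sketch of the same idea without the async library, assuming a Node version where util.promisify and Promise.all are available; the promise-returning helper name is made up:
const fs = require('fs');
const util = require('util');
const ejs = require('ejs');
const writeFileAsync = util.promisify(fs.writeFile);

// Promise-returning variant of the question's helper (hypothetical name)
function generateFileFromTemplatePromise(templateData) {
  const compiled = ejs.compile(fs.readFileSync('email-template.ejs', 'utf8'));
  return writeFileAsync(templateData.Id + '.html', compiled(templateData));
}

Promise.all(arrayOfData.map(generateFileFromTemplatePromise))
  .then(function() { sendNotification(); })           // runs once every file is saved
  .catch(function(err) { console.log('err', err); });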

Request makes process out of memory, when downloading big media files

I wrote a simple script to download video files from a CDN, where the direct URLs are simple to generate, for example http://something.com/N.mp4, where N is a number.
The problem is that when downloading files larger than ~300MB, the file appears perfectly on the hard drive, but before the request(...) callback fires, a memory allocation failure happens:
FATAL ERROR: CALL_AND_RETRY_0 Allocation failed - process out of memory
Does this happen because of some serious bad practice? Can request download media files of this size?
Environment: Win7, 4GB+ free RAM, Node v0.10.31
var request = require('request');
var async = require('async');
var fs = require('fs');

var start = +process.argv[2] || 1;
var end = +process.argv[3] || 50;
var url = 'http://something.com/';

try {
  fs.mkdirSync(__dirname + '/videos/');
} catch (e) {}

var index = start;
async.whilst(
  function () { return index <= end; },
  function (callback) {
    var fileName = index + '.mp4';
    console.log('Started: ' + fileName);
    console.time('Done (' + fileName + ')');
    request(url + fileName, function() {
      console.timeEnd('Done (' + fileName + ')');
      index++;
      callback(null);
    }).pipe(fs.createWriteStream(__dirname + '/videos/' + fileName));
  },
  function (err) {
    if (err) {
      return console.error(err);
    }
    console.log('Script finished.');
  }
);
Example console output:
> node index.js 3
Started: 3.mp4
Done (3.mp4): 296592ms
Started: 4.mp4
Done (4.mp4): 369718ms
Started: 5.mp4
FATAL ERROR: CALL_AND_RETRY_0 Allocation failed - process out of memory
If you use the request module with a callback, it buffers the whole response body in memory. Try omitting the callback and using the finish event of the fs write stream instead.
var writer = fs.createWriteStream(__dirname + '/videos/' + fileName);
writer.on('finish', function() {
  // ...
  index++;
  callback(null);
});
request(url + fileName).pipe(writer);
It looks like you're trying to download videos 3 to 50 all in parallel, so that might be what's causing you to run out of memory. You could try doing them in series and see if that fixes the problem. With async.waterfall your code might look something like this:
var tasks = [];
for (; index <= end; index++) {
  (function(fileName) {          // capture the file name so each task keeps its own value
    tasks.push(function(callback) {
      console.log('Started: ' + fileName);
      console.time('Done (' + fileName + ')');
      request(url + fileName, function() {
        console.timeEnd('Done (' + fileName + ')');
        callback(null);
      }).pipe(fs.createWriteStream(__dirname + '/videos/' + fileName));
    });
  })(index + '.mp4');
}

async.waterfall(tasks, function(err) {
  if (err) {
    return console.error(err);
  }
  console.log('Script finished.');
});
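For what it's worth, the two suggestions can be combined: stream each response straight to disk (no callback on request, so nothing is buffered) while keeping the downloads sequential. A minimal sketch that keeps the question's async.whilst structure:
async.whilst(
  function () { return index <= end; },
  function (callback) {
    var fileName = index + '.mp4';
    var writer = fs.createWriteStream(__dirname + '/videos/' + fileName);
    console.log('Started: ' + fileName);
    console.time('Done (' + fileName + ')');
    writer.on('finish', function() {      // fires once the file is fully flushed to disk
      console.timeEnd('Done (' + fileName + ')');
      index++;
      callback(null);
    });
    request(url + fileName)
      .on('error', callback)              // abort the loop on a network error
      .pipe(writer);
  },
  function (err) {
    if (err) return console.error(err);
    console.log('Script finished.');
  }
);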

"Unhandled stream error in pipe" - Too many file downloads in Node.js / request?

I am trying to download many (around 2,000) images from a JSON feed using Node (specifically the request module). When I try to do this (looping through the JSON) I get
throw er; // Unhandled stream error in pipe.
I checked ulimit -n and it was set at 256, so I increased that to 4,000 and I still get the same error (although only after downloading a much higher number of images).
I have two questions:
Why am I still getting an error if I raised the open-file limit well in excess of the number of simultaneous downloads I actually have?
What is the best way to "queue" or pause the downloads so as not to overwhelm my system? Here is my code.
var fs = require('fs')
  , http = require('http')
  , request = require('request')
  , url = 'http://www.urlOfJsonFeed';

function get(){
  http.get(url, function(res) {
    var body = '';
    res.on('data', function(chunk) {
      body += chunk;
    });
    res.on('end', function() {
      jParse(body);
    });
  }).on('error', function(e) {
    console.log("Got error: ", e);
  });
}

function jParse(info){
  data = JSON.parse(info);
  entries = data.entries;
  numToDownload = entries.length;
  for(var i = numToDownload - 1; i >= 0; i -= 1){
    link = entries[i]['imgUrl'];
    download(link, 'images/' + mov + '.mp4', function(){
      console.log('Downloaded ' + dMov + ' movies');
      dMov++;
    });
  }
}

var download = function(uri, filename, callback){
  request.head(uri, function(err, res, body){
    if (err) {
      console.log('header error');
    }
    if (!err && res.statusCode == 200) {
      // Download the image
      request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
    }
  });
};

get()
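One way to queue the downloads, in the spirit of the async.eachLimit approach shown in the first answer on this page — a minimal sketch, where the concurrency of 5 and the way the file name is derived from imgUrl are assumptions:
var async = require('async');

function jParse(info){
  var entries = JSON.parse(info).entries;
  // process at most 5 downloads at a time instead of starting all ~2,000 at once
  async.eachLimit(entries, 5, function(entry, next){
    var fileName = 'images/' + entry['imgUrl'].split('/').pop(); // assumed naming scheme
    download(entry['imgUrl'], fileName, function(){
      next();
    });
  }, function(err){
    if (err) console.log(err);
    console.log('All downloads finished');
  });
}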

read csv with headers then upload each row to couchdb using node/grunt

I would like to read a CSV file and upload each row to CouchDB using a grunt task. At this point I am not yet doing any database validation, such as checking if the record already exists, but I will have to do that at some point too.
Currently this is what I am doing, and the problem is that only the first 65 rows of the first sub-task (named people) are being uploaded to CouchDB.
I know this has something to do with asynchronous execution, but I just can't work out how to do it.
Gruntils.js
csv2couch: {
  people: {
    db: 'http://localhost:5984/db',
    collectionName: 'person',
    src: ['./data/schema3/people.csv']
  },
  organisms: {
    db: '<%= qmconfig.COUCHDBURL %>',
    collectionName: 'organism',
    src: ['./data/schema3/organisms.csv']
  }
}
csv2couch.js
'use strict';

var nanolib = require('nano'),
    csv = require('csv'),
    urls = require('url'),
    fs = require('fs');

module.exports = function(grunt) {

  grunt.registerMultiTask('csv2couch', 'Parse csv file and upload data to couchdb.', function() {

    var done, parts, dbname, _this, collectionName;
    _this = this;
    done = this.async();
    parts = urls.parse(this.data.db);
    dbname = parts.pathname.replace(/^\//, '');
    collectionName = this.data.collectionName;

    // Merge task-specific and/or target-specific options with these defaults.
    var options = this.options({});

    // couchdb connection
    try {
      var nano = nanolib(parts.protocol + '//' + parts.host);
    } catch (e) {
      grunt.warn(e);
      done(e, null);
    }

    // database connection
    var db = nano.use(dbname);

    // process each source csv file
    this.filesSrc.forEach(function(f) {
      console.log('source file:', f);
      csv()
        .from.path(f, {
          columns: true,
          delimiter: ',',
          quote: '"'
        })
        .on('record', function(row, index){
          console.log('#' + index, row);
          save(row, collectionName);
        })
        .on('end', function(count){
          console.log('Number of lines: ' + count);
          done();
        })
        .on('error', function(error){
          console.log(error.message);
          done(error);
        });
    });

    function save (data, collectionName) {
      // document ID is concatenation of collectionName and ID
      var docID = collectionName[0] + '_' + data.ID;
      // add some additional data
      data.type = collectionName;
      // insert data into couchdb
      db.insert(data, docID, function(err, body, header) {
        if (err) {
          console.log('[db.insert] ', err.message);
          return;
        }
      });
    }
  });
};
You're right, the async code is incorrect. The CSV file is being read to the end before all your records are saved. You need to call done only when your last record has been saved.
Your save method needs to take a callback:
var rowsRead = 0,    // the number of rows read from the csv file
    rowsWritten = 0; // the number of rows written to CouchDB
caller:
.on('record', function(row, index){
  rowsRead++;
  save(row, collectionName, function(err){
    if (err){
      return done(err);
    }
    rowsWritten++;
    if (rowsRead === rowsWritten){ // check if we've written all records to CouchDB
      done();
    }
  });
})
save method:
function save (data, collectionName, callback) {
  // document ID is concatenation of collectionName and ID
  var docID = collectionName[0] + '_' + data.ID;
  // add some additional data
  data.type = collectionName;
  // insert data into couchdb
  db.insert(data, docID, callback);
}
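A slightly more defensive variant of the same counting idea, sketched only: because a row can finish writing while the parser is still running, it can help to also record whether the 'end' event has fired before declaring the task done. The readingFinished flag and maybeDone helper are made-up names:
var rowsRead = 0,
    rowsWritten = 0,
    readingFinished = false; // set once the csv 'end' event has fired

function maybeDone() {
  // finish only when the whole file has been parsed AND every row has been written
  if (readingFinished && rowsRead === rowsWritten) {
    done();
  }
}

csv()
  .from.path(f, { columns: true, delimiter: ',', quote: '"' })
  .on('record', function(row, index){
    rowsRead++;
    save(row, collectionName, function(err){
      if (err) { return done(err); }
      rowsWritten++;
      maybeDone();
    });
  })
  .on('end', function(){
    readingFinished = true;
    maybeDone();
  })
  .on('error', function(error){
    done(error);
  });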
