Request makes process out of memory, when downloading big media files - node.js

I wrote a simple script to download video files from a CDN, where the direct URLs are simple to generate, for example http://something.com/N.mp4, where N is a number.
The problem is, when downloading files with larger than ~300MB, the files appears perfectly in hard drive, but before the request(...)'s callback, a memory allocation fail happens:
FATAL ERROR: CALL_AND_RETRY_0 Allocation failed - process out of memory
Does this happens because of some serious bad practice? Can request download media files, with this size?
Environment: Win7, 4GB+ free RAM, Node v0.10.31
var request = require('request');
var async = require('async');
var fs = require('fs');
var start = +process.argv[2] || 1;
var end = +process.argv[3] || 50;
var url = 'http://something.com/';
try {
fs.mkdirSync(__dirname + '/videos/');
} catch (e) {}
var index = start;
async.whilst(
function () { return index <= end; },
function (callback) {
var fileName = index + '.mp4';
console.log('Started: ' + fileName);
console.time('Done (' + fileName + ')');
request(url + fileName, function() {
console.timeEnd('Done (' + fileName + ')');
index++;
callback(null);
}).pipe(fs.createWriteStream(__dirname + '/videos/' + fileName));
},
function (err) {
if (err) {
return console.error(err);
}
console.log('Script finished.');
}
);
Example console output:
> node index.js 3
Started: 3.mp4
Done (3.mp4): 296592ms
Started: 4.mp4
Done (4.mp4): 369718ms
Started: 5.mp4
FATAL ERROR: CALL_AND_RETRY_0 Allocation failed - process out of memory

If you use request module with a callback it buffers the whole response body in memory. Try omitting callback and using finish event of fs stream instead.
var writer = fs.createWriteStream(__dirname + '/videos/' + fileName);
writer.on('finish', function() {
// ...
index++;
callback(null);
});
request(url + fileName).pipe(writer);

It looks like you're trying to download videos 3 to 50 all in parallel, so that might be what's causing you to run out of memory. You could try doing them in series and see if that fixes the problem. With async.waterfall your code might look something like this:
var tasks = [];
for (; index < end; index++) {
tasks.push(function(callback) {
var fileName = index + '.mp4';
console.log('Started: ' + fileName);
console.time('Done (' + fileName + ')');
request(url + fileName, function() {
console.timeEnd('Done (' + fileName + ')');
callback(null);
}).pipe(fs.createWriteStream(__dirname + '/videos/' + fileName));
});
}
async.waterfall(tasks, function(err) {
if (err) {
return console.error(err);
}
console.log('Script finished.');
});

Related

How to abort a request?

I'm writing an upload script. If there's an error while writing to disk I want to abort the request and return a 500 status code.
My code (below) sends a 500 as expected, but the upload doesn't stop. I keep getting "data" events until the upload is complete (even though my write pipe is already broken) and then the req.on('end' event fires which tries to send a 204 even though I've already sent a 500.
How can I abort the request so that I stop getting event notifications?
var filename = req.headers['wx-filename'];
var docType = req.headers['wx-doc-type'];
var clientId = req.headers['wx-client-id'];
var dir = Path.join(config.storage_path, docType, clientId);
var filePath = Path.join(dir, filename);
mkdirp.sync(dir);
var destFile = FileSystem.createWriteStream(filePath, {
flags: 'wx' // don't overwrite files
});
if (config.env === 'dev') {
(function () {
var fileSize = req.headers['content-length'];
var uploadedBytes = 0;
req.on('data', function (data) {
uploadedBytes += data.length;
var p = uploadedBytes / fileSize * 100;
var fn = Path.relative(config.storage_path, filePath);
util.log(fn + ': ' + p.toFixed(1) + '%');
});
})();
}
req.on('end', function () {
util.log("END");
resp.writeHead(204);
resp.end();
});
destFile.on('error', function (err) {
util.log("ERROR", err);
resp.writeHead(500, err.code);
resp.end();
// HOW TO STOP REQUEST?
});
req.pipe(destFile);
You need to remove the listeners for data and end, after that send a Connection: close header and at the end send the 500 error.

Nodejs download multiple files

I need to download ~26k images. The images list and urls are stored in csv file. Im reading the csv file and trying to download the images while looping through the list.
If im using small set ~1-2k it works fine but when i switch to the full set im getting EMFILE error.
Error: EMFILE, open 'S:\images_download\Images\189900008.jpg'
I've noticed that node tries to create all the files at once and this might be the issue but i'm unable to force it to create it one by one. My understanding is the code below should work like this but obviously is not.
(Just to mention that this code is executed on Windows)
Code:
var csv = require("fast-csv");
var fs = require('fs');
var request = require('request');
var async = require('async');
fs.writeFile('errors.txt', '', function(){})
var downloaded = 0;
var totalImages = 0;
var files = [];
csv
.fromPath("Device_Images_List.csv")
.on("data", function(data){
files.push({device: data[0], url: data[1]})
})
.on("end", function(){
totalImages = files.length;
async.each(files, function(file, callback) {
var deviceId = file.device;
var deviceUrl = file.url;
if ( deviceId != 'DEVICE_TYPE_KEY' ) {
try {
writeStream = fs.createWriteStream('./Images/' + deviceId + '.jpg');
proxiedRequest = request.defaults({proxy: "http://proxy:8080"});
proxiedRequest(deviceUrl).pipe(writeStream);
writeStream.on('open', function(fd) {
var rem = proxiedRequest.get(deviceUrl);
rem.on('data', function(chunk) {
writeStream.write(chunk);
});
rem.on('end', function() {
downloaded++;
console.log('Downloaded: ' + deviceId + '; ' + (downloaded + 1) + ' of ' + totalImages);
writeStream.end();
});
});
writeStream.on('close', function(){
callback();
});
} catch (ex) {
fs.appendFile('errors.txt', deviceId + ' failed to download', function (err) {
callback();
});
}
}
}, function(err){
if( err ) {
console.log(err);
} else {
}
});
});
As #slebetman commented the issue can be solved by using async.eachSeries to process the files one by one or async.eachLimit to limit the parallel nodes:
async.eachLimit(files, 5, function(file, callback) {
// ... Process 5 files at the same time
}, function(err){
});

Exit Node Process After Successful fs.appendFile

I'm having trouble create processes in parallel with Node while exiting when they're done with a simple HTTP GET request. I've noticed that if I fire a process.exit() inside of a callback for appendFile, some files will not be created or appended in a Node cluster setup. Ideally, the way below is how I would like to fire events since the process is exited as soon as the job is done:
var rp = require("request-promise");
config = require("./config"),
cluster = require("cluster"),
os = require("os"),
fs = require("fs");
var keywordArray = [
'keyword1',
'keyword2',
...
];
if (cluster.isMaster) {
var numCPUs = os.cpus().length;
var clusterDivision = Math.ceil(keywordArray.length/numCPUs);
// Reset the json if previously set
keywordArray.forEach(function(arrayItem) {
fs.unlink(config.dataDirectory + arrayItem + '.json', function(err) {
if (err) console.error(err);
console.log('successfully unlinked ' + arrayItem + '.json from ' + config.dataDirectory);
});
});
// Create a worker for each CPU
// Seperate the array out evenly for each worker
for (var j=1;j<=numCPUs;j++) {
var tempArray = [];
var removed = keywordArray.splice(0, clusterDivision);
if (removed.length > 0) {
// The array contains something so let's do something with the keyword
console.log('creating a worker');
cluster.fork().send(removed);
} else {
// We don't need a cluster here
}
}
process.on('exit', function() {
console.log('exited');
});
} else if (cluster.isWorker) {
// Code to run if we're in a worker process
// Send the object we created above from variables so they're available to the workers
process.on('message', function(seperatedArrayItem) {
seperatedArrayItem.forEach(function(arrayItem) {
function radarRequest(err, response, body) {
var responseBody = JSON.parse(body);
console.log(arrayItem);
fs.appendFileSync(config.dataDirectory + arrayItem + '.json', JSON.stringify(responseBody.results, null, '\t'), function (err) {
if (err) console.err(err);
console.log('success writing file');
});
}
rp({
url: config.radarSearchURI +
'?key='+ config.apiKey +
'&location=' + config.latitude + ',' + config.longitude +
'&radius=' + config.searchRadius +
'&keyword=' + arrayItem, headers: config.headers
}, radarRequest);
});
setTimeout(function() {
process.exit(0);
}, 5000);
});
}
The only way I can make sure all files are properly appended is by using a Timeout, which is exactly what I don't want to - and shouldn't - do. Is there another way I can ensure an appendFile has happened successfully and then kill the node process? Here's a way that works (assuming the process doesn't take longer than 5 seconds):
process.on('message', function(seperatedArrayItem) {
seperatedArrayItem.forEach(function(arrayItem) {
function radarRequest(err, response, body) {
var responseBody = JSON.parse(body);
console.log(arrayItem);
fs.appendFile(config.dataDirectory + arrayItem + '.json', JSON.stringify(responseBody.results, null, '\t'), function (err) {
if (err) console.err(err)
console.log('success writing file');
});
}
rp({
url: config.radarSearchURI +
'?key='+ config.apiKey +
'&location=' + config.latitude + ',' + config.longitude +
'&radius=' + config.searchRadius +
'&keyword=' + arrayItem, headers: config.headers
}, radarRequest);
});
setTimeout(function() {
process.exit(0);
}, 5000);
});
You can use an async flow control module like async to kill the process after all files are written. I'd also recomment cluster.worker.disconnect() so that the node process will simple exit gracefully, but that isn't a requirement.
async.forEach(seperatedArrayItem, function(item, done){
// append file and call 'done' when it is written.
}, function(){
// Will be called when all item 'done' functions have been called.
cluster.worker.disconnect();
});
Node fs.appendFile( ... ) is an asynchronous function. So it expects us to pass a callback for we know it has finished its main operation, to inform us of some error occurred, or another purpose.
This means we need to call Node process.exit( ... ) in the scope of the provided callback. I've written this code to test:
'use strict';
var fs = require('fs');
function jsonValue(obj) {
return JSON.stringify(obj, null, '\t');
}
fs.appendFile('file.json', jsonValue(['t', 'e', 's', 't']), function(error) {
if (error) {
throw error;
}
console.log('success writing file'); // no error, so log...
process.exit(); // and exit right now
console.log('exited?'); // this will not be printed
});
Well, it worked as defined.
Other way it works is to use the synchronous version of fs.appendFile( ... ) and call process.exit() in a sequential way:
fs.appendFileSync('file.json', jsonValue(['t', 'e', 's', 't']));
console.log('success writing file'); // no error (I hope so =), so log...
process.exit(); // and exit right now
console.log('exited?'); // this will not be printed
This is clean code and works, but you lose the robustness and convenience gained with the callback...

"Unhandled stream error in pipe" - Too many file downloads in Node.js / request?

I am trying to download many (around 2,000) images from a JSON feed using Node, (and specifically the request module). When I try to do this (looping through the JSON) I get
throw er; // Unhandled stream error in pipe.
I checked ulimit -n and it was set at 256, so I increased that to 4,000 and I still get the same error (although after I am downloading a much higher number of images).
I have two questions,
Why am I still getting an error if I raised the maximum download number well in excess of the number of simultaneous downloads I actually have
What is the best way to "queue" or pause the downloads so as not to overwhelm my system? Here is my code.
var fs = require('fs')
, http = require('http')
, request = require('request')
, url = 'http://www.urlOfJsonFeed'
function get(){
http.get(url, function(res) {
var body = '';
res.on('data', function(chunk) {
body += chunk;
});
res.on('end', function() {
jParse(body);
});
}).on('error', function(e) {
console.log("Got error: ", e);
});
}
function jParse(info){
data = JSON.parse(info)
entries = data.entries
numToDownload = entries.length;
for(var i = numToDownload - 1; i >= 0; i -= 1){
link = entries[i]['imgUrl']
download(link, 'images/' + mov + '.mp4', function(){
console.log('Downloaded ' + dMov + ' movies')
dMov++
}
}
}
var download = function(uri, filename, callback){
request.head(uri, function(err, res, body){
if (err) {
console.log('header error');
}
if (!err && res.statusCode == 200) {
//Download the image
request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
}
});
};
get()

Nodejs CSV data export system for users

I need to allow users to export their data in csv format. I have written app in nodejs. The export data for users can be huge. So i was wondering How to handle such situation in nodejs. Should i user process.nexttick or child process api of nodejs? Also are there any good module available for nodejs to convert data from mysql to csv.
read line by line from your mysql-db, and append line by line to your file
i dont know that much about the mysqlmodule, so i'm assuming here each line is just an array, therefore the 'row.join(';')'. if thats not the case (maybe its an object), you should fix that.
var fs = require('fs');
var connection = require('mysql').createConnection({yourdbsettingshere});
function processRow (row) {
fs.appendFile('your-file.csv', row.join(';'), function (err) {
connection.resume();
});
}
var query = connection.query('SELECT * FROM WHATEVER');
query
.on('error', function(err) {
// do something when an error happens
})
.on('fields', function(fields) {
processRow(fields);
})
.on('result', function(row) {
// Pausing the connnection is useful if your processing involves I/O
connection.pause();
processRow(row, function (err) {
connection.resume();
});
})
.on('end', function() {
// now you can mail your user
});
if you have a lot of requests, you could use the compute-cluster module for distributing your workload
The accepted answer is not working because CSV files are separated by , not ;. Also there is no newline character \n after the end of each row and the fields object contains information about the column attributes not the data rows. results contains the rows resulted from query. Hence I wrote my own code for generating CSV files. If you need more explanation then please comment, I will provide.
pool.query('SELECT * FROM category', function (error, results, fields) {
var reportFile = Date.now();
fs.closeSync(fs.openSync(__dirname + '/../reports/' + reportFile + '.csv', 'w'));
var attributes = [];
var row = [];
for(var x = 0; x<fields.length; x++) attributes.push(fields[x].name);
fs.appendFile(__dirname + '/../reports/' + reportFile + '.csv', attributes.join(','), function (err) {
if(err) console.log('Error appending fields', err);
fs.appendFileSync(__dirname + '/../reports/' + reportFile + '.csv', '\n');
for(var x = 0; x<results.length; x++) {
row = [];
for(var y = 0; y<attributes.length; y++){
row.push(results[x][attributes[y]]);
}
fs.appendFileSync(__dirname + '/../reports/' + reportFile + '.csv', row.join(','));
fs.appendFileSync(__dirname + '/../reports/' + reportFile + '.csv', '\n');
}
req.reportFile = reportFile;
next();
});
});

Resources