Optimal design pattern - functions which process multiple files - node.js

The goal is to create distinct functions that separate out the work of loading multiple (XML) files and parsing them. I could do this all in one function, but the nested callbacks start to get ugly. In other words, I don't want to do this:
// Explore directory
fs.readdir(path, function (err, files) {
  if (err) throw err;
  // touch each file
  files.forEach(function (file) {
    fs.readFile(path + file, function (err, data) {
      if (err) throw err;
      someAsyncFunction(function (someAsyncFunctionResult) {
        // Do some work, then call another async function...
        nestedAsyncFunction(function (nestedAsyncFunctionResult) {
          // Do Final Work here, X levels deep. Ouch!
        });
      });
    });
  });
});
Instead, I want one function that reads my files and puts each file's XML payload into an array of objects, which is returned to the caller (each object holds the name of the file and the XML it contains). Here's a function that might load the reports into an array:
function loadReports(callback) {
  var path = "./downloaded-reports/";
  var reports = [];
  // There are TWO files in this path....
  fs.readdir(path, function (err, files) {
    if (err) throw err;
    files.forEach(function (file) {
      fs.readFile(path + file, function (err, data) {
        if (err) throw err;
        reports.push({ report: file, XML: data.toString() });
        // gets called twice, which makes for strangeness in the calling function
        callback(null, reports);
      });
    });
    // callback won't work here; returns NULL reports b/c they haven't been processed yet
    //callback(null, reports);
  });
}
...and here's the function that will call the one above:
function parseReports() {
  loadReports(function (err, data) {
    console.log("loadReports callback");
    var reportXML = new jsxml.XML(data[0].XML);
    var datasources = reportXML.child('datasources').child('datasource').child('connection').attribute("dbname").toString();
    console.log(JSON.stringify(datasources, null, 2));
    // More async work to be done below
  });
}
As you can see in the loadReports() comments, I can't get the callback to work right. It either calls back BEFORE the array has been populated at all, or it calls back twice, once for each fs.readFile operation.
SO... what is the best way to deal with this sort of situation? In brief: what's the best design pattern for a function that processes multiple things asynchronously, so that it ONLY calls back when all "things" have been completely processed? The simpler the better. Do I need to use some sort of queuing module like Q or node-queue?
Thanks much!
Edit: Something like this works inside the deepest loop in terms of not hitting the callback twice, but it seems like a kludge:
fs.readdir(path, function (err, files) {
  if (err) throw err;
  files.forEach(function (file) {
    fs.readFile(path + file, function (err, data) {
      if (err) throw err;
      reports.push({ report: file, XML: data.toString() });
      // WORKS, but seems hacky.
      if (reports.length === files.length) callback(null, reports);
    });
  });
});
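For comparison, here is a minimal sketch of the same loader built on the async module's map (assuming the async package is available). map runs the iterator for every file and fires the final callback exactly once, after every iterator has reported back, with results in input order:

var fs = require('fs');
var async = require('async');

function loadReports(callback) {
  var path = './downloaded-reports/';
  fs.readdir(path, function (err, files) {
    if (err) return callback(err);
    // Map each file name to a { report, XML } object
    async.map(files, function (file, done) {
      fs.readFile(path + file, function (err, data) {
        if (err) return done(err);
        done(null, { report: file, XML: data.toString() });
      });
    }, callback); // invoked once, with (err, reports)
  });
}

The hand-rolled length check in the edit above does the same job; the library just packages the bookkeeping and error propagation.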

Related

node.js passing value to a variable in async mode

My problem is the following:
I have a nice folder crawler function which grabs the paths of the files. I would like to use these files for testing purposes.
1.) Grabbing the files
2.) Do some testing
3.) Job Done
This is the code where I call it:
walk(csvRoot, function (err, results) {
  if (err) throw err;
  for (var i = 0; i < results.length; i++) {
    return results[i]; // - not working
  }
});
My main issue is that I would really like to assign the results to a variable which will contain these paths as an array, but so far no luck.
The variable comes back as undefined; that's what I'm trying to resolve currently.
Could you please advise how to do so?
Why do you use return within the for loop? What do you expect to return there? In any case, if you are expecting to have the results available outside of the scope of the walk function, it will not work. I presume that you need something like this:
function getFiles(csvRoot, callback) {
  walk(csvRoot, function (err, results) {
    if (err) {
      return callback(err);
    }
    return callback(null, results);
  });
}

getFiles(csvRoot, function (err, files) {
  // #todo: check for error
  console.log(files);
});

Async node.js data flow confusion

Thanks for your help... I'm struggling big time with how to handle this properly. I've moved to the async module now, having given up on my ability to write the callbacks properly. In the snippet below I'm passing a set of random numbers (eachrecord) through to a Mongoose call, trying to build a data set from the multiple queries I run.
My issue is that no matter what I've done for 4 hours, the "newarray" variable is always empty.
Thank you for your help -
async.forEach(arLimit, function (eachrecord, callback) {
  newarray = new Array;
  var query = UGC_DB_Model.find({}).skip(eachrecord).limit(-1);
  query.execFind(function (err, data) {
    if (err)
      console.log(err);
    else {
      newarray.push(data);
    }
  });
  callback(null, newarray);
}, function (err, result) {
  if (err) return next(err);
  console.log("(it's empty): " + result);
});
There are several issues with your code:
- async.forEach isn't meant to 'generate' results; that's what async.map is for;
- you need to call the callback only when execFind is done, not immediately after calling it;
- your newarray is probably not necessary.
So try this instead:
async.map(arLimit, function (eachrecord, callback) {
  var query = UGC_DB_Model.find({}).skip(eachrecord).limit(-1);
  query.execFind(function (err, data) {
    if (err) {
      callback(err); // pass the error along
    } else {
      callback(null, [ data ]);
      // although I think you mean this (because 'data' is probably an array already):
      // callback(null, data);
    }
  });
}, function (err, result) {
  if (err) return next(err);
  console.log("(it's empty): " + result);
});

How to work with a large number of files and streams in Node.js?

I am working on a small Node project which requires copying and ungzipping a number of files of various sizes. I've been trying to use async.eachSeries to take care of it, but it is not working out. The files are created, but the output piped into wr ends up writing to multiple different files, regardless of which file it should end up in.
fs.readdir(path, function (err, files) {
  async.eachSeries(files, function (file, callback) {
    var wr = fs.createWriteStream(file);
    fs.stat(file, function (err, stats) {
      if (err) throw err;
      var stream = fs.createReadStream(file).on('end', function () {
        callback();
      }).pipe(ungzip).pipe(wr);
    });
  }, function () {
    //res.write(concatenated);
    //res.end();
  });
});
I'm still new to node so any help would be appreciated.
-NQ
Looks like the solution is to use a closure.
The problem in your code is that the callback function passed to fs.stat references a variable from the outer scope, i.e. wr, which is changed in the next iteration of the loop. Closures are a good way to sort that out.
fs.readdir(path, function (err, files) {
  async.eachSeries(files, function (file, callback) {
    var wr = fs.createWriteStream(file);
    fs.stat(file, function (myWr) {
      return function (err, stats) {
        if (err) throw err;
        var stream = fs.createReadStream(file).on('end', function () {
          callback();
        }).pipe(ungzip).pipe(myWr);
      };
    }(wr));
  }, function () {
    //res.write(concatenated);
    //res.end();
  });
});
Refer to Please explain the use of JavaScript closures in loops and Serving A Batch Of Dynamic Pages for more on closures.
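As an aside, if ungzip in the snippets above is a single shared stream, it would also mix the files' contents together; each iteration needs its own decompressor instance. A minimal sketch, assuming the decompressor comes from Node's built-in zlib module, with an output name chosen purely for illustration:

var fs = require('fs');
var zlib = require('zlib');
var async = require('async');

fs.readdir(path, function (err, files) {
  async.eachSeries(files, function (file, callback) {
    // Fresh streams per file, so nothing is shared between iterations
    var gunzip = zlib.createGunzip();
    var wr = fs.createWriteStream(file + '.out'); // hypothetical output name
    fs.createReadStream(file)
      .pipe(gunzip)
      .pipe(wr)
      .on('finish', callback); // signal completion when the WRITE finishes
  }, function () {
    // all files processed
  });
});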

using node.js async series, but doesn't work

I want to use async to make Node.js do some work in order: first query two rates from MongoDB, then use those rates to compute two new ones:
async.series([
  function (callback) {
    db.collection('heros', function (err, collection) {
      if (err) {
        console.log(err);
      }
      if (!err) {
        console.log("2 collection fetched!");
        collection.findOne({'id': win}, function (err, result) {
          if (err) throw err;
          rate1 = result.rate;
          console.log("win-rate:" + rate1);
        });
        collection.findOne({'id': lose}, function (err, result) {
          if (err) throw err;
          rate2 = result.rate;
          console.log("lose-rate:" + rate2);
        });
      }
    });
    callback(null);
  },
  function (callback) {
    var Ea = 1 / (1 + Math.pow(10, (rate2 - rate1) / 400));
    var Eb = 1 / (1 + Math.pow(10, (rate1 - rate2) / 400));
    var ra = rate1 + 16 * (1 - Ea);
    var rb = rate2 + 16 * (0 - Eb);
    console.log("ra:" + ra);
    console.log("rb:" + rb);
    callback(null);
  },
  function (callback) {
    db.collection('heros', function (err, collection) {
      if (!err) {
        collection.update({'id': win}, {$set: {rate: ra}}, function (err, result) {
          if (err) throw err;
          if (!err) {
            console.log("update successful");
          }
        });
        collection.update({'id': lose}, {$set: {rate: rb}}, function (err, result) {
          if (err) throw err;
          if (!err) {
            console.log("update successful");
          }
        });
      }
    });
    callback(null);
  }
]);
But when I run it, it shows this error message:
ReferenceError: ra is not defined
It seems that Node.js jumps to the computing or updating step without waiting for the queries to complete.
You're declaring variables like ra inside a function block, so they're scoped to that function and unavailable elsewhere. You'd need to put them somewhere more accessible. For example, you could make them global:
var ra;
async.series([ ... ]);
Further, when you use series, you should call the callback function only when all of the work for that step has completed.
For example, one task should look like this:
db.collection('heros', function (err, collection) {
  if (err) {
    console.log(err);  // log the error
    callback(err);     // call the async callback with the error
    return;            // stop
  }
  console.log("2 collection fetched!");
  collection.findOne({'id': win}, function (err, result) {
    if (err) { callback(err); return; }  // again, handle the error
    rate1 = result.rate;                 // grab the result
    console.log("win-rate:" + rate1);    // log it
    callback(null, rate1);               // now signal completion by calling back
  });
});
As you've got multiple async function calls within a single task, you may want to split them into multiple tasks, or consider using the parallel function as well. You could even use parallel within one of the tasks in the series call to handle the two findOne calls that need to be made, as in the sketch below.
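A minimal sketch of that idea, running both findOne calls with async.parallel inside the first series task (variable names follow the question, and rate1/rate2 are assumed to be declared in an outer scope as suggested above):

function (callback) {
  db.collection('heros', function (err, collection) {
    if (err) return callback(err);
    async.parallel({
      win:  function (done) { collection.findOne({'id': win},  done); },
      lose: function (done) { collection.findOne({'id': lose}, done); }
    }, function (err, results) {
      if (err) return callback(err);
      rate1 = results.win.rate;   // both queries have completed here
      rate2 = results.lose.rate;
      callback(null);             // safe to move on to the next task
    });
  });
}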

Process.nextTick or child_process?

I am trying to allow users to export their contact list in CSV format. I am confused about how to run the export_connect_csv() function. Should I put it in a child process or in process.nextTick?
function export_connect_csv(user_id, file_location) {
  mysqlPool.getConnection(function (err, connection) {
    var csv_row = "Email,First Name,Last Name,Status,Created\n";

    function processRow(row) {
      var csv_row = row.email + ',' + row.first_name + ',' + row.last_name + ',' + row.status + ',' + row.created + "\n";
      fs.appendFile(file_location, csv_row, function (err) {
        if (err) {
          throw err;
        }
      });
    }

    fs.appendFile(file_location, csv_row, function (err) {
      if (err) {
        throw err;
      }
      var query = connection.query('SELECT * FROM contacts where user_id = "' + user_id + '"');
      query
        .on('error', function (err) {
          //handle error
        })
        .on('fields', function (fields) {
        })
        .on('result', function (row) {
          processRow(row);
        })
        .on('end', function () {
          //email now
          console.log('done');
        });
    });
  });
}

var exportContacts = function (req, res) {
  var user_id = req.params.user_id || 0;
  export_connect_csv(user_id);
  res.json({});
};
You don't need to use either; you can just call the function. All of that code will run asynchronously, both getConnection and fs.appendFile. However, you will run into a conflict if two users try to export at the same time. You have the following options:
1) You pass a unique file_location every time you call that function.
2) You keep things exactly as they are and use fs.appendFileSync to make sure the writes don't overlap each other, but that would block.
3) Probably the best solution: do what you intended to do with process.nextTick, but use setImmediate and appendFileSync instead, so that writes from several users are serialized (write only a row at a time to avoid blocking for long periods):
setImmediate(function () {
  fs.appendFileSync('filename', JUST_A_SINGLE_ROW);
});
This is because a recursive process.nextTick can starve the event loop and effectively block you (hence the use of setImmediate), and you need fs.appendFileSync because two users might write to the same file simultaneously.
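For completeness, a minimal sketch of option 1, using a hypothetical uniqueExportPath helper so each request writes to its own file:

var path = require('path');

// Hypothetical helper: the user id plus a timestamp keeps concurrent exports apart
function uniqueExportPath(user_id) {
  return path.join('/tmp', 'contacts-' + user_id + '-' + Date.now() + '.csv');
}

var exportContacts = function (req, res) {
  var user_id = req.params.user_id || 0;
  export_connect_csv(user_id, uniqueExportPath(user_id));
  res.json({});
};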
More on setImmediate vs nextTick:
setImmediate vs. nextTick
More info on appendFile: http://nodejs.org/api/fs.html#fs_fs_appendfile_filename_data_options_callback
