Closing connection when event stack is empty - node.js

I'm somewhat new to Node, and so far I love it, but maybe I'm running into a case of it being a hammer when I need a wrench.
I've got a bunch of data in text files, and I need to load these files into a database - very simple in a normal imperative language. I want to write idiomatic Node for this, so I would rather use async fs calls (fs.readdir and fs.readFile rather than fs.readdirSync and fs.readFileSync). However, how do I know when all of those operations are done, so that then (and only then) it's safe to close the DB connection?
In short (in pseudo-code):
MongoClient.connect(url, function(err, db) {
  if (err) throw err;
  fs.readdir(path, function(err, files) {
    for file in files {
      if (interesting(file)) {
        fs.readFile(file, function(err, data) {
          doc = turnDataIntoDocument(data);
          db.collection('foo').insert(doc);
        });
      }
    }
  });
  // This is the part that won't work right:
  db.close();
});
Obviously, the db.close() could happen at any time, probably before all the files are processed, and usually before the directory has even been fully read.
I know there are libraries for dealing with control flow, but I feel like I should understand how to do this at a more fundamental level rather than depending on a library for something so simple - don't close the connection until I'm done with it.

A simple counter should work in this case:
function doInserts(cb) {
  MongoClient.connect(url, function(err, db) {
    if (err) throw err;
    fs.readdir(path, function(err, files) {
      if (err) throw err;
      // One countdown slot per directory entry.
      var count = files.length;
      if (count === 0) {
        db.close();
        return cb();
      }
      for (var i = 0; i < files.length; ++i) {
        var file = files[i];
        if (interesting(file)) {
          fs.readFile(file, function(err, data) {
            if (err) throw err;
            var doc = turnDataIntoDocument(data);
            db.collection('foo').insert(doc);
            // The last pending file just finished: now it's safe to clean up.
            if (--count === 0) {
              db.close();
              cb();
            }
          });
        } else if (--count === 0) {
          // Skipped files still count toward completion.
          db.close();
          cb();
        }
      }
    });
  });
}
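For completeness: the same guarantee falls out naturally from promises, since Promise.all resolves only once every wrapped operation has finished. A minimal sketch, assuming Node 10+'s fs.promises API and a 3.x+ MongoDB driver (where connect resolves to a client); url, path, interesting, and turnDataIntoDocument are carried over from the question:
const fsp = require('fs').promises;
const { MongoClient } = require('mongodb');

async function doInserts() {
  const client = await MongoClient.connect(url);
  try {
    const files = await fsp.readdir(path);
    // Start every read+insert, then wait for all of them to settle.
    await Promise.all(files.filter(interesting).map(async (file) => {
      const data = await fsp.readFile(path + '/' + file);
      await client.db().collection('foo').insertOne(turnDataIntoDocument(data));
    }));
  } finally {
    await client.close(); // Only reached after every insert has finished.
  }
}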

Related

MongoDB find function running twice in node.js

I'm following this link for finding data in MongoDB using Node.js.
My code is:
var counter = 0;
var findMongo = function(db, callback) {
  var cursor = db.collection('new').find({ "_id": ObjectId("56da6fd166efee0350399c21") });
  //var cursor = db.collection('new').find();
  cursor.each(function(err, doc) {
    counter = counter + 1;
    console.log(counter);
    assert.equal(err, null);
    if (doc != null) {
      //console.dir(doc);
      //console.log(doc);
    } else {
      console.log("in else,not found");
      callback();
    }
  });
};

MongoClient.connect(url, function(err, db) {
  assert.equal(null, err);
  findMongo(db, function() {
    db.close();
  });
});
Since I'm searching the DB with _id, findMongo should only run once.
I'm getting the following result:
counter 1
counter 2
in else,not found
Why is the findMongo function called twice?
Two things to notice:
1 - findMongo is not actually running twice; the callback you pass to cursor.each is what runs twice. each invokes its callback once per matching document and then one final time with doc set to null to signal that the cursor is exhausted, which is the call that lands in your else branch. That is why the counter reaches 2 for a single matching document.
2 - You should use findOne instead of find. It is the better fit here because you are interested in exactly one record (an _id lookup can match at most one document), and it hands you the document directly instead of a cursor to iterate.
Here is how to use db.collection.findOne():
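A minimal sketch of the same lookup rewritten with findOne, reusing the collection and _id from the question:
var findMongo = function(db, callback) {
  db.collection('new').findOne({ "_id": ObjectId("56da6fd166efee0350399c21") }, function(err, doc) {
    assert.equal(err, null);
    if (doc != null) {
      console.dir(doc); // Runs exactly once: there is no end-of-cursor call.
    } else {
      console.log("not found");
    }
    callback();
  });
};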

NodeJS and parallel flow

I'm new to NodeJS. An issue that confuses me is parallel flow. I read an example that shows this snippet as a technique for controlling parallel flow:
var fs = require('fs');
var filesDir = './files';
var tasks = [];
fs.readdir(filesDir, function (err, files) {
  if (err) throw err;
  for (var index in files) {
    var task = (function (file) {
      return function () {
        fs.readFile(file, function (err, text) {
          if (err) throw err;
          doSomething();
        });
      };
    })(filesDir + '/' + files[index]);
    tasks.push(task);
  }
  for (var index in tasks) {
    tasks[index]();
  }
});
This code works like a charm, but when I replace it with
for (var index in files) {
  var task = function () {
    console.log(files[index]);
    fs.readFile(filesDir + '/' + files[index], function (err, text) {
      if (err) throw err;
      doSomething();
    });
  };
  tasks.push(task);
}
for (var index in tasks) {
  tasks[index]();
}
it doesn't work as I expected, because files[index] in the loop is always the last file in the directory. Could you please explain what the real flow is?
In short, the function you created holds a reference to the index variable (not its value), so by the time it executes, index already points at the last file in the directory.
Some links: Understanding variable capture by closures in Javascript/Node
It's because the index reference ends up pointing at the last file. Node.js is asynchronous: it won't wait until the read-file operation has completed, so the loop increments index before any callback runs.
for (var index in files) {
  var task = function () {
    console.log(files[index]);
    fs.readFile(filesDir + '/' + files[index], function (err, text) {
      if (err) throw err;
      doSomething();
    });
  };
  tasks.push(task);
}
The first snippet works because it uses a closure: it passes the currently indexed file into an immediately invoked function, which takes that file and returns a task function with the file baked in as its input.
Each returned task then executes in parallel, each with its own copy of the file name.
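A minimal sketch of the second loop with the capture fixed: files.forEach gives each iteration its own file binding, so no wrapper function is needed (a block-scoped let index would work just as well in modern JavaScript):
files.forEach(function (file) {
  var task = function () {
    console.log(file); // Each task closes over its own file, not a shared index.
    fs.readFile(filesDir + '/' + file, function (err, text) {
      if (err) throw err;
      doSomething();
    });
  };
  tasks.push(task);
});
for (var index in tasks) {
  tasks[index]();
}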

How to retain state of a variable when dealing with asynchronous functions and their callbacks?

If I read a directory and then want to cycle through those files and tell which ones are directories, printing each path's file name, how would you keep track of that file variable? Or should I just be using the sync version?
Here's example code:
fs.readdir("../directory", function(err, files) {
if(err) throw err;
for(var i = 0; i < files.length; i++) {
fs.stat(files[i], function(err, stats) {
//files[i] will be files[ files.length - 1 ] when the callback is called
console.log("%s %s", files[i], stats.isDirectory())
});
}
});
It would be cool if you could pass in your own variables to fs.stat, like after the callback.
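fs.stat won't forward extra variables to its callback, but the closure techniques from the previous question apply directly. A minimal sketch using forEach, so each stat callback sees its own file name (the directory is prefixed so stat resolves the right path):
fs.readdir("../directory", function(err, files) {
  if (err) throw err;
  files.forEach(function(file) {
    // file is a fresh binding per iteration, unlike the shared i above.
    fs.stat("../directory/" + file, function(err, stats) {
      if (err) throw err;
      console.log("%s %s", file, stats.isDirectory());
    });
  });
});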

Optimal design pattern - functions which process multiple files

The goal is to create distinct functions which separate out the work of loading multiple (XML) files and parsing them. I could do this all in one function, but the nested callbacks begin to get ugly. In other words, I don't want to do this:
// Explore directory
fs.readdir(path, function (err, files) {
  if (err) throw err;
  // touch each file
  files.forEach(function (file) {
    fs.readFile(path + file, function (err, data) {
      if (err) throw err;
      someAsyncFunction(function (someAsyncFunctionResult) {
        // Do some work, then call another async function...
        nestedAsynchFunction(function (nestedAsyncFunctionResult) {
          // Do Final Work here, X levels deep. Ouch!
        });
      });
    });
  });
});
Instead, I want one function which reads my files and puts each file's XML payload into an array of objects which is returned to the caller (each object represents the name of the file and the XML in the file). Here's the function that might load up reports into an array:
function loadReports(callback) {
  var path = "./downloaded-reports/";
  var reports = [];
  // There are TWO files in this path....
  fs.readdir(path, function (err, files) {
    if (err) throw err;
    files.forEach(function (file) {
      fs.readFile(path + file, function (err, data) {
        if (err) throw err;
        reports.push({ report: file, XML: data.toString() });
        // gets called twice, which makes for strangeness in the calling function
        callback(null, reports);
      });
    });
    // callback won't work here; returns NULL reports b/c they haven't been processed yet
    //callback(null, reports);
  });
}
...and here's the function which will call the one above:
function parseReports() {
  loadReports(function (err, data) {
    console.log("loadReports callback");
    var reportXML = new jsxml.XML(data[0].XML);
    var datasources = reportXML.child('datasources').child('datasource').child('connection').attribute("dbname").toString();
    console.log(JSON.stringify(datasources, null, 2));
    // More async about to be done below
  });
}
As you can see in the loadReports() comments, I can't get the callback to work right. It either calls back BEFORE the array has been populated at all, or it calls back twice - once for each fs.readFile operation.
SO...what is the best way to deal with this sort of situation? In brief - What's the best design pattern for a function which processes multiple things asynchronously, so that it ONLY calls back when all "things" have been completely processed? The simpler the better. Do I need to use some sort of queuing module like Q or node-queue?
Thanks much!
Edit: Something like this works inside the deepest loop in terms of not hitting the callback twice, but it seems like a kludge:
fs.readdir(path, function (err, files) {
  if (err) throw err;
  files.forEach(function (file) {
    fs.readFile(path + file, function (err, data) {
      if (err) throw err;
      reports.push({ report: file, XML: data.toString() });
      // WORKS (note ===, not =), but seems hacky.
      if (reports.length === files.length) callback(null, reports);
    });
  });
});
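For what it's worth, that length check is the counter pattern from the first answer on this page, just written less directly. A minimal sketch of loadReports with an explicit pending counter and basic error propagation (names carried over from the question):
function loadReports(callback) {
  var path = "./downloaded-reports/";
  var reports = [];
  fs.readdir(path, function (err, files) {
    if (err) return callback(err);
    var pending = files.length;
    if (pending === 0) return callback(null, reports);
    files.forEach(function (file) {
      fs.readFile(path + file, function (err, data) {
        if (err) return callback(err);
        reports.push({ report: file, XML: data.toString() });
        // Fires the callback exactly once, after the last read completes.
        if (--pending === 0) callback(null, reports);
      });
    });
  });
}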

mongodb nodejs each vs toArray

I've had a quick look around and not found anything that satisfies me with an answer, but basically I've started to use Node.js with Express and MongoDB to create a web API rather than going the usual .NET MVC Web API route.
One thing I've noticed, though, is that in order to return a collection of results I'm doing it in a rather bulky way, or at least that's how it feels.
app.get('/property', function (req, res) {
  var propArray = [];
  MongoClient.connect(settings.connection, function (err, db) {
    if (err) throw err;
    var properties = db.collection("PROPERTIES");
    var searchParams = {
      Active: true,
      Deleted: false
    };
    properties.count(searchParams, function (err, count) {
      properties.find(searchParams).toArray(function (err, result) {
        for (var i = 0; i < count; i++) {
          propArray.push(new models.propertyModel(result[i]));
        }
        db.close();
        return res.json(propArray);
      });
    });
  });
});
Now I've noted that there's a .each function rather than .toArray which I would prefer to use as I could cut out the .count function but obviously you can only return a response once. I wondered if you guys could enlighten me with some of your mongo knowledge.
properties.find(searchParams).each(function (err, result) {
  return res.json(result);
});
Something like that, cutting out 6 lines of code and an extra call to the database.
The count() can still be cut out with toArray():
properties.find(searchParams).toArray(function (err, result) {
  var i, count;
  for (i = 0, count = result.length; i < count; i++) {
    propArray.push(new models.propertyModel(result[i]));
  }
  db.close();
  return res.json(propArray);
});
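If you'd still rather use each, keep in mind (as in the find-runs-twice question above) that it calls back once per document and one final time with a null document to mark the end of the cursor, so accumulate until that final call and respond exactly once. A minimal sketch:
var propArray = [];
properties.find(searchParams).each(function (err, doc) {
  if (err) throw err;
  if (doc != null) {
    propArray.push(new models.propertyModel(doc));
  } else {
    // null means the cursor is exhausted; respond exactly once here.
    db.close();
    res.json(propArray);
  }
});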
