I need to insert some matrices into MongoDB, so I wrote the following simple code:
var MongoClient = require('mongodb').MongoClient;

var matrisMaker = function(d1, d2) {
    var result = [];
    for (var i = 0; i < d1; i++) {
        result.push([]);
        for (var k = 0; k < d2; k++) {
            result[i].push(Math.round(Math.random() * 1000000000000));
        }
    }
    return result;
};

MongoClient.connect('mongodb://127.0.0.1:27017/test', function(err, db) {
    if (err) throw err;
    var collection = db.collection('matris');
    for (var counter = 0; counter < 10000000; counter++) {
        var insertObject = {
            'matrisA': matrisMaker(20, 20),
            'matrisB': matrisMaker(20, 20),
            'resultA': {},
            'resultB': {}
        };
        collection.insert(insertObject, function(err, docs) {
            if (err) throw err;
        });
        delete insertObject;
        if ((counter % 1000) == 0)
            console.log(counter);
    }
    db.close();
});
When I watch the log it shows that many records were inserted, say 50,000, but when I count the records in MongoDB it reports far fewer, somewhere around 1,000:
>use test;
>db.matris.count();
Where is the problem?
Your asynchronous code is flawed: your db.close() line executes before your asynchronous insert commands have all completed. You need to control the flow of your program so that you A) don't have millions of concurrent database inserts happening/queued and B) wait until they have all been processed by Mongo before closing the connection. Consider a helper library such as async.forEach if you don't want to code it yourself.
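For illustration only, here is a minimal sketch of one way to sequence the inserts without a helper library: each insert starts only after the previous one has been acknowledged, and db.close() runs only after the last callback fires. The constant TOTAL is an assumption; matrisMaker is the function from the question.

var MongoClient = require('mongodb').MongoClient;

MongoClient.connect('mongodb://127.0.0.1:27017/test', function(err, db) {
    if (err) throw err;
    var collection = db.collection('matris');
    var TOTAL = 10000000; // assumed total, as in the question

    function insertNext(counter) {
        if (counter >= TOTAL) {
            db.close(); // every insert has been acknowledged, so it is safe to close
            return;
        }
        collection.insert({
            matrisA: matrisMaker(20, 20),
            matrisB: matrisMaker(20, 20),
            resultA: {},
            resultB: {}
        }, function(err) {
            if (err) throw err;
            if (counter % 1000 === 0) console.log(counter);
            insertNext(counter + 1); // start the next insert only after this one completed
        });
    }

    insertNext(0);
});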
I'm following this link for finding data in MongoDB using Node.js.
My code is:
var counter = 0;

var findMongo = function(db, callback) {
    var cursor = db.collection('new').find({ "_id": ObjectId("56da6fd166efee0350399c21") });
    //var cursor = db.collection('new').find();
    cursor.each(function(err, doc) {
        counter = counter + 1;
        console.log(counter);
        assert.equal(err, null);
        if (doc != null) {
            //console.dir(doc);
            //console.log(doc);
        } else {
            console.log("in else,not found");
            callback();
        }
    });
};

MongoClient.connect(url, function(err, db) {
    assert.equal(null, err);
    findMongo(db, function() {
        db.close();
    });
});
Since I'm searching the DB by _id, findMongo should only run once. I'm getting the following result:
counter 1
counter 2
in else,not found
Why is the findMongo function called twice?
Two things to note:
1 - The counter = counter + 1 line runs twice because cursor.each invokes its callback once for the matching document and once more with a null doc to signal the end of the results; that is what is creating the confusion.
2 - You should use findOne instead of find; it is the better approach here because you are only interested in a single record, although there is no harm in using the latter.
Here is how to use db.collection.findOne():
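As an example, a minimal sketch of the findMongo function rewritten with findOne, using the collection name and _id from the question; its callback fires exactly once:

var findMongo = function(db, callback) {
    db.collection('new').findOne({ "_id": ObjectId("56da6fd166efee0350399c21") }, function(err, doc) {
        assert.equal(err, null);
        if (doc != null) {
            console.dir(doc);   // the matching document
        } else {
            console.log("not found");
        }
        callback();
    });
};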
I have a small program that reads each record and updates it. Given the async nature of Node and callbacks, what is the efficient and correct way to close the db connection?
Sample Program:
var MongoClient = require('mongodb').MongoClient;
var updateCount = 0;

MongoClient.connect('mongodb://localhost:27017/school', function(err, db) {
    if (err) throw err;
    var query = {};
    // get all the students in the database
    var cursor = db.collection('students').find(query);
    cursor.each(function(err, doc) {
        if (err) throw err;
        if (doc == null) {
            return;
        }
        // filter out only the homework scores
        var homeworksOnly = doc.scores.filter(function(scores) {
            if (scores.type === "homework") return true;
            return false;
        });
        // filter out the non homework scores
        var notHomeWorks = doc.scores.filter(function(scores) {
            if (scores.type !== "homework") return true;
            return false;
        });
        // sort the homework scores to remove the min score from the list
        homeworksOnly.sort(function(a, b) {
            if (a.score > b.score) return 1;
            if (b.score > a.score) return -1;
            return 0;
        });
        console.log("Before removing the min score " + doc._id);
        console.dir(homeworksOnly);
        console.log("After removing the min score " + doc._id);
        homeworksOnly.splice(0, 1);
        console.dir(homeworksOnly);
        console.log("Merge the homework with other scores " + doc._id);
        var newScores = homeworksOnly.concat(notHomeWorks);
        console.dir(newScores);
        console.log("*****");
        // Now update the database for this student with the new scores
        var search = { "_id": doc._id };
        var operator = { '$set': { 'scores': newScores } };
        db.collection('students').update(search, operator, function(err, updated) {
            if (err) throw err;
            updateCount++;
            console.dir("Successfully updated " + updated + " document! count: " + updateCount);
        });
    });
});
Now the program works, but I need to hit Ctrl+C to terminate it. Is there a way to know that all the callbacks have completed so that the program can terminate on its own?
There are better libraries you can integrate with Node.js to manage the callback flow, but simply working with the basic driver as a dependency, all you need is the basic node stream interface, which is already built into the cursor.
This provides .pause() and .resume() for flow control while processing, and an "end" event for when the cursor stream is complete:
var MongoClient = require('mongodb').MongoClient;
var updateCount = 0;

MongoClient.connect('mongodb://localhost:27017/school', function(err, db) {
    if (err) throw err;
    var query = {};
    // get all the students in the database
    var cursor = db.collection('students').find(query);

    // called on errors
    cursor.on("error", function(err) {
        throw err;
    });

    // called on stream complete
    cursor.on("end", function() {
        db.close();
    });

    // process each document in the stream
    cursor.on("data", function(doc) {
        cursor.pause(); // stops the cursor stream while processing
        // filter out only the homework scores
        var homeworksOnly = doc.scores.filter(function(scores) {
            if (scores.type === "homework") return true;
            return false;
        });
        // filter out the non homework scores
        var notHomeWorks = doc.scores.filter(function(scores) {
            if (scores.type !== "homework") return true;
            return false;
        });
        // sort the homework scores to remove the min score from the list
        homeworksOnly.sort(function(a, b) {
            if (a.score > b.score) return 1;
            if (b.score > a.score) return -1;
            return 0;
        });
        console.log("Before removing the min score " + doc._id);
        console.dir(homeworksOnly);
        console.log("After removing the min score " + doc._id);
        homeworksOnly.splice(0, 1);
        console.dir(homeworksOnly);
        console.log("Merge the homework with other scores " + doc._id);
        var newScores = homeworksOnly.concat(notHomeWorks);
        console.dir(newScores);
        console.log("*****");
        // Now update the database for this student with the new scores
        var search = { "_id": doc._id };
        var operator = { '$set': { 'scores': newScores } };
        db.collection('students').update(search, operator, function(err, updated) {
            if (err) throw err;
            updateCount++;
            console.dir("Successfully updated " + updated + " document! count: " + updateCount);
            cursor.resume(); // restart the stream now that we are done processing
        });
    });
});
After the update statement is done, use:
db.collection('students').update(search, operator, function(err, updated) {
    if (err) throw err;
    updateCount++;
    console.dir("Successfully updated " + updated + " document! count: " + updateCount);
});
db.close();
So let's say I have the following for loop:
for (var i = 0; i < array.length; i++) {
    Model.findOne({ _id: array[i].id }, function(err, found) {
        // Some stuff
    });
}
How do I make this code work? Every time I run it I get array[i] = undefined, because the MongoDB query is asynchronous and the loop has already iterated 5 times by the time the first query even completes. How do I go about tackling this issue and waiting for the query to complete before going on to the next iteration?
This doesn't specifically answer your question, but addresses your problem.
I'd use an $in query and do the filtering all at once. 20 calls to the db is pretty slow compared to 1:
// grab your ids
var arrayIds = myArray.map(function(item) {
    return item._id;
});

// find all of them
Model.find({ _id: { $in: arrayIds } }, function(error, foundItems) {
    if (error) {
        // error handle
    }

    // set up a map of the found ids
    var foundItemsMap = {};
    foundItems.forEach(function(item) {
        foundItemsMap[item._id] = true;
    });

    // pull out your items that haven't been created yet
    var newItems = [];
    for (var i = 0; i < myArray.length; i++) {
        var arrayItem = myArray[i];
        if (foundItemsMap[arrayItem._id]) {
            // this array item exists in the map of foundIds
            // so the item already exists in the database
        } else {
            // it doesn't exist, push it into the new array
            newItems.push(arrayItem);
        }
    }
    // now you have `newItems`, an array of objects that aren't in the database
});
One of the easiest ways to accomplish something like what you want is using promises. You could use the library q to do this:
var Q = require('q');

function fetchOne(id) {
    var deferred = Q.defer();
    Model.findOne({ _id: id }, function(err, found) {
        if (err) deferred.reject(err);
        else deferred.resolve(found);
    });
    return deferred.promise;
}

function fetch(ids, action) {
    if (ids.length === 0) return;
    var id = ids.pop();
    fetchOne(id).then(function(model) {
        action(model);
        fetch(ids, action);
    });
}

fetch([1, 2, 3, 4, 5], function(model) { /* do something */ });
It is not the most beautiful implementation, but I'm sure you get the picture :)
Not sure if this is the right way, and it could be a bit expensive, but this is how I did it. I think the trick is to pull all your data and then look for an id match.
Model.find(function(err, data) {
    if (err) {
        // handle it
        return;
    }
    for (var i = 0; i < array.length; i++) {
        for (var j = 0; j < data.length; j++) {
            if (data[j].id == array[i].id) {
                // do something
            }
        }
    }
});
This question has been asked before, BUT the answer that the OP accepted did not address my particular needs.
closing mongodb connection in node.js while inserting lot of data
I have a utility script that adds a lot of records to multiple collections. Really it is just an import that uses byline to read the VERY LARGE text files and then inserts the data into a collection:
var MongoClient = require("mongodb").MongoClient;
var fs = require("fs");
var byline = require("byline");

var inStream = fs.createReadStream("data.txt", { encoding: "utf8" });
var byLineStream = byline.createStream(inStream);

MongoClient.connect("mongodb://localhost:27017/test", { native_parser: true }, function(err, db) {
    var collection = db.collection("Data");
    db.dropCollection("Data", function(err, result) {
        byLineStream.on("data", function(line) {
            var o = parseLineToObject(line);
            collection.insert(o);
        });
    });
});
The answer suggested was to push all the data into an array and then use a single write and a callback to close the database when it is done. This is not a good answer as the files I am working with are very large and so consume large amounts of memory.
Another solution presented to a similar question was to use the async package to create an array of functions and then run them in parallel. Another bust but at least it doesn't create a huge single insert.
So the question: How do I close MongoDB connection once all the inserts are complete so that my script exits and does not hang?
I should add that I have tried the counting method, where I increment a counter variable in each insert callback. It doesn't work because at some point the insert callbacks complete faster than new inserts are started, causing the counter to hit 0 while lines are still being read, and thus closing the db too early.
You should set a flag when all lines have been read:
var readAllLines = false;

byLineStream.on("end", function() {
    readAllLines = true;
});
Next, you check for that flag after inserting each record. However, you also need to keep track of the number of lines that have been read, and how many are inserted, so you'll only close the database if all lines have been inserted (even out of order).
Putting everything together:
db.dropCollection("Data", function(err, result) {
    var lineCount = 0;
    var readAllLines = false;

    byLineStream.on("end", function() {
        readAllLines = true;
    });

    byLineStream.on("data", function(line) {
        lineCount++;
        var o = parseLineToObject(line);
        collection.insert(o, { w: 1 }, function() {
            if (--lineCount === 0 && readAllLines) {
                // we've read and inserted all lines
                db.close();
            }
        });
    });
});
However, I do believe that passing a callback to insert ('safe mode') is slower than your current solution, where you call insert but don't wait for its result. To speed things up, instead of writing each line separately, you can buffer an X amount of lines before inserting them in one statement.
Something similar to this (without the line counting):
var buffer = [];

byLineStream.on("data", function(line) {
    buffer.push(parseLineToObject(line));
    if (buffer.length > 100 || readAllLines) {
        collection.insert(buffer, { w: 1 }, function() {
            if (readAllLines) {
                db.close();
            }
        });
        buffer = [];
    }
});
var MongoClient = require("mongodb").MongoClient;
var fs = require("fs");
var byline = require("byline");

var inStream = fs.createReadStream("data.txt", { encoding: "utf8" });
var byLineStream = byline.createStream(inStream);

MongoClient.connect("mongodb://localhost:27017/test", { native_parser: true }, function(err, db) {
    var collection = db.collection("Data");
    db.dropCollection("Data", function(err, result) { // I am completely replacing the collection
        var insertCount = 0;
        var doneReadingFile = false;

        byLineStream.on("end", function(line) {
            doneReadingFile = true;
        });

        byLineStream.on("data", function(line) {
            insertCount++; // track pending inserts so the db is only closed once they have all finished
            var o = parseLineToObject(line);
            collection.insert(o, function(err, result) {
                insertCount--;
                if (insertCount === 0 && doneReadingFile) {
                    db.close();
                }
            });
        });
    });
});
I use the following code to insert 1,000,000 documents into MongoDB in a loop, but I found the node process takes up a lot of memory and my client dies.
db.collection("batch_insert", function (err, collection) {
if (!err) {
var count = 0;
for (var i = 0; i < 1000000; i++) {
collection.insert({hello:'world', ok:'OKOKOK'}, {safe:true, serializeFunctions:false}, function (err, result) {
count++;
if (1000000 == count) {
db.close();
}
});
}
} else {
console.log(err);
}
});
Your for loop blocks the event loop. It can't get to the next tick and handle query results until all the queries have been sent to MongoDB. You need an asynchronous way to batch-insert the data.
Something like this:
var mongo = require('mongodb');

var Inserter = function(collection) {
    this.collection = collection;
    this.data = [];
    this.maxThreads = 6;
    this.currentThreads = 0;
    this.batchSize = 5000;
    this.queue = 0;
    this.inserted = 0;
    this.startTime = Date.now();
};

Inserter.prototype.add = function(data) {
    this.data.push(data);
};

// Use force=true for last insert
Inserter.prototype.insert = function(force) {
    var that = this;
    if (this.data.length >= this.batchSize || force) {
        if (this.currentThreads >= this.maxThreads) {
            this.queue++;
            return;
        }
        this.currentThreads++;
        console.log('Threads: ' + this.currentThreads);
        this.collection.insert(this.data.splice(0, this.batchSize), { safe: true }, function() {
            that.inserted += that.batchSize;
            var currentTime = Date.now();
            var workTime = Math.round((currentTime - that.startTime) / 1000);
            console.log('Speed: ' + that.inserted / workTime + ' per sec');
            that.currentThreads--;
            if (that.queue > 0) {
                that.queue--;
                that.insert();
            }
        });
    }
};

var db = new mongo.Db('test', new mongo.Server('localhost', 27017, {}), { native_parser: false });
db.open(function(err, db) {
    db.collection('test', function(err, collection) {
        var inserter = new Inserter(collection);
        setInterval(function() {
            for (var i = 0; i < 5000; i++) {
                inserter.add({ test: 'test' });
            }
            inserter.insert();
        }, 0);
    });
});
MongoDB, just like any other database, takes some time to process requests. You're throwing a million requests at it, and since nothing in your code blocks, that means that at any time a whole bunch of them are going to be queued up somewhere (most likely in multiple places, with some of them inside the driver's code and others inside node's event loop). That takes more than a little bit of memory.
If the queuing didn't happen, you'd either block or drop some of the requests. There Ain't No Such Thing As A Free Lunch.
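As an illustration only (not from either answer above), here is a minimal sketch of one way to cap the number of in-flight inserts so the driver never has to queue a million requests at once. TOTAL and MAX_IN_FLIGHT are assumed constants, and collection/db are the ones from the question's callback:

var TOTAL = 1000000;    // assumed total, as in the question
var MAX_IN_FLIGHT = 10; // assumed cap on concurrent inserts
var started = 0;
var finished = 0;

function insertMore(collection, db) {
    // keep starting inserts until the in-flight cap is reached
    while (started < TOTAL && started - finished < MAX_IN_FLIGHT) {
        started++;
        collection.insert({ hello: 'world', ok: 'OKOKOK' }, { safe: true }, function(err) {
            if (err) throw err;
            finished++;
            if (finished === TOTAL) {
                db.close(); // everything acknowledged, safe to close
            } else {
                insertMore(collection, db); // top the in-flight pool back up
            }
        });
    }
}

Calling insertMore(collection, db) once from inside the collection callback would then drive all the inserts while never holding more than MAX_IN_FLIGHT pending requests at a time.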