I have a big collection in MongoDB: size - 94.605.081.327 B count - 54.738.234. I have to change all documents in this collection, and it can't be done in a single update.
I would like to stream the data and update the collection one document at a time. Another developer on the same project recommended an approach like this:
stream.on('data', (data)=>{
stream.pause();
data.field = 'newValue'; // update
data.save((err)=>{
stream.resume();
})
})
Is it a good idea? Is there a more efficient way of doing this update with Node.js and Mongoose?
You can use bulk operation. Check out MongoDB documentation for detailed information. If you need to involve mongoose, your code would look something like this:
var bulk = Items.collection.initializeOrderedBulkOp();
bulk.find(query).update(update);
bulk.execute(function(error) {
callback();
});
Related
async.parallel([
function(callback){
con.Attandance.insert({'xxx':'a'}, function(err,data) {
console.log(data);
callback();
});
}, function(callback) {
console.log(data);
con.Profile.insert({'xxx':'a'},function(err){callback()});
}
], function(err) {
console.log('Both a and b are saved now');
});
Attendance.insert() works either Profile.insert() execute or fails. I want if any of them fails data should not be saved in any collection either in Attendance or in Profile
What you mean are transactions, which have nothing to do with synchronous / asynchronous.
Unfortunately, MongoDB simply does not support transactions. The only way to achieve something even remotely close, you have to perform either a two phase commit, or implement a custom rollback logic to undo all changes to Attandance if the changes to Profile failed.
The only possibility to at least achieve atomic (yet not transactions!) updates, is by changing your model. If the Profile is a container for all Attandance instances, you can update the entire object at one. It's impossible to update more than one object atomically with MongoDB, and neither is it possible to achieve a strict order of transactions.
If you need that, go for an SQL database instead. Pretty much all (except SQlite) support transactions.
I wrote a library that implements the two phase commit system (mentioned in a prior answer) described in the docs. It might help in this scenario. Fawn - Transactions for MongoDB.
var Fawn = require("Fawn");
// intitialize Fawn
Fawn.init("mongodb://127.0.0.1:27017/testDB");
/**
optionally, you could initialize Fawn with mongoose
var mongoose = require("mongoose");
mongoose.connect("mongodb://127.0.0.1:27017/testDB");
Fawn.init(mongoose);
**/
// after initialization, create a task
var task = Fawn.Task();
task.save("Attendance", {xxx: "a"})
.save("Profile", {xxx: "a"})
.run()
.then(function(results){
// task is complete
// result from first operation
var firstUpdateResult = results[0];
// result from second operation
var secondUpdateResult = results[1];
})
.catch(function(err){
// Everything has been rolled back.
// log the error which caused the failure
console.log(err);
});
In order to improve the performance of many single Mongo document updates #Node, I consider using Mongo's Bulk operation - to update as many as 1000 documents at each iteration.
In this use case, each individual update opeartion may or may not occur - an update will only occur if the document version had not changed since it was last read by the updater. If a docuemnt was not updated, the application needs to retry and/or do other stuff to hadnle the situation.
Currently the Node code looks like this:
col.update(
{_id: someid, version:someversion},
{$set:{stuf:toupdate, version:(someversion+1)}},
function(err, res) {
if (err) {
console.log('wow, something is seriously wrong!');
// do something about it...
return;
}
if (!res || !res.result || !res.result.nModified) { // no update
console.log('oops, seems like someone updated doc before me);
// do something about it...
return;
}
// Great! - Document was updated, continue as usual...
});
Using Mongo's Bulk unordered operations, is there a way to know which of the group of (1000) updates had succeeded and which had not been performed (in this case due to wrong version)?
The code I started playing with looks like:
var bulk = col.initializeUnorderedBulkOp();
bulk.find({_id: someid1, version:someversion1}).updateOne(
{$set:{stuf:toupdate1, version:(someversion1+1)}});
bulk.find({_id: someid2, version:someversion2}).updateOne(
{$set:{stuf:toupdate2, version:(someversion2+1)}});
...
bulk.find({_id: someid1000, version:someversion1000}).updateOne(
{$set:{stuf:toupdate1000, version:(someversion1000+1)}});
bulk.execute(function(err, result) {
if (err) {
console.log('wow, something is seriously wrong!');
// do something about it...
return;
}
if (result.nMatched < 1000) { // not all got updated
console.log('oops, seems like someone updated at least one doc before me);
// But which of the 1000 got updated OK and which had not!!!!
return;
}
// Great! - All 1000 documents got updated, continue as usual...
});
I was unable to find a Mongo solution for that.
The solution I used was to revert to per document operation if the bulk operation failed... This gives reasonable performance in most cases.
I have a mongodb Relationships collection that stores the user_id and the followee_id(person the user is following). If I query for against the user_id I can find all the the individuals the user is following. Next I need to query the Users collection against all of the returned followee ids to get their personal information. This is where I confused. How would I accomplish this?
NOTE: I know I can embed the followees in the individual user's document and use and $in operator but I do not want to go this route. I want to maintain the most flexibility I can.
You can use an $in query without denormalizing the followees on the user. You just need to do a little bit of data manipulation:
Relationship.find({user_id: user_id}, function(error, relationships) {
var followee_ids = relationships.map(function(relationship) {
return relationship.followee_id;
});
User.find({_id: { $in: followee_ids}}, function(error, users) {
// voila
});
};
if i got your problem right(i think so).
you need to query each of the "individuals the user is following".
that means to query the database multiple queries about each one and get the data.
because the queries in node.js (i assume you using mongoose) are asynchronies you need to get your code more asynchronies for this task.
if you not familier with the async module in node.js it's about time to know it.
see npm async for docs.
i made you a sample code for your query and how it needs to be.
/*array of followee_id from the last query*/
function query(followee_id_arr, callback) {
var async = require('async')
var allResults = [];
async.eachSerias(followee_id_arr, function (f_id, callback){
db.userCollection.findOne({_id : f_id},{_id : 1, personalData : 1},function(err, data){
if(err) {/*handel error*/}
else {
allResults.push(data);
callback()
}
}, function(){
callback(null, allResults);
})
})
}
you can even make all the queries in parallel (for better preformance) by using async.map
I think this is a very simple question? I am a beginner trying to learn mongo with node.
Once I have saved something to a collection, how can I pull it out in simple var format?
db.highschools.save({
hsid :10,
name :"Johnson High School",
location:"San Diego, CA"
});
I simply want to store a var as 'Johnson High School'.
My failed attempts that have returned undefined are as follows...
var hsName = db.highschools.find({hsid:10}).name;
var hsName = db.highschools.find({hsid:10}).name.str;
Pretty sure I'm missing the big picture here, would someone please be kind enough to help me figure this out?
Use findOne instead:
var hsName = db.highschools.findOne({hsid:10}).name;
Also, note that this is a Mongo script, not a NodeJS script.
You'll need to make it async when you write the logic in NodeJS.
db.collection('students', function(err, collection) {
collection.findOne({hsid:10}, function(err, student) {
if (err) { throw err; }
console.log(student.name);
});
});
If you're confident that there should be only one result, then you can use the shortcut method findOne which simply calls find internally with a limit of one. If you were to use find, it returns an array of matches.
I was going through the mongodb and nodejs course on MongoDBUniversity and one of the task involves finding the documents which has the highest recorded temperature for any state and then add a field "month_high" to it.I am able to find the documents for the state with the highest temperature but am unable to update it. The code is as below.
Can someone tell me what might I be doing wrong?
var MongoClient=require('mongodb').MongoClient;
MongoClient.connect('mongodb://localhost:27017/course',function(err,db){
var cursor=db.collection("weather").find();
cursor.sort({"State":1,"Temperature":-1});
var oldState,newState;
cursor.each(function(err,doc){
if(err)throw err;
if(doc==null){
return db.close();
}
newState=doc.State;
if(newState!=oldState){
var operator={'$set':{"month_high":true}};
var query={"_id":doc._id};
console.log(doc._id+" has temp "+doc.Temperature+" "+doc.State);
db.collection("weather").update(doc,operator,function(err,updated){
console.log("hi");//---->Never Logs
if(err)throw err;
// console.log(JSON.stringify(updated));
})
}
oldState=newState;
});
});
I'm not 100% sure, but given the syntax reported on the docs you might have to specify the options parameter even if not using it:
db.collection("weather").update(doc,operator, options, function(err,updated)
Also, the connection might get closed before the callbacks are called. Does it change anything if you remove the db.close() call?
Collection name is 'data'. In this homework 'weather' is database name.
See
https://education.mongodb.com/courses/10gen/M101JS/2013_October/courseware/CRUD/Homework_2.2/
> use weather
switched to db weather
> db.data.findOne()