How can I output the result of a MongoDB aggregation into a collection without replacing what another aggregation has already written to that collection?
I need to get the data only via $out: 'tempCollection', because I have 500 million documents and I am hitting the pipeline stage limit.
// Shared pipeline: filter by `query`, collapse to distinct `hash` values,
// and write the output to 'tempCollection'.
var q = [
{$match: query},
{$group: {_id: '$hash'}},
{$out: 'tempCollection'}
];
// Run the same pipeline against each source collection concurrently.
// Every task writes to the same $out target, 'tempCollection'.
async.parallel([
function(callback) {
firstCollection.aggregate(q, callback);
},
function(callback) {
secondCollection.aggregate(q, callback);
},
...
], function() {
// I want to get all from tempCollection (with pagination) here
});
The bottom line here is that the $out option only ever "replaces" output on the target collection. So to do anything else you must work through a client connection rather than just outputting to the server.
Your best option here with mongoose is to step straight into the underlying driver and get access to the node stream interface as supported by the driver.
Trivial example, but it shows the basic way to structure it:
var async = require('async'),
mongoose = require('mongoose'),
Schema = mongoose.Schema;
// Connect to the local 'aggtest' database.
mongoose.connect('mongodb://localhost/aggtest');
// Schema with no declared fields; strict mode is off and the "_id" schema option is disabled.
var testSchema = new Schema({},{ "_id": false, strict: false });
// Three models sharing the same schema: ModelA and ModelB are read from
// below, and ModelC receives the merged output.
var ModelA = mongoose.model( 'ModelA', testSchema ),
ModelB = mongoose.model( 'ModelB', testSchema ),
ModelC = mongoose.model( 'ModelC', testSchema );
/**
 * Stream every document from `cursor` into `target`, upserting one document
 * per `_id` (existing documents are left untouched via `$setOnInsert`).
 *
 * The cursor is paused while each upsert is in flight and resumed once it
 * succeeds, so at most one write is outstanding per cursor.
 *
 * @param {Object} cursor - Emits "data", "end" and "error"; supports pause()/resume().
 * @param {Object} target - Exposes update(query, update, options, cb).
 * @param {Function} callback - Invoked exactly once: with an error on the
 *   first failure, or with no arguments when the cursor ends.
 */
function processCursor(cursor,target,callback) {
  var finished = false;

  // Guarantee a single completion signal: a failed upsert followed by a
  // later "end"/"error" event must not invoke the callback a second time.
  function done(err) {
    if (finished) return;
    finished = true;
    callback(err);
  }

  cursor.on("end",function() { done(); });
  cursor.on("error",done);
  cursor.on("data",function(data) {
    cursor.pause();
    target.update(
      { "_id": data._id },
      { "$setOnInsert": { "_id": data._id } },
      { "upsert": true },
      function(err) {
        // On failure report the error and leave the cursor paused rather
        // than continuing to write after the caller has been told we failed.
        if (err) return done(err);
        cursor.resume();
      }
    );
  });
}
// Demo driver: wipe the collections, seed the two sources, merge both into
// ModelC through processCursor, then print the merged collection.
async.series(
  [
    // Clean data
    function(next) {
      var allModels = [ModelA, ModelB, ModelC];
      async.each(allModels, function(model, done) {
        model.remove({}, done);
      }, next);
    },

    // Sample data
    function(next) {
      var sampleIds = [1, 2, 3];
      async.each([ModelA, ModelB], function(model, modelDone) {
        async.each(sampleIds, function(id, idDone) {
          model.create({ "_id": id }, idDone);
        }, modelDone);
      }, next);
    },

    // Run merge
    function(next) {
      // Build a task that streams one source model's grouped ids into ModelC.
      function mergeFrom(source) {
        return function(done) {
          var cursor = source.collection.aggregate(
            [{ "$group": { "_id": "$_id" } }],
            { "batchSize": 25 }
          );
          processCursor(cursor, ModelC, done);
        };
      }

      async.parallel([mergeFrom(ModelA), mergeFrom(ModelB)], next);
    },

    // Get merged
    function(next) {
      ModelC.find({}, function(err, merged) {
        console.log(merged);
        next(err);
      });
    }
  ],
  function(err) {
    if (err) {
      throw err;
    }
    mongoose.disconnect();
  }
);
Outside of that, you are going to need to $out to "separate" collections, and then merge them in with a similar .update() process; but to keep it "server side" you would need to use .eval().
It's not nice, but that is the only way to keep operations on the server. You can also modify this with "Bulk" operations ( again through the same native .collection interface ) for a bit more throughput. But the options come down to "read through the client" or "eval".
Related
I am using the below code to insert data to mongodb
// POST /NewStory: add a story sub-document to the 'clnTemple' document whose
// _id equals req.body.postId.
router.post('/NewStory', function (req, res) {
// NOTE(review): userId and story are not defined in this snippet — presumably
// taken from the request; confirm. The story's _id is stored as a hex string
// because of .toHexString().
var currentObject = { user: userId , story : story , _id:new ObjectID().toHexString() };
// upsert: true asks for the parent document to be created when nothing matches the query.
req.db.get('clnTemple').findAndModify({
query: { _id: req.body.postId },
update: { $addToSet: { Stories: currentObject } },
upsert: true
});
// NOTE(review): no callback is passed above and nothing is sent on `res` in this snippet.
});
This code works fine if I remove the _id: new ObjectID().toHexString()
What I want to achieve here is that every new story gets a unique _id attached to it.
What am I doing wrong?
{
"_id": {
"$oid": "55ae24016fb73f6ac7c2d640"
},
"Name": "some name",
...... some other details
"Stories": [
{
"userId": "105304831528398207103",
"story": "some story"
},
{
"userId": "105304831528398207103",
"story": "some story"
}
]
}
This is the document model; the _id that I am trying to create is for the stories.
You should not be calling .toHexString() on this as you would be getting a "string" and not an ObjectID. A string takes more space than the bytes of an ObjectId.
var async = require('async'),
mongo = require('mongodb'),
db = require('monk')('localhost/test'),
ObjectID = mongo.ObjectID;
// Upsert a document in 'junk' and add `obj` (carrying its own ObjectID) to
// its "stories" set, printing the modified document.
var coll = db.get('junk');
var obj = { "_id": new ObjectID(), "name": "Bill" };

coll.findAndModify(
  { "_id": new ObjectID() },
  { "$addToSet": { "stories": obj } },
  { "upsert": true, "new": true },
  function onModified(err, updated) {
    if (err) {
      throw err;
    }
    console.log(updated);
  }
);
So that works perfectly for me. Noting the "new" option there as well so the modified document is returned, rather than the original form of the document which is the default.
{ _id: 55c04b5b52d0ec940694f819,
stories: [ { _id: 55c04b5b52d0ec940694f818, name: 'Bill' } ] }
There is however a catch here, and that is that if you are using $addToSet and generating a new ObjectId for every item, then that new ObjectId makes everything "unique". So you would keep adding things into the "set". This may as well be $push if that is what you want to do.
So if userId and story in combination already make this "unique", then do this way instead:
// Push a new story only when no existing array element already has this
// userId/story combination; the modified document is printed.
coll.findAndModify(
  {
    "_id": docId,
    "stories": {
      "$not": { "$elemMatch": { "userId": userId, "story": story } }
    }
  },
  {
    "$push": {
      "stories": { "userId": userId, "story": story, "_id": new ObjectID() }
    }
  },
  { "new": true },
  function onPushed(err, updated) {
    if (err) {
      throw err;
    }
    console.log(updated);
  }
);
So test for the presence of the unique elements in the array, and where they do not exist, append them to the array. Also note that you cannot do an "inequality match" on the array element while mixing with "upserts". Your test to "upsert" the document should be on the primary "_id" value only. Managing array entries and document "upserts" needs to be done in separate update operations. Do not try to mix the two, otherwise you will end up creating new documents when you did not intend to.
By the way, you can generate an ObjectID just using monk.
// Open a monk connection using the database string held in `credentials`.
var db = monk(credentials.database);
// Take the ObjectID helper off the monk instance instead of requiring it separately.
var ObjectID = db.helper.id.ObjectID
console.log(ObjectID()) // generates an ObjectID
Just a note I am fairly new to mongo and more notably very new to using node/js.
I'm trying to write a query to insert new documents or update already existing documents in my collection.
The proposed structure of the collection is:
{ _id: xxxxxxx, ip: "xxx.xxx.xxx.xxx:xxxxxx", date: "xx-xx-xx xxxx" }
Note that my intention is to store a fixed-length int for the _id rather than the internal ObjectId (is this possible/considered bad practice?). The int is guaranteed to be unique and comes from another source.
var monk = require('monk');
// Connect to the cgo_schedule database on the local MongoDB instance.
var db = monk('localhost:27017/cgo_schedule');
// Attempts to insert or update one document in 'cgo_schedule', keyed by
// match.matchId, setting its ip and date.
//   db    - monk connection used to obtain the collection
//   match - object read for its matchId, ip and date properties
var insertDocuments = function(db, match) {
// Redundant: re-declares the `db` parameter and assigns it to itself.
var db = db;
var collection = db.get('cgo_schedule');
collection.findAndModify(
{
"query": { "_id": match.matchId },
"update": { "$set": {
"ip": match.ip,
"date": match.date
},
"$setOnInsert": {
"_id": match.matchId,
}},
// NOTE(review): "options" is nested inside the first argument here rather
// than being passed as its own parameter.
"options": { "new": true, "upsert": true }
},
function(err,doc) {
if (err) throw err;
console.log( doc );
}
);
}
This doesn't work at all however. It doesn't insert anything to the database, but it also gives no errors, so I have no idea what I'm doing wrong.
The output (for console.log (doc)) is null.
What am I doing wrong?
The Monk docs aren't much help, but according to the source code, the options object must be provided as a separate parameter.
So your call should look like this instead:
// Same upsert, with the options object passed as its own (second) argument.
collection.findAndModify(
  {
    "query": { "_id": match.matchId },
    "update": {
      "$set": { "ip": match.ip, "date": match.date }
    }
  },
  { "new": true, "upsert": true },
  function onUpserted(err, doc) {
    if (err) {
      throw err;
    }
    console.log( doc );
  }
);
Note that I removed the $setOnInsert part as the _id is always included on insert with an upsert.
Does anybody know how one would go about using the $out operator to push the results of a MongoDB aggregation function into a new collection in node.js?
This is what I have:
var fs = require('fs');
var assert = require('assert');
var ObjectId = require('mongodb').ObjectID;
var MongoClient = require('mongodb').MongoClient
, format = require('util').format;
/**
 * Group the 'people' collection by `code`, collecting each group's e-mail
 * addresses and member count, log the groups and hand them to `callback`.
 *
 * @param {Object} db - Exposes collection(name).aggregate(pipeline).toArray(cb).
 * @param {Function} callback - Called with the array of grouped results.
 */
var createGroups = function(db, callback) {
  var people = db.collection('people');
  var groupStage = {
    $group: { "_id": "$code", "sendees" : {$push : "$email"}, "count": { $sum: 1 } }
  };

  function onGroups(err, groups) {
    assert.equal(err, null);
    console.log(groups);
    callback(groups);
  }

  people.aggregate([groupStage]).toArray(onGroups);
};
// Connect, run the grouping once, and close the connection when it reports back.
MongoClient.connect('mongodb://localhost:12121/systest', function onConnected(err, db) {
  assert.equal(null, err);

  var closeConnection = function() {
    db.close();
  };

  createGroups(db, closeConnection);
});
Which outputs to the console fine, exactly as I'd expect - but I'm having little luck trying to export this to a new collection.
Thanks!
You just need to supply $out in your aggregation. See the docs.
So your code should look like:
// Group 'people' by code (collecting e-mails and a per-group count), then
// write the grouped documents to another collection via a final $out stage.
db.collection('people').aggregate(
[
{
$group: {
"_id": "$code",
"sendees" : {$push : "$email"},
"count": { $sum: 1 }
}
},
// Output stage: the string is the name of the collection to write to.
{
$out: "collection name"
}
]
)
You could try using the mongo-aggregate-out package, which saves aggregation results to a collection for MongoDB versions < 2.6. If your MongoDB version is 2.6 or newer, the module acts as a pass-through and uses the native feature:
// aggregateOut: helper loaded from the mongo-aggregate-out package.
// pipelineArray: a single $group stage keyed by code, collecting e-mails
// and counting the documents in each group.
var aggregateOut = require('mongo-aggregate-out'),
pipelineArray = [
{
$group: {
"_id": "$code",
"sendees" : {$push : "$email"},
"count": { $sum: 1 }
}
}
];
/**
 * Run `pipelineArray` over 'people' through aggregateOut with
 * `out: "newCollection"`, then pass a find() cursor over 'newCollection'
 * to `callback`.
 *
 * @param {Object} db - Database handle exposing collection(name).
 * @param {Function} callback - Called with the cursor from find().
 */
var createGroups = function(db, callback) {
  var source = db.collection('people');
  var outOptions = { out: "newCollection"};

  aggregateOut(source, pipelineArray, outOptions, function onAggregated(err) {
    assert.equal(err, null);
    callback(db.collection('newCollection').find());
  });
};
Thanks for your answers folks - both very helpful. As it happens I was using an outdated version of mongo from before $out came into being - but once I sorted this out and did as you suggested above it worked perfectly.
I have following data in my Mongodb.
{
"_id" : ObjectId("54a0d4c5bffabd6a179834eb"),
"is_afternoon_scheduled" : true,
"employee_id" : ObjectId("546f0a06c7555ae310ae925a")
}
I would like to use populate with aggregate, so that the employee's complete information is fetched in the same response. I need help with this. My code is:
var mongoose = require("mongoose");
// NOTE(review): this hex value equals the sample document's _id shown above,
// not its employee_id — confirm which id is intended.
var empid = mongoose.Types.ObjectId("54a0d4c5bffabd6a179834eb");
// Count Availability documents for the employee, grouped by employee_id,
// and return the grouped result as JSON.
Availability.aggregate()
.match( { employee_id : empid} )
.group({_id : "$employee_id",count: { $sum: 1 }})
.exec(function (err, response) {
// An error is only logged; the success response below is still sent.
if (err) console.log(err);
res.json({"message": "success", "data": response, "status_code": "200"});
}
);
The response I am getting is:
{"message":"success","data":{"_id":"54a0d4c5bffabd6a179834eb","count":1},"status_code":"200"}
My expected response is:
{"message":"success","data":[{"_id":"54aa34fb09dc5a54232e44b0","count":1, "employee":{fname:abc,lname:abcl}}],"status_code":"200"}
You can call the model form of .populate() on the result objects from an aggregate operation. But the thing is you are going to need a model to represent the "Result" object returned by your aggregation in order to do so.
There are a couple of steps, best explained with a complete listing:
var async = require('async'),
mongoose = require('mongoose'),
Schema = mongoose.Schema;
// Employee documents: first and last name.
var employeeSchema = new Schema({
"fname": String,
"lname": String
})
// Availability documents; employee_id is declared as a reference to Employee.
var availSchema = new Schema({
"is_afternoon_scheduled": Boolean,
"employee_id": {
"type": Schema.Types.ObjectId,
"ref": "Employee"
}
});
// Shape of one aggregation result: the grouped _id (declared as a reference
// to Employee so that it can be populated) plus the count.
var resultSchema = new Schema({
"_id": {
"type": Schema.Types.ObjectId,
"ref": "Employee"
},
"count": Number
});
var Employee = mongoose.model( "Employee", employeeSchema );
var Availability = mongoose.model( "Availability", availSchema );
// Result is used only to wrap aggregation output; null is passed as the
// third (collection name) argument.
var Result = mongoose.model( "Result", resultSchema, null );
mongoose.connect('mongodb://localhost/aggtest');
// End-to-end demo: reset the collections, insert one employee with one
// availability record, then aggregate availability per employee and populate
// each grouped `_id` with the matching Employee document.
async.series(
  [
    // Remove all existing Employee and Availability documents.
    function(callback) {
      async.each([Employee,Availability],function(model,callback) {
        model.remove({},function(err,count) {
          console.log( count );
          callback(err);
        });
      },callback);
    },

    // Create one employee, then one availability entry referencing it.
    function(callback) {
      async.waterfall(
        [
          function(callback) {
            var employee = new Employee({
              "fname": "abc",
              "lname": "xyz"
            });
            employee.save(function(err,employee) {
              console.log(employee);
              callback(err,employee);
            });
          },
          function(employee,callback) {
            var avail = new Availability({
              "is_afternoon_scheduled": true,
              "employee_id": employee
            });
            avail.save(function(err,avail) {
              console.log(avail);
              callback(err);
            });
          }
        ],
        callback
      );
    },

    // Count availability records per employee and populate the employee data.
    function(callback) {
      Availability.aggregate(
        [
          { "$group": {
            "_id": "$employee_id",
            "count": { "$sum": 1 }
          }}
        ],
        function(err,results) {
          // Propagate aggregation failures instead of calling .map() on an
          // undefined result set.
          if (err) return callback(err);

          // Aggregation returns raw objects; wrap each one in a Result
          // document so that populate can be applied to it.
          results = results.map(function(result) {
            return new Result( result );
          });

          Employee.populate(results,{ "path": "_id" },function(err,results) {
            console.log(results);
            callback(err);
          });
        }
      );
    }
  ],
  function(err,result) {
    if (err) throw err;
    mongoose.disconnect();
  }
);
That's the complete example, but taking a closer look at what happens inside the aggregate result is the main point:
// Callback passed to Availability.aggregate(): `results` holds the raw grouped objects.
function(err,results) {
// Wrap each raw aggregation result in a Result model instance.
results = results.map(function(result) {
return new Result( result );
});
// Populate the "_id" path of every result using the Employee model.
Employee.populate(results,{ "path": "_id" },function(err,results) {
console.log(results);
callback(err);
});
}
The first thing to be aware of is that the results returned by .aggregate() are not mongoose documents as they would be in a .find() query. This is because aggregation pipelines typically alter the document in results from what the original schema looked like. Since it is just a raw object, each element is re-cast as a mongoose document for the Result model type defined earlier.
Now in order to .populate() with data from Employee, the model form of this method is called on the array of results in document object form along with the "path" argument to the field to be populated.
The end result fills in the data as it comes from the Employee model it was related to.
[ { _id:
{ _id: 54ab2e3328f21063640cf446,
fname: 'abc',
lname: 'xyz',
__v: 0 },
count: 1 } ]
Different to how you process with find, but it is necessary to "re-cast" and manually call in this way due to how the results are returned.
This works like populate applied to the aggregate result, using an inner query:
var mongoose = require("mongoose");
var empid = mongoose.Types.ObjectId("54a0d4c5bffabd6a179834eb");
// Count availability records for `empid`, then load the matching user and
// return it as JSON. Every path (error, no match, success) sends exactly one
// response.
Availability.aggregate()
  .match( { employee_id : empid} )
  .group({_id : "$employee_id",count: { $sum: 1 }})
  .exec(function (err, response) {
    // Log the failure and answer the request instead of continuing with an
    // undefined result.
    function fail(error) {
      console.log(error);
      res.json({"message": "error", "data": [], "status_code": "500"});
    }

    if (err) {
      return fail(err);
    }

    // Nothing matched: reply with an empty list rather than leaving the
    // request without a response.
    if (!response.length) {
      return res.json({"message": "success", "data": [], "status_code": "200"});
    }

    // The original counter loop only ever issued the lookup for the final
    // element, so take that id directly (and keep it a local variable).
    var empID = response[response.length - 1]._id;

    User.find({_id: empID}, function(err, users){
      if (err) {
        return fail(err);
      }
      res.json({"message": "success", "data": users, "status_code": "200"});
    });
  });
I've been successfully using $in in my node webservice when my mongo arrays only held ids. Here is sample data.
{
"_id": {
"$oid": "52b1a60ce4b0f819260bc6e5"
},
"title": "Sample",
"team": [
{
"$oid": "52995b263e20c94167000001"
},
{
"$oid": "529bfa36c81735b802000001"
}
],
"tasks": [
{
"task": {
"$oid": "52af197ae4b07526a3ee6017"
},
"status": 0
},
{
"task": {
"$oid": "52af197ae4b07526a3ee6017"
},
"status": 1
}
]
}
Notice that tasks is an array, but the id is nested in "task", while in team it is at the top level. This is where my question lies.
In my Node route, this is how I typically deal with calling a array of IDs in my project, this works fine in the team example, but obviously not for my task example.
// GET /api/tasks/project/:id — load the project by id, then look up its tasks
// and send them back.
app.get('/api/tasks/project/:id', function (req, res) {
var the_id = req.params.id;
// NOTE(review): `query` is read from req.params, but the route declares only
// the :id parameter — confirm this is intended.
var query = req.params.query;
// NOTE(review): the callback takes a single `data` argument; verify the
// callback signature of models.Projects.findById.
models.Projects.findById(the_id, null, function (data) {
// The project's raw tasks array is passed as the list of ids.
models.Tasks.findAllByIds({
ids: data._doc.tasks,
query: query
}, function(items) {
console.log(items);
res.send(items);
});
});
});
That communicates with my model which has a method called findAllByIds
module.exports = function (config, mongoose) {
var _TasksSchema = new mongoose.Schema({});
var _Tasks = mongoose.model('tasks', _TasksSchema);
/*****************
* Public API
*****************/
return {
Tasks: _Tasks,
findAllByIds: function(data, callback){
var query = data.query;
_Tasks.find({_id: { $in: data.ids} }, query, function(err, doc){
callback(doc);
});
}
}
}
In this call I have $in: data.ids which works in the simple array like the "team" example above. Once I nest my object, as with the "task" sample, this does not work anymore, and I am not sure how to specify $in to look at the data.ids array, but use the "task" value.
I'd like to avoid having to iterate through the data to create an array with only id, and then repopulate the other values once the data is returned, unless that is the only option.
Update
I had a thought of setting up my mongo document like this, although I'd still like to know how to do it the other way, in the event this isn't possible in the future.
"tasks": {
"status0": [
{
"$oid": "52995b263e20c94167000001"
},
{
"$oid": "529bfa36c81735b802000001"
}
],
"status1": [
{
"$oid": "52995b263e20c94167000001"
},
{
"$oid": "529bfa36c81735b802000001"
}
]
}
You can call map on the tasks array to project it into a new array with just the ObjectId values:
// Project each { task, status } entry down to its `task` id before handing
// the list to findAllByIds.
models.Tasks.findAllByIds({
ids: data.tasks.map(function(value) { return value.task; }),
query: query
}, function(items) { ...
Have you tried the $elemMatch option in the find conditions? http://docs.mongodb.org/manual/reference/operator/query/elemMatch/