MongoDB sort by relevance - node.js

I am trying to get documents from MongoDB on node. Let's say documents have the following structure:
{ "_id": ObjectId, "title" : String, "tags" : Array<String> }
I'd like to sort them by relevance - so when I'm looking for documents that have either the "blue" or "yellow" tag, I'd like to get the ones with both tags first. So far, this is what I managed to put together through Google, trial and error:
// Tags to search for; documents matching more of these rank higher.
var tags = [ "yellow", "blue" ];
db.collection('files').aggregate([
// Keep only the tags field for the counting stages below
{ $project : { tags: 1 } },
// One document per tag entry
{ $unwind : "$tags" },
// Keep only the tags being searched for
{ $match : { "tags": { "$in": tags } } },
// Count the matching tags per original document
{ $group : { _id: "$_id", relevance: { $sum:1 } } },
// Most relevant (most matching tags) first
{ $sort : { relevance : -1 } },
], function(err, success) {
console.log(success);
});
It works just fine, I get sorted collection of ids:
[{"_id":"5371355045002fc820a09566","relevance":2},{"_id":"53712fc6c8fcd124216de6cd","relevance":2},{"_id":"5371302ebd4725dc1b908316","relevance":1}]
Now I would make another query and ask for documents with those ids - but here's my question: can it be done in one query?

Yes you can. As is always the case, when you are actually grouping on _id then that value is essentially equivalent to the whole document. So it is just a matter of storing the whole document under the _id field.
You have a couple of approaches to this depending on your MongoDB version, and in versions prior to MongoDB 2.6 you must specify the whole document structure in an initial $project stage ( which may optionally come after a $match which is generally a good idea ) in your pipeline before you actually manipulate the document:
var tags = ["yellow","blue"];
db.collection.aggregate([
// Stash the whole document structure under _id so it survives the $group
{ "$project" : {
"_id": {
"_id": "$_id",
"title": "$title",
"tags": "$tags"
},
"tags": 1
}},
{ "$unwind": "$tags" },
{ "$match": { "tags": { "$in": tags } } },
// _id here is the stashed document, so grouping keeps it intact
{ "$group": { "_id": "$_id", "relevance": { "$sum":1 } } },
{ "$sort": { "relevance" : -1 } },
// Restore the original document shape from the stashed _id
{ "$project": {
_id: "$_id._id",
"title": "$_id.title",
"tags": "$_id.tags"
}}
])
And of course, at the end of the pipeline you extract the information from the _id field in order to get back your original structure. That is optional, but you usually want that.
For MongoDB 2.6 and above there is a variable available to the pipeline stages that holds the structure of the document at that stage of the pipeline known as $$ROOT, and you can access this as a kind of shortcut to the above form like so:
var tags = ["yellow","blue"];
db.collection.aggregate([
// $$ROOT (MongoDB 2.6+) is the whole current document — a shorthand for
// listing every field explicitly as in the previous example
{ "$project" : {
"_id": "$$ROOT",
"tags": 1
}},
{ "$unwind": "$tags" },
{ "$match": { "tags": { "$in": tags } } },
{ "$group": { "_id": "$_id", "relevance": { "$sum":1 } } },
{ "$sort": { "relevance" : -1 } },
// Unpack the stored document back to its original shape
{ "$project": {
"_id": "$_id._id",
"title": "$_id.title",
"tags": "$_id.tags"
}}
])
Keeping in mind that in order to restore the document you still need to specify all the required fields.
I would note that as you are "filtering" documents with your match condition in this case and as was mentioned earlier, you should actually be filtering with a $match statement at the "head" of your pipeline. This is the only place where the aggregation framework can select an index in order to optimize the query, and it also reduces the number of documents that do not meet your conditions ( presuming that not everything has the tags "yellow" or "blue" ) that go through the remaining pipeline stages:
db.collection.aggregate([
// Filter first: only this initial $match can use an index, and it cuts
// down the documents fed to the remaining pipeline stages
{ "$match": { "tags": { "$in": tags } } },
{ "$project" : {
"_id": {
"_id": "$_id",
"title": "$title",
"tags": "$tags"
},
"tags": 1
}},
{ "$unwind": "$tags" },
// The second $match runs per unwound tag to keep only matching entries
{ "$match": { "tags": { "$in": tags } } },
{ "$group": { "_id": "$_id", "relevance": { "$sum":1 } } },
{ "$sort": { "relevance" : -1 } },
{ "$project": {
_id: "$_id._id",
"title": "$_id.title",
"tags": "$_id.tags"
}}
])
At any rate that should be generally more effective than trying to do another query which of course would not maintain your sort order in the way that you have done.

Related

Terribly degraded performance with other join conditions in $lookup (using pipeline)

So during some code review I decided to improve existing query performance by improving one aggregation that was like this:
.aggregate([
//difference starts here
{
"$lookup": {
"from": "sessions",
"localField": "_id",
"foreignField": "_client",
"as": "sessions"
}
},
{
$unwind: "$sessions"
},
{
$match: {
"sessions.deleted_at": null
}
},
//difference ends here
{
$project: {
name: client_name_concater,
email: '$email',
phone: '$phone',
address: addressConcater,
updated_at: '$updated_at',
}
}
]);
to this:
.aggregate([
//difference starts here
{
$lookup: {
from: 'sessions',
let: {
id: "$_id"
},
pipeline: [
{
$match: {
$expr: {
$and:
[
{
$eq: ["$_client", "$$id"]
}, {
$eq: ["$deleted_at", null]
},
]
}
}
}
],
as: 'sessions'
}
},
{
$match: {
"sessions": {$ne: []}
}
},
//difference ends here
{
$project: {
name: client_name_concater,
email: '$email',
phone: '$phone',
address: addressConcater,
updated_at: '$updated_at',
}
}
]);
I thought that the second option should be better, since we have one less stage, but the difference in performance is massive in the opposite way, the first query runs on average ~40ms, the other one ranges between 3.5 - 5 seconds, 100 times more. The other collection (sessions) has around 120 documents, while this one about 152, but still, even if it was acceptable due to data size, why the difference between these two, isn't it basically the same thing, we are just adding the join condition in the pipeline with the other main condition of the join. Am I missing something?
Some functions or variables included there are mostly static or concatenation that shouldn't affect the $lookup part.
Thanks
EDIT:
Added query plans, for version 1:
{
"stages": [
{
"$cursor": {
"query": {
"$and": [
{
"deleted_at": null
},
{}
]
},
"fields": {
"email": 1,
"phone": 1,
"updated_at": 1,
"_id": 1
},
"queryPlanner": {
"plannerVersion": 1,
"namespace": "test.clients",
"indexFilterSet": false,
"parsedQuery": {
"deleted_at": {
"$eq": null
}
},
"winningPlan": {
"stage": "COLLSCAN",
"filter": {
"deleted_at": {
"$eq": null
}
},
"direction": "forward"
},
"rejectedPlans": []
}
}
},
{
"$lookup": {
"from": "sessions",
"as": "sessions",
"localField": "_id",
"foreignField": "_client",
"unwinding": {
"preserveNullAndEmptyArrays": false
}
}
},
{
"$project": {
"_id": true,
"email": "$email",
"phone": "$phone",
"updated_at": "$updated_at"
}
}
],
"ok": 1
}
For version 2:
{
"stages": [
{
"$cursor": {
"query": {
"deleted_at": null
},
"fields": {
"email": 1,
"phone": 1,
"sessions": 1,
"updated_at": 1,
"_id": 1
},
"queryPlanner": {
"plannerVersion": 1,
"namespace": "test.clients",
"indexFilterSet": false,
"parsedQuery": {
"deleted_at": {
"$eq": null
}
},
"winningPlan": {
"stage": "COLLSCAN",
"filter": {
"deleted_at": {
"$eq": null
}
},
"direction": "forward"
},
"rejectedPlans": []
}
}
},
{
"$lookup": {
"from": "sessions",
"as": "sessions",
"let": {
"id": "$_id"
},
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{
"$eq": [
"$_client",
"$$id"
]
},
{
"$eq": [
"$deleted_at",
null
]
}
]
}
}
}
]
}
},
{
"$match": {
"sessions": {
"$not": {
"$eq": []
}
}
}
},
{
"$project": {
"_id": true,
"email": "$email",
"phone": "$phone",
"updated_at": "$updated_at"
}
}
],
"ok": 1
}
One thing of note, the joined sessions collection has certain properties with very big data (some imported data), so I am thinking that in some way it may be affecting the query size due to this data? But why the difference between the two $lookup versions though.
The second version adds an aggregation pipeline execution for each document in the joined collection.
The documentation says:
Specifies the pipeline to run on the joined collection. The pipeline determines the resulting documents from the joined collection. To return all documents, specify an empty pipeline [].
The pipeline is executed for each document in the collection, not for each matched document.
Depending on how large the collection is (both # of documents and document size) this could come out to a decent amount of time.
after removing the limit, the pipeline version jumped to over 10 seconds
Makes sense - all of the additional documents due to the removal of limit also must have the aggregation pipeline executed for them.
It is possible that per-document execution of aggregation pipeline isn't as optimized as it could be. For example, if the pipeline is set up and torn down for each document, there could easily be more overhead in that than in the $match conditions.
Is there any case when using one or the other?
Executing an aggregation pipeline per joined document provides additional flexibility. If you need this flexibility, it may make sense to execute the pipeline, though performance needs to be considered regardless. If you don't, it is sensible to use a more performant approach.

MongoDB query distinct in subdocuments

I'm using Mongoose with NodeJS (typescript).
I'm trying to sum the count per location. Example output :
[
{ name : "Bronx", count : 6 },
{ name : "Brooklyn", count : 6 },
{ name : "Manhattan", count : 6 },
{ name : "Queens", count : 6 }
]
Current data model:
data:
[
{
"news": {
"_id": "5c7615a4ef5238a6c47cbcb9",
"locations": [
{
"_id": "5c7615a4ef5238a6c47cbcc6",
"id": "1",
"name": "Manhattan",
"children": [
{
"_id": "5c7615a4ef5238a6c47cbcc8",
"count": 3
},
{
"_id": "5c7615a4ef5238a6c47cbcc7",
"count": 2
}
]
}
]
}
},
{
....
}
]
The last query that I build was :
DataModel.aggregate([
{ "$unwind": "$data.news.locations" },
{
"$group": {
"_id": "$data.news.locations",
"count": { "$sum": "$$data.news.locations.zipcodes.count" }
}
}]).exec(function(err, results){
if (err) throw err;
console.log(JSON.stringify(results, null, 4));
});
But I'm new to handling queries in MongoDB with Mongoose, so I'd really appreciate any help. Thanks.
You were kind of close, just a few changes:
DataModel.aggregate([
// Each array needs $unwind separately
{ "$unwind": "$data" },
// And then down to the next one
{ "$unwind": "$data.news.locations" },
// Group on the grouping key
{ "$group": {
"_id": "$data.news.locations.name",
// inner $sum totals the "children" array; outer $sum accumulates per group
"count": { "$sum": { "$sum": "$data.news.locations.children.count" } }
}}
],(err,results) => {
// remaining handling
})
So since you have arrays inside an array and you want to get down to the "name" property inside the "locations" you need to $unwind to that point. You must $unwind each array level separately.
Technically there is still the children array as well, but $sum can be used to "sum an array of values" as well as "accumulate for a grouping key". Hence the $sum: { $sum statement within the $group.
Returns:
{ "_id" : "Manhattan", "count" : 5 }
From the detail supplied in the question.

Return multiple documents which match the latest date with a single MongoDB query in Mongoose

The following is MongoDB query. What is the best possible way to write this query in Mongoose?
db.logs.find({date: db.logs.find({}, {date:1,"_id":0}).sort({date:-1}).limit(1).toArray()[0].date})
There could be multiple documents with the same date and we need to retrieve all the documents that match the latest date.
The aggregation framework makes it possible to write this in a single query. You would require a pipeline that has an initial $lookup operator to do a self-join and within the $lookup execute a pipeline on the joined collection which allows for uncorrelated sub-queries to return the latest date:
db.logs.aggregate([
{ "$lookup": {
"from": "logs",
"pipeline": [
{ "$sort": { "date": -1 } },
{ "$limit": 1 },
{ "$project": { "_id": 0, "date": 1 } }
],
"as": "latest"
} }
])
A further step is required to reshape the new field latest produced above so that the array is flattened. Use $addFields to reshape and $arrayElemAt to flatten the array or use "$unwind":
db.logs.aggregate([
{ "$lookup": {
"from": "logs",
"pipeline": [
{ "$sort": { "date": -1 } },
{ "$limit": 1 },
{ "$project": { "_id": 0, "date": 1 } }
],
"as": "latest"
} },
{ "$addFields": { "latest": { "$arrayElemAt": ["$latest", 0] } } }
])
The final step would be to filter the documents in the resulting pipeline using $expr in a $match stage since you will be comparing fields from the same document:
db.logs.aggregate([
// Self-join: the uncorrelated sub-pipeline returns the single latest date
{ "$lookup": {
"from": "logs",
"pipeline": [
{ "$sort": { "date": -1 } },
{ "$limit": 1 },
{ "$project": { "_id": 0, "date": 1 } }
],
"as": "latest"
} },
// Flatten the one-element "latest" array into a sub-document
{ "$addFields": { "latest": { "$arrayElemAt": ["$latest", 0] } } },
// Keep only the documents whose date equals the latest date
{ "$match": {
"$expr": {
"$eq": [ "$date", "$latest.date" ]
}
} }
])
Getting the query into Mongoose becomes a trivial exercise.

Lookup when foreignField is in an Array

I want to lookup from an object to a collection where the foreignField key is embedded into an array of objects. I have:
collection "shirts"
{
"_id" : ObjectId("5a797ef0768d8418866eb0f6"),
"name" : "Supermanshirt",
"price" : 9.99,
"flavours" : [
{
"flavId" : ObjectId("5a797f8c768d8418866ebad3"),
"size" : "M",
"color": "white",
},
{
"flavId" : ObjectId("3a797f8c768d8418866eb0f7"),
"size" : "XL",
"color": "red",
},
]
}
collection "basket"
{
"_id" : ObjectId("5a797ef0333d8418866ebabc"),
"basketName" : "Default",
"items" : [
{
"dateAdded" : 1526996879787.0,
"itemFlavId" : ObjectId("5a797f8c768d8418866ebad3")
}
],
}
My Query:
basketSchema.aggregate([
{
$match: { $and: [{ _id }, { basketName }]},
},
{
$unwind: '$items',
},
{
$lookup:
{
from: 'shirts',
localField: 'items.itemFlavId',
foreignField: 'flavours.flavId',
as: 'ordered_shirts',
},
},
]).toArray();
my expected result:
[{
"_id" : ObjectId("5a797ef0333d8418866ebabc"),
"basketName" : "Default",
"items" : [
{
"dateAdded" : 1526996879787.0,
"itemFlavId" : ObjectId("5a797f8c768d8418866ebad3")
}
],
"ordered_shirts" : [
{
"_id" : ObjectId("5a797ef0768d8418866eb0f6"),
"name" : "Supermanshirt",
"price" : 9.99,
"flavours" : [
{
"flavId" : ObjectId("5a797f8c768d8418866ebad3"),
"size" : "M",
"color": "white",
}
]
}
],
}]
but instead my ordered_shirts array is empty.
How can I use a foreignField if this foreignField is embedded in an array at the other collection?
I am using MongoDB 3.6.4
As commented, it would appear that there is simply something up in your code where you are pointing at the wrong collection. The general case for this is to simply look at the example listing provided below and see what the differences are, since with the data you provide and the correct collection names then your expected result is in fact returned.
Of course where you need to take such a query "after" that initial $lookup stage is not a simple matter. From a structural standpoint, what you have is generally not a great idea since referring "joins" into items within an array means you are always returning data which is not necessarily "related".
There are some ways to combat that, and mostly there is the form of "non-correlated" $lookup introduced with MongoDB 3.6 which can aid in ensuring you are not returning "unnecessary" data in the "join".
I'm working here in the form of "merging" the "sku" detail with the "items" in the basket, so a first form would be:
Optimal MongoDB 3.6
// Store some vars like you have
let _id = ObjectId("5a797ef0333d8418866ebabc"),
basketName = "Default";
// Run non-correlated $lookup
// Run non-correlated $lookup (MongoDB 3.6+), merging the matched flavour
// detail from "shirts" into the basket "items".
// Fix: the original compared "$$items.itemflavId" (lowercase "f"), a field
// that does not exist, so the path resolved to an empty array and the
// $setIsSubset test was vacuously true — every shirt passed the $match.
let optimal = await Basket.aggregate([
  { "$match": { _id, basketName } },
  { "$lookup": {
    "from": Shirt.collection.name,
    "as": "items",
    "let": { "items": "$items" },
    "pipeline": [
      // Only shirts whose flavours cover the basket's item ids
      { "$match": {
        "$expr": {
          "$setIsSubset": ["$$items.itemFlavId", "$flavours.flavId"]
        }
      }},
      { "$project": {
        "_id": 0,
        "items": {
          "$map": {
            // Keep just the flavours actually present in the basket...
            "input": {
              "$filter": {
                "input": "$flavours",
                "cond": { "$in": [ "$$this.flavId", "$$items.itemFlavId" ] }
              }
            },
            // ...and merge each with its basket entry plus shirt detail
            "in": {
              "$mergeObjects": [
                { "$arrayElemAt": [
                  "$$items",
                  { "$indexOfArray": [
                    "$$items.itemFlavId", "$$this.flavId" ] }
                ]},
                { "name": "$name", "price": "$price" },
                "$$this"
              ]
            }
          }
        }
      }},
      // Flatten to one joined object per basket item
      { "$unwind": "$items" },
      { "$replaceRoot": { "newRoot": "$items" } }
    ]
  }}
])
Note that since you are using mongoose to hold details for the models we can use Shirt.collection.name here to read the property from that model with the actual collection name as needed for the $lookup. This helps avoid confusion within the code and also "hard-coding" something like the collection name when it's actually stored somewhere else. In this way should you change the code which registers the "model" in a way which altered the collection name, then this would always retrieve the correct name for use in the pipeline stage.
The main reason you use this form of $lookup with MongoDB 3.6 is because you want to use that "sub-pipeline" to manipulate the foreign collection results "before" they are returned and merged with the parent document. Since we are "merging" the results into the existing "items" array of the basket we use the same field name in argument to "as".
In this form of $lookup you typically still want "related" documents even though it gives you the control to do whatever you want. In this case we can compare the array content from "items" in the parent document which we set as a variable for the pipeline to use with the array under "flavours" in the foreign collection. A logical comparison for the two "sets" of values here where they "intersect" is using the $setIsSubset operator using the $expr so we can compare on a "logical operation".
The main work here is being done in the $project which is simply using $map on the array from the "flavours" array of the foreign document, processed with $filter in comparison to the "items" we passed into the pipeline and essentially re-written in order to "merge" the matched content.
The $filter reduces down the list for consideration to only those which match something present within the "items", and then we can use $indexOfArray and $arrayElemAt in order to extract the detail from the "items" and merge it with each remaining "flavours" entry which matches using the $mergeObjects operator. Noting here that we also take some "parent" detail from the "shirt" as the "name" and "price" fields which are common to the variations in size and color.
Since this is still an "array" within the matched document(s) to the join condition, in order to get a "flat list" of objects suitable for "merged" entries in the resulting "items" of the $lookup we simply apply $unwind, which within the context of matched items left only creates "little" overhead, and $replaceRoot in order to promote the content under that key to the top level.
The result is just the "merged" content listed in the "items" of the basket.
Sub-optimal MongoDB
The alternate approaches are really not that great since all involve returning other "flavours" which do not actually match the items in the basket. This basically involves "post-filtering" the results obtained from the $lookup as opposed to "pre-filtering" which the process above does.
So the next case here would be using methods to manipulate the returned array in order to remove the items which don't actually match:
// Using legacy $lookup
let alternate = await Basket.aggregate([
{ "$match": { _id, basketName } },
{ "$lookup": {
"from": Shirt.collection.name,
"localField": "items.itemFlavId",
"foreignField": "flavours.flavId",
"as": "ordered_items"
}},
{ "$addFields": {
"items": {
"$let": {
"vars": {
"ordered_items": {
"$reduce": {
"input": {
"$map": {
"input": "$ordered_items",
"as": "o",
"in": {
"$map": {
"input": {
"$filter": {
"input": "$$o.flavours",
"cond": {
"$in": ["$$this.flavId", "$items.itemFlavId"]
}
}
},
"as": "f",
"in": {
"$mergeObjects": [
{ "name": "$$o.name", "price": "$$o.price" },
"$$f"
]
}
}
}
}
},
"initialValue": [],
"in": { "$concatArrays": ["$$value", "$$this"] }
}
}
},
"in": {
"$map": {
"input": "$items",
"in": {
"$mergeObjects": [
"$$this",
{ "$arrayElemAt": [
"$$ordered_items",
{ "$indexOfArray": [
"$$ordered_items.flavId", "$$this.itemFlavId"
]}
]}
]
}
}
}
}
},
"ordered_items": "$$REMOVE"
}}
]);
Here I'm still using some MongoDB 3.6 features, but these are not a "requirement" of the logic involved. The main constraint in this approach is actually the $reduce which requires MongoDB 3.4 or greater.
Using the same "legacy" form of $lookup as you were attempting, we still get the desired results as you display but that of course contains information in the "flavours" that does not match the "items" in the basket. In much the same way as shown in the previous listing we can apply $filter here to remove the items which don't match. The same process here uses that $filter output as the input for $map, which again is doing the same "merge" process as before.
Where the $reduce comes in is because the resulting processing where there is an "array" target from $lookup with documents that themselves contain an "array" of "flavours" is that these arrays need to be "merged" into a single array for further processing. The $reduce simply uses the processed output and performs a $concatArrays on each of the "inner" arrays returned to make these results singular. We already "merged" the content, so this becomes the new "merged" "items".
Older Still $unwind
And of course the final way to present ( even though there are other combinations ) is using $unwind on the arrays and using $group to put it back together:
let old = await Basket.aggregate([
{ "$match": { _id, basketName } },
{ "$unwind": "$items" },
{ "$lookup": {
"from": Shirt.collection.name,
"localField": "items.itemFlavId",
"foreignField": "flavours.flavId",
"as": "ordered_items"
}},
{ "$unwind": "$ordered_items" },
{ "$unwind": "$ordered_items.flavours" },
{ "$redact": {
"$cond": {
"if": {
"$eq": [
"$items.itemFlavId",
"$ordered_items.flavours.flavId"
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}},
{ "$group": {
"_id": "$_id",
"basketName": { "$first": "$basketName" },
"items": {
"$push": {
"dateAdded": "$items.dateAdded",
"itemFlavId": "$items.itemFlavId",
"name": "$ordered_items.name",
"price": "$ordered_items.price",
"flavId": "$ordered_items.flavours.flavId",
"size": "$ordered_items.flavours.size",
"color": "$ordered_items.flavours.color"
}
}
}}
]);
Most of this should be pretty self explanatory as $unwind is simply a tool to "flatten" array content into singular document entries. In order to just get the results we want we can use $redact to compare the two fields. Using MongoDB 3.6 you "could" use $expr within a $match here:
{ "$match": {
"$expr": {
"$eq": [
"$items.itemFlavId",
"$ordered_items.flavours.flavId"
]
}
}}
But when it comes down to it, if you have MongoDB 3.6 with it's other features then $unwind is the wrong thing to do here due to all the overhead it will actually add.
So all that really happens is you $lookup then "flatten" the documents and finally $group all related detail together using $push to recreate the "items" in the basket. It "looks simple" and is probably the most easy form to understand, however "simplicity" does not equal "performance" and this would be pretty brutal to use in a real world use case.
Summary
That should cover the explanation of the things you need to do when working with "joins" that are going to compare items within arrays. This probably should lead you on the path of realizing this is not really a great idea and it would be far better to keep your "skus" listed "separately" rather than listing them all related under a single "item".
It also should in part be a lesson that "joins" in general are not a great idea with MongoDB. You really only should define such relations where they are "absolutely necessary". In such a case of "details for items in a basket", then contrary to traditional RDBMS patterns it would actually be far better in terms of performance to simply "embed" that detail from the start. In that way you don't need complicated join conditions just to get a result, which might have saved "a few bytes" in storage but is taking a lot more time than what should have been a simple request for the basket with all the detail already "embedded". That really should be the primary reason why you are using something like MongoDB in the first place.
So if you have to do it, then really you should be sticking with the first form since where you have the available features to use then use them best to their advantage. Whilst other approaches may seem easier, it won't help the application performance, and of course best performance would be embedding to begin with.
A full listing follows for demonstration of the above discussed methods and for basic comparison to prove that the provided data does in fact "join" as long as the other parts of the application set-up are working as they should be. So a model on "how it should be done" in addition to demonstrating the full concepts.
const { Schema, Types: { ObjectId } } = mongoose = require('mongoose');
const uri = 'mongodb://localhost/basket';
mongoose.Promise = global.Promise;
mongoose.set('debug', true);
const basketItemSchema = new Schema({
dateAdded: { type: Number, default: Date.now() },
itemFlavId: { type: Schema.Types.ObjectId }
},{ _id: false });
const basketSchema = new Schema({
basketName: String,
items: [basketItemSchema]
});
const flavourSchema = new Schema({
flavId: { type: Schema.Types.ObjectId },
size: String,
color: String
},{ _id: false });
const shirtSchema = new Schema({
name: String,
price: Number,
flavours: [flavourSchema]
});
const Basket = mongoose.model('Basket', basketSchema);
const Shirt = mongoose.model('Shirt', shirtSchema);
const log = data => console.log(JSON.stringify(data, undefined, 2));
(async function() {
try {
const conn = await mongoose.connect(uri);
// clean data
await Promise.all(Object.entries(conn.models).map(([k,m]) => m.remove()));
// set up data for test
await Basket.create({
_id: ObjectId("5a797ef0333d8418866ebabc"),
basketName: "Default",
items: [
{
dateAdded: 1526996879787.0,
itemFlavId: ObjectId("5a797f8c768d8418866ebad3")
}
]
});
await Shirt.create({
_id: ObjectId("5a797ef0768d8418866eb0f6"),
name: "Supermanshirt",
price: 9.99,
flavours: [
{
flavId: ObjectId("5a797f8c768d8418866ebad3"),
size: "M",
color: "white"
},
{
flavId: ObjectId("3a797f8c768d8418866eb0f7"),
size: "XL",
color: "red"
}
]
});
// Store some vars like you have
let _id = ObjectId("5a797ef0333d8418866ebabc"),
basketName = "Default";
// Run non-correlated $lookup
// Non-correlated $lookup form (MongoDB 3.6+).
// Fix: "$$items.itemflavId" (lowercase "f") was a typo — the missing field
// resolved to an empty array, making the $setIsSubset match vacuously true.
let optimal = await Basket.aggregate([
  { "$match": { _id, basketName } },
  { "$lookup": {
    "from": Shirt.collection.name,
    "as": "items",
    "let": { "items": "$items" },
    "pipeline": [
      // Only shirts whose flavours cover the basket's item ids
      { "$match": {
        "$expr": {
          "$setIsSubset": ["$$items.itemFlavId", "$flavours.flavId"]
        }
      }},
      { "$project": {
        "_id": 0,
        "items": {
          "$map": {
            // Keep just the flavours present in the basket...
            "input": {
              "$filter": {
                "input": "$flavours",
                "cond": { "$in": [ "$$this.flavId", "$$items.itemFlavId" ] }
              }
            },
            // ...and merge each with its basket entry plus shirt detail
            "in": {
              "$mergeObjects": [
                { "$arrayElemAt": [
                  "$$items",
                  { "$indexOfArray": [
                    "$$items.itemFlavId", "$$this.flavId" ] }
                ]},
                { "name": "$name", "price": "$price" },
                "$$this"
              ]
            }
          }
        }
      }},
      // Flatten to one joined object per basket item
      { "$unwind": "$items" },
      { "$replaceRoot": { "newRoot": "$items" } }
    ]
  }}
])
log(optimal);
// Using legacy $lookup
let alternate = await Basket.aggregate([
{ "$match": { _id, basketName } },
{ "$lookup": {
"from": Shirt.collection.name,
"localField": "items.itemFlavId",
"foreignField": "flavours.flavId",
"as": "ordered_items"
}},
{ "$addFields": {
"items": {
"$let": {
"vars": {
"ordered_items": {
"$reduce": {
"input": {
"$map": {
"input": "$ordered_items",
"as": "o",
"in": {
"$map": {
"input": {
"$filter": {
"input": "$$o.flavours",
"cond": {
"$in": ["$$this.flavId", "$items.itemFlavId"]
}
}
},
"as": "f",
"in": {
"$mergeObjects": [
{ "name": "$$o.name", "price": "$$o.price" },
"$$f"
]
}
}
}
}
},
"initialValue": [],
"in": { "$concatArrays": ["$$value", "$$this"] }
}
}
},
"in": {
"$map": {
"input": "$items",
"in": {
"$mergeObjects": [
"$$this",
{ "$arrayElemAt": [
"$$ordered_items",
{ "$indexOfArray": [
"$$ordered_items.flavId", "$$this.itemFlavId"
]}
]}
]
}
}
}
}
},
"ordered_items": "$$REMOVE"
}}
]);
log(alternate);
// Or really old style
let old = await Basket.aggregate([
{ "$match": { _id, basketName } },
{ "$unwind": "$items" },
{ "$lookup": {
"from": Shirt.collection.name,
"localField": "items.itemFlavId",
"foreignField": "flavours.flavId",
"as": "ordered_items"
}},
{ "$unwind": "$ordered_items" },
{ "$unwind": "$ordered_items.flavours" },
{ "$redact": {
"$cond": {
"if": {
"$eq": [
"$items.itemFlavId",
"$ordered_items.flavours.flavId"
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}},
{ "$group": {
"_id": "$_id",
"basketName": { "$first": "$basketName" },
"items": {
"$push": {
"dateAdded": "$items.dateAdded",
"itemFlavId": "$items.itemFlavId",
"name": "$ordered_items.name",
"price": "$ordered_items.price",
"flavId": "$ordered_items.flavours.flavId",
"size": "$ordered_items.flavours.size",
"color": "$ordered_items.flavours.color"
}
}
}}
]);
log(old);
} catch(e) {
console.error(e)
} finally {
process.exit()
}
})()
And sample output as:
Mongoose: baskets.remove({}, {})
Mongoose: shirts.remove({}, {})
Mongoose: baskets.insertOne({ _id: ObjectId("5a797ef0333d8418866ebabc"), basketName: 'Default', items: [ { dateAdded: 1526996879787, itemFlavId: ObjectId("5a797f8c768d8418866ebad3") } ], __v: 0 })
Mongoose: shirts.insertOne({ _id: ObjectId("5a797ef0768d8418866eb0f6"), name: 'Supermanshirt', price: 9.99, flavours: [ { flavId: ObjectId("5a797f8c768d8418866ebad3"), size: 'M', color: 'white' }, { flavId: ObjectId("3a797f8c768d8418866eb0f7"), size: 'XL', color: 'red' } ], __v: 0 })
Mongoose: baskets.aggregate([ { '$match': { _id: 5a797ef0333d8418866ebabc, basketName: 'Default' } }, { '$lookup': { from: 'shirts', as: 'items', let: { items: '$items' }, pipeline: [ { '$match': { '$expr': { '$setIsSubset': [ '$$items.itemflavId', '$flavours.flavId' ] } } }, { '$project': { _id: 0, items: { '$map': { input: { '$filter': { input: '$flavours', cond: { '$in': [Array] } } }, in: { '$mergeObjects': [ { '$arrayElemAt': [Array] }, { name: '$name', price: '$price' }, '$$this' ] } } } } }, { '$unwind': '$items' }, { '$replaceRoot': { newRoot: '$items' } } ] } } ], {})
[
{
"_id": "5a797ef0333d8418866ebabc",
"basketName": "Default",
"items": [
{
"dateAdded": 1526996879787,
"itemFlavId": "5a797f8c768d8418866ebad3",
"name": "Supermanshirt",
"price": 9.99,
"flavId": "5a797f8c768d8418866ebad3",
"size": "M",
"color": "white"
}
],
"__v": 0
}
]
Mongoose: baskets.aggregate([ { '$match': { _id: 5a797ef0333d8418866ebabc, basketName: 'Default' } }, { '$lookup': { from: 'shirts', localField: 'items.itemFlavId', foreignField: 'flavours.flavId', as: 'ordered_items' } }, { '$addFields': { items: { '$let': { vars: { ordered_items: { '$reduce': { input: { '$map': { input: '$ordered_items', as: 'o', in: { '$map': [Object] } } }, initialValue: [], in: { '$concatArrays': [ '$$value', '$$this' ] } } } }, in: { '$map': { input: '$items', in: { '$mergeObjects': [ '$$this', { '$arrayElemAt': [ '$$ordered_items', [Object] ] } ] } } } } }, ordered_items: '$$REMOVE' } } ], {})
[
{
"_id": "5a797ef0333d8418866ebabc",
"basketName": "Default",
"items": [
{
"dateAdded": 1526996879787,
"itemFlavId": "5a797f8c768d8418866ebad3",
"name": "Supermanshirt",
"price": 9.99,
"flavId": "5a797f8c768d8418866ebad3",
"size": "M",
"color": "white"
}
],
"__v": 0
}
]
Mongoose: baskets.aggregate([ { '$match': { _id: 5a797ef0333d8418866ebabc, basketName: 'Default' } }, { '$unwind': '$items' }, { '$lookup': { from: 'shirts', localField: 'items.itemFlavId', foreignField: 'flavours.flavId', as: 'ordered_items' } }, { '$unwind': '$ordered_items' }, { '$unwind': '$ordered_items.flavours' }, { '$redact': { '$cond': { if: { '$eq': [ '$items.itemFlavId', '$ordered_items.flavours.flavId' ] }, then: '$$KEEP', else: '$$PRUNE' } } }, { '$group': { _id: '$_id', basketName: { '$first': '$basketName' }, items: { '$push': { dateAdded: '$items.dateAdded', itemFlavId: '$items.itemFlavId', name: '$ordered_items.name', price: '$ordered_items.price', flavId: '$ordered_items.flavours.flavId', size: '$ordered_items.flavours.size', color: '$ordered_items.flavours.color' } } } } ], {})
[
{
"_id": "5a797ef0333d8418866ebabc",
"basketName": "Default",
"items": [
{
"dateAdded": 1526996879787,
"itemFlavId": "5a797f8c768d8418866ebad3",
"name": "Supermanshirt",
"price": 9.99,
"flavId": "5a797f8c768d8418866ebad3",
"size": "M",
"color": "white"
}
]
}
]

MongoDB aggregate count of items in two arrays across different documents?

Here is my MongoDB collection schema:
company: String
model: String
cons: [String] // array of tags that were marked as "cons"
pros: [String] // array of tags that were marked as "pros"
I need to aggregate it so I get the following output:
[{
"_id": {
"company": "Lenovo",
"model": "T400"
},
"tags": {
tag: "SomeTag"
pros: 124 // number of times, "SomeTag" tag was found in "pros" array in `Lenovo T400`
cons: 345 // number of times, "SomeTag" tag was found in "cons" array in `Lenovo T400`
}
}...]
I tried to do the following:
var aggParams = {};
aggParams.push({ $unwind: '$cons' });
aggParams.push({ $unwind: '$pros' });
aggParams.push({$group: {
_id: {
company: '$company',
model: '$model',
consTag: '$cons'
},
consTagCount: { $sum: 1 }
}});
aggParams.push({$group: {
_id: {
company: '$_id.company',
model: '$_id.model',
prosTag: '$pros'
},
prosTagCount: { $sum: 1 }
}});
aggParams.push({$group: {
_id: {
company:'$_id.company',
model: '$_id.model'
},
tags: { $push: { tag: { $or: ['$_id.consTag', '$_id.prosTag'] }, cons: '$consTagCount', pros: '$prosTagCount'} }
}});
But I got the following result:
{
"_id": {
"company": "Lenovo",
"model": "T400"
},
"tags": [
{
"tag": false,
"pros": 7
}
]
}
What is the right way to do this with aggregation?
Yes this is a bit harder considering that there are multiple arrays, and if you try both at the same time you end up with a "cartesian condition" where one arrray multiplies the contents of the other.
Therefore, just combine the array content at the beginning, which probably indicates how you should be storing the data in the first place:
// Combine both tag arrays into one (tagged with a "type" marker), then count
// per company/model/tag how often each tag appeared as a "pro" or a "con".
// Fixes to the original listing: a missing comma after the $unwind stage and
// a missing closing brace on the $group stage — neither version parsed.
Model.aggregate(
  [
    { "$project": {
      "company": 1,
      "model": 1,
      // Merge "pros" and "cons" into a single array, marking each entry
      // with its origin so the counts can be separated again after $unwind
      "data": {
        "$setUnion": [
          { "$map": {
            "input": "$pros",
            "as": "pro",
            "in": {
              "type": { "$literal": "pro" },
              "value": "$$pro"
            }
          }},
          { "$map": {
            "input": "$cons",
            "as": "con",
            "in": {
              "type": { "$literal": "con" },
              "value": "$$con"
            }
          }}
        ]
      }
    }},
    { "$unwind": "$data" },
    { "$group": {
      "_id": {
        "company": "$company",
        "model": "$model",
        "tag": "$data.value"
      },
      // Conditionally count 1 only for entries of the matching type
      "pros": {
        "$sum": {
          "$cond": [
            { "$eq": [ "$data.type", "pro" ] },
            1,
            0
          ]
        }
      },
      "cons": {
        "$sum": {
          "$cond": [
            { "$eq": [ "$data.type", "con" ] },
            1,
            0
          ]
        }
      }
    }}
  ],
  function(err,result) {
  }
)
So via the first $project stage the $map operators are adding the "type" value to each item of each array. Not that it really matters here, as all items should process as "unique" anyway; the $setUnion operator "concatenates" each array into a singular array.
As mentioned earlier, you probably should be storing in this way in the first place.
Then process $unwind followed by $group, wherein each "pros" and "cons" is then evaluated via $cond for its matching "type", either returning 1 or 0 where the match is respectively true/false to the $sum aggregation accumulator.
This gives you a "logical match" to count each respective "type" within the aggregation operation as per the grouping keys specified.

Resources