Aggregate duplicate documents with values in array in Mongo

Aggregate duplicate documents with values in array in Mongo - node.js

I have a large collection of documents that look as follows:
{ "_id": "5a760191813a54000b8475f1", "orders": [{ "row": "3", "seat": "11" }, { "row": "3", "seat": "12" }], "product_id": "5a7628bedbcc42000aa7f614" },
{ "_id": "5a75f6f17abe45000a3ba05e", "orders": [{ "row": "3", "seat": "12" }, { "row": "3", "seat": "13" }], "product_id": "5a7628bedbcc42000aa7f614" },
{ "_id": "5a75ebdf813a54000b8475e7", "orders": [{ "row": "5", "seat": "16" }, { "row": "5", "seat": "15" }], "product_id": "5a75f711dbcc42000c459efc" }
I need to be able to find any documents where the product_id and items in the orders array are duplicates. I can't quite seem to wrap my head around accomplishing this. Any pointers?

I don't know exactly what output you want, but this query returns the information about the duplicates; you may also want to add an $unwind on duplicates.
Result documents
product_id
order (that found duplicated)
duplicates (the documents that had that order as duplicate)
For your data it would print:
[{
"duplicates": [
"5a760191813a54000b8475f1",
"5a75f6f17abe45000a3ba05e"
],
"order": {
"row": "3",
"seat": "12"
},
"product_id": "5a7628bedbcc42000aa7f614"
}]
Query
(run it on your driver, MongoPlayground doesn't keep the order of fields and can show wrong results)
aggregate(
[{"$unwind" : {"path" : "$orders"}},
{
"$group" : {
"_id" : {
"orders" : "$orders",
"product_id" : "$product_id"
},
"duplicates" : {
"$push" : "$_id"
}
}
},
{"$match" : {"$expr" : {"$gt" : [ {"$size" : "$duplicates"}, 1 ]}}},
{
"$project" : {
"_id" : 0,
"order" : "$_id.orders",
"product_id" : "$_id.product_id",
"duplicates" : 1
}
}
])
Data (I added some more data)
[
{
"_id": "5a760191813a54000b8475f1",
"orders": [
{
"row": "3",
"seat": "11"
},
{
"row": "3",
"seat": "12"
}
],
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"_id": "5a75f6f17abe45000a3ba05g",
"orders": [
{
"row": "3",
"seat": "12"
},
{
"row": "3",
"seat": "13"
}
],
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"_id": "5a75f6f17abe45000a3ba05e",
"orders": [
{
"row": "3",
"seat": "12"
},
{
"row": "3",
"seat": "13"
}
],
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"_id": "5a75ebdf813a54000b8475e7",
"orders": [
{
"row": "5",
"seat": "16"
},
{
"row": "5",
"seat": "15"
}
],
"product_id": "5a75f711dbcc42000c459efc"
}
]
Results
[{
"duplicates": [
"5a75f6f17abe45000a3ba05g",
"5a75f6f17abe45000a3ba05e"
],
"order": {
"row": "3",
"seat": "13"
},
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"duplicates": [
"5a760191813a54000b8475f1",
"5a75f6f17abe45000a3ba05g",
"5a75f6f17abe45000a3ba05e"
],
"order": {
"row": "3",
"seat": "12"
},
"product_id": "5a7628bedbcc42000aa7f614"
}]

You could use the query below. $unwind the orders array, then $group by order and product, collecting the matching ids and a count. Keep the groups where count is greater than 1. Then $lookup to pull in the matching documents by id, and $replaceRoot to flatten the documents.
db.collection.aggregate([
{
"$unwind": "$orders"
},
{
"$group": {
"_id": {
"order": "$orders",
"product_id": "$product_id"
},
"count": {
"$sum": 1
},
"doc_ids": {
"$push": "$_id"
}
}
},
{
"$match": {
"count": {
"$gt": 1
}
}
},
{
"$lookup": {
"from": "collection",
"localField": "doc_ids",
"foreignField": "_id",
"as": "documents"
}
},
{
"$unwind": "$documents"
},
{
"$replaceRoot": {
"newRoot": "$documents"
}
}
])
https://mongoplayground.net/p/YbztEGttUMx

While this can be done purely in Mongo, I do not recommend it, as it's extremely memory-inefficient: you basically have to hold the entire collection in memory the entire time while you do certain manipulations on it.
I will, however, show the pipeline for this, because we will use it with the second, more scalable approach.
We want to $group based on orders and product_id, however there are 2 issues standing in our way.
The orders field might not be sorted the same way in all documents. Because Mongo does not support "nested" sorting, we have to $unwind the array, $sort it and restore the original structure (mind you, you're sorting the entire collection in memory here). This step, which is one of the pain points of this pipeline, can be skipped if you can ensure sort order is maintained in the orders array.
Mongo is inconsistent when $grouping an array of objects. Full disclosure: I'm not entirely sure what's going on in there, but I'm guessing there are some "shortcuts" done for efficiency which affect the stability somehow. So our approach will be to convert these objects into a string (concatenating the "row" and "seat" together).
db.collection.aggregate([
{
"$unwind": "$orders"
},
{
$sort: {
"orders.row": 1,
"orders.seat": 1
}
},
{
$group: {
_id: "$_id",
tmpOrders: {
$push: {
$concat: [
"$orders.row",
"$orders.seat"
]
}
},
product_id: {
$first: "$product_id"
}
}
},
{
$group: {
_id: {
orders: "$tmpOrders",
product: "$product_id"
},
dupIds: {
$push: "$_id"
}
}
},
{
$match: {
"dupIds.0": {
$exists: true
}
}
},
{
$project: {
_id: 0,
dups: "$dupIds",
}
}
])
Mongo Playground
Now, as I said, this approach is not scalable, and on large collections it will take a very long time to run. So I recommend utilizing indexes, iterating over the product_ids and executing the pipeline separately for each one.
// wraps the native Promise, not required.
import Bluebird = require('bluebird');
// very fast with index.
const productIds = await collection.distinct('product_id')
await Bluebird.map(productIds, async (productId) => {
const dups = await collection.aggregate([
{
$match: {
product_id: productId
}
}
... same pipeline ...
])
if (dups.length) {
// logic required.
}
// can control concurrency based on db workload.
}, { concurrency: 5})
Make sure with this approach you have an index built on product_id so it will work efficiently.

Related

mongodb pull nested array of objects

I want to pull multiple objects from array.
Here is my sample collection:
Users
{
"_id": "wef324DGSshf",
"userTypes": [
{
"type": "students",
"users": [
{
"name": "John",
"age": 20
},
{
"name": "Mike",
"age": 20
},
{
"name": "Henry",
"age": 30
},
{
"name": "Henry",
"age": 40
}
]
}
]
}
I need to pull those objects where:
type: "students" and ages: [20,40]
So I have these 2 inputs: type & ages
Expected Response:
{
"_id": "wef324DGSshf",
"userTypes": [
{
"type": "students",
"users": [
{
"name": "Henry",
"age": 30
}
]
}
]
}
I have tried this query so far but it is not working:
Users.update({
"userTypes.type": "students",
"userTypes.users.age": {$in: [20, 40]},
},
{
$pull: {
"userTypes": {
"userTypes.users.$.age": {$in: [20, 40]}
}
}
});
Can anyone help me what I am doing wrong here?

Use arrayFilters to specify the filtering for "type": "students" and perform a normal $pull on age:
db.collection.update({},
{
"$pull": {
"userTypes.$[ut].users": {
"age": {
$in: [
20,
40
]
}
}
}
},
{
arrayFilters: [
{
"ut.type": "students"
}
],
multi: true
})
Mongo Playground
Explanation: Check out the official doc about arrayFilters. You can think of the entries in arrayFilters as predicates: for the variable ut, the element needs to have type: students. Now back to the $pull part. The predicate is applied to userTypes, which means an entry ut in userTypes must satisfy the predicate type: students. At the same time, we are $pulling the entries whose age is in [20, 40].

How to use $match after $lookup and $unwind so that it can check all the fields including the array for a keyword/string?

I have collections named
products
`
{
"_id": {
"$oid": "1"
},
"companyId": [
{
"$oid": "2"
}
],
"Title": "abcd",
"Caption": "abc",
},{
"_id": {
"$oid": "2"
},
"companyId": [
{
"$oid": "3"
}
],
"Title": "milk",
"Caption": "aa",
}
`
companies
`
{
"_id": {
"$oid": "2"
},
"name": "cathub",
"url": "cathub.com",
"__v": 0
},
"_id": {
"$oid": "3"
},
"name": "Amule",
"url": "amule.com",
"__v": 0
`
Here the products collection has companyId as a foreign key referencing _id in the companies collection. What I need is that when I search for a particular string in products, it should search all fields, including those of the joined companies. For example, if my keyword is "Amule", it should search in Title and Caption and also in companies.name. If a match is found, it should return the products document with _id: 2.
I tried with the following
{ $lookup:{ form:"companies", localField:"companyId", foriegnField:"_id", as :"result" } }
then
{ $unwind:{ path:"$result" } }
but I am not able to perform $match after that, because it shows an error and only allows $match to be used
at the beginning. Please help me solve this issue (I need to solve it using a TEXT index).
Complete query
model.aggregate([
{
$match: {
$text: {
$search: "Amul",
},
},
},
{
$lookup: {
from: "companies",
localField: "companyId",
foreignField: "_id",
as: "company",
},
},
{
$unwind: "$company",
},
{
$match: {
$text:{
$search: "Amul"}
},
},
},
$group: {
_id: {
_id: "$_id",
Title: "$Title",
},
comapany: {
$push: "$company",
},
},
]}
If the string 'Amul' is present in any field of "products", then return the document; or if 'Amul' is present in the "name" field of the "company" joined using $lookup, then also return the parent document.
note:-
'model' is the products collection

Mongodb aggregation to pass both a matched array and an unmatched array

I've got a MongoDB / Node.js aggregation that looks a little like this (there are other values in there, but this is the basic idea).
[
{
'$unwind': {
'path': '$Vehicles'
}
},
{
'$match': {
'Vehicles.Manufacturer': 'FORD'
}
},
{
'$facet': {
'makes': [
{
'$group': {
'_id': '$Vehicles.Manufacturer',
'count': {
'$sum': 1
}
}
}
]
}
},
{
'$project': {
'makes': {
'$sortArray': {
'input': '$makes',
'sortBy': 1
}
}
}
}
]
This works fine. But I would also like to pass an unmatched list through, i.e. an array of vehicles whose Manufacturer = FORD and another list of all Manufacturers.
Can't get it to work. Any ideas please?
Thanks in advance.
Edit:-
The current output looks like this:
[{
"makes": [
{
"_id": "FORD",
"count": 285
}
]
}]
and ideally it would look something like this:
[{
"makes": [
{
"_id": "FORD",
"count": 285
}
],
"unfiltered_makes": [
{
"_id": "ABARTH",
"count": 1
},
{
"_id": "AUDI",
"count": 7
},
{
"_id": "BMW",
"count": 2
},
{
"_id": "CITROEN",
"count": 4
},
{
"_id": "DS",
"count": 1
},
{
"_id": "FIAT",
"count": 1
}.... etc
]
}]
The data looks a bit like this:
"Vehicles": [
{
"Id": 1404908,
"Manufacturer": "MG",
"Model": "3",
"Price": 11995 .... etc
},{
"Id": 1404909,
"Manufacturer": "FORD",
"ManufacturerId": 34,
"Model": "Focus",
"Price": 12000 .... etc
} ... etc
]

In this case you can do something like:
db.collection.aggregate([
{$unwind: "$Vehicles"},
{$group: {
_id: "$Vehicles.Manufacturer",
count: {$sum: 1}}
},
{$facet: {
makes: [{$match: {_id: "FORD"}}],
unfiltered_makes: [{$group: {_id: 0, data: {$push: "$$ROOT"}}}]
}
},
{$project: {makes: 1, unfiltered_makes: "$unfiltered_makes.data"}}
])
See how it works on the playground example
Another option is:
db.collection.aggregate([
{$unwind: "$Vehicles"},
{$group: {
_id: "$Vehicles.Manufacturer",
count: {$sum: 1}}
},
{$group: {
_id: 0,
unfiltered_makes: {$push: "$$ROOT"},
makes: {$push: {$cond: [{$eq: ["$_id", "FORD"]}, "$$ROOT", "$$REMOVE"]}}
}
}
])
See how it works on the playground example

Here's another way to do it using "$function" to generate a histogram of "Manufacturer" and format the returned array. The javascript function only traverses the "Vehicles" array once, so this may be fairly efficient, although I did not do algorithm timing comparisons on a large collection.
N.B.: I'm a javascript noob and there may be a better way to do this.
db.collection.aggregate([
{
"$set": {
"unfiltered_makes": {
"$function": {
// generate histogram of manufacturers and format output
"body": "function(makes) {const m = new Object();makes.forEach((elem) => {m[elem.Manufacturer] = m[elem.Manufacturer] + 1 || 1});return Object.entries(m).map(([make, count]) => {return {'_id':make, 'count':count}})}",
"args": ["$Vehicles"],
"lang": "js"
}
}
}
},
{
"$project": {
"_id": 0,
"unfiltered_makes": 1,
"makes": {
"$filter": {
"input": "$unfiltered_makes",
"as": "make",
"cond": {
"$eq": [
"$$make._id",
// your search "Manufacturer" goes here
"FORD"
]
}
}
}
}
}
])
Try it on mongoplayground.net.

How to "sort" nested object array on MongoDB, so all items with a specific value are the firsts?

I have this data and I want to sort it by two fields:
first by specific address (details.address), for example 'Tel Aviv'.
second by regular sort, by details.cost field.
here is my data:
[{
"_id": "123",
"details": [{
"_id": "1",
"address": "Ramat Gan",
"cost": "50"
}, {
"_id": "2",
"address": "Tel Aviv",
"cost": "30"
}]
},
{
"_id": "456",
"details": [{
"_id": "4",
"address": "Modi'in",
"cost": "40"
}, {
"_id": "5",
"address": "Tel Aviv",
"cost": "20"
}]
}
]
and I want to get this data after the two sorting:
[{
"_id": "456",
"details": [{
"_id": "5",
"address": "Tel Aviv",
"cost": "20"
}, {
"_id": "4",
"address": "Modi'in",
"cost": "40"
}, {
"_id": "123",
"details": [{
"_id": "2",
"address": "Tel Aviv",
"cost": "30"
}, {
"_id": "1",
"address": "Ramat Gan",
"cost": "50"
}]
}]
}]
Actually, I want to sort the documents by the cost of the entry with my specific address (in this case 'Tel Aviv').

If you want both splitting and sorting by cost, you can expand the $group part of @BuzzMoschetti's solution to use $cond:
db.collection.aggregate([
{$unwind: "$details"},
{$sort: {"details.cost": 1}},
{
$group: {
_id: "$_id",
top: {
$push: {
$cond: [{$eq: ["$details.address", "Tel Aviv"]}, "$details", "$$REMOVE"]
}
},
bottom: {
$push: {
$cond: [{$ne: ["$details.address", "Tel Aviv"]}, "$details", "$$REMOVE"]
}
}
}
},
{$project: {details: {$concatArrays: ["$top", "$bottom"]}}}
])
See how it works on the playground example both
In case you just want to order by the specific address first:
db.collection.aggregate([
{
$project: {
top: {
$filter: {
input: "$details",
as: "item",
cond: {$eq: ["$$item.address", "Tel Aviv"]}
}
},
bottom: {
$filter: {
input: "$details",
as: "item",
cond: {$ne: ["$$item.address", "Tel Aviv"]}
}
}
}
},
{
$project: {
details: {$concatArrays: ["$top", "$bottom"]}
}
}
])
See how it works on the playground example top-city

Pretty straightforward: $unwind then re-$group. When sorting arrays of things across document boundaries you pretty much have no choice but to use $unwind to let $sort work properly.
db.foo.aggregate([
{$unwind: '$details'}
,{$sort: {'details.address':-1,'details.cost':1}}
// Rebuild the original doc; $push will *preserve* the sorted
// order of address+cost following from the stage above:
,{$group: {_id:'$_id', details: {$push: '$details'}}}
]);

MongoDB aggregate count of items in two arrays across different documents?

Here is my MongoDB collection schema:
company: String
model: String
cons: [String] // array of tags that were marked as "cons"
pros: [String] // array of tags that were marked as "pros"
I need to aggregate it so I get the following output:
[{
"_id": {
"company": "Lenovo",
"model": "T400"
},
"tags": {
tag: "SomeTag"
pros: 124 // number of times, "SomeTag" tag was found in "pros" array in `Lenovo T400`
cons: 345 // number of times, "SomeTag" tag was found in "cons" array in `Lenovo T400`
}
}...]
I tried to do the following:
var aggParams = {};
aggParams.push({ $unwind: '$cons' });
aggParams.push({ $unwind: '$pros' });
aggParams.push({$group: {
_id: {
company: '$company',
model: '$model',
consTag: '$cons'
},
consTagCount: { $sum: 1 }
}});
aggParams.push({$group: {
_id: {
company: '$_id.company',
model: '$_id.model',
prosTag: '$pros'
},
prosTagCount: { $sum: 1 }
}});
aggParams.push({$group: {
_id: {
company:'$_id.company',
model: '$_id.model'
},
tags: { $push: { tag: { $or: ['$_id.consTag', '$_id.prosTag'] }, cons: '$consTagCount', pros: '$prosTagCount'} }
}});
But I got the following result:
{
"_id": {
"company": "Lenovo",
"model": "T400"
},
"tags": [
{
"tag": false,
"pros": 7
}
]
}
What is the right way to do this with aggregation?

Yes, this is a bit harder considering that there are multiple arrays, and if you try to $unwind both at the same time you end up with a "cartesian condition" where one array multiplies the contents of the other.
Therefore, just combine the array content at the beginning, which probably indicates how you should be storing the data in the first place:
Model.aggregate(
[
{ "$project": {
"company": 1,
"model": 1,
"data": {
"$setUnion": [
{ "$map": {
"input": "$pros",
"as": "pro",
"in": {
"type": { "$literal": "pro" },
"value": "$$pro"
}
}},
{ "$map": {
"input": "$cons",
"as": "con",
"in": {
"type": { "$literal": "con" },
"value": "$$con"
}
}}
]
}
}},
{ "$unwind": "$data" }
{ "$group": {
"_id": {
"company": "$company",
"model": "$model",
"tag": "$data.value"
},
"pros": {
"$sum": {
"$cond": [
{ "$eq": [ "$data.type", "pro" ] },
1,
0
]
}
},
"cons": {
"$sum": {
"$cond": [
{ "$eq": [ "$data.type", "con" ] },
1,
0
]
}
}
}
],
function(err,result) {
}
)
So via the first $project stage the $map operators are adding the "type" value to each item of each array. Not that it really matters here, as all items should process as "unique" anyway, the $setUnion operator "concatenates" each array into a singular array.
As mentioned earlier, you probably should be storing in this way in the first place.
Then process $unwind followed by $group, wherein each "pros" and "cons" is then evaluated via $cond for its matching "type", either returning 1 or 0 where the match is respectively true/false to the $sum aggregation accumulator.
This gives you a "logical match" to count each respective "type" within the aggregation operation as per the grouping keys specified.

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Aggregate duplicate documents with values in array in Mongo - node.js

Related

mongodb pull nested array of objects

How to use $match after $lookup and $unwind so that it can check all the fields including the array for a keyword/string?

Mongodb aggregation to pass both a matched array and an unmatched array

How to "sort" nested object array on MongoDB, so all items with a specific value are the firsts?

MongoDB aggregate count of items in two arrays across different documents?

Categories

Resources