I'd like to get the "population" for each city's latest timestamp using the aggregation framework.
Given a MongoDB collection like this:
{
"_id": {"$oid": "55354bc97b5dfd021f2be661"},
"timestamp": {"$date": "2015-04-20T18:56:09.000Z"},
"city": "Roma",
"population": [
{"age": 90,"count": 1000},
{"age": 25,"count": 25}
]
},
{
"_id": {"$oid": "55354c357b5dfd021f2be663"},
"timestamp": {"$date": "2015-04-20T18:57:57.000Z"},
"city": "Madrid",
"population": [
{"age": 90,"count": 10},
{"age": 75,"count": 2343},
{"age": 50,"count": 500},
{"age": 70,"count": 5000}
]
},
{
"_id": {"$oid": "55362da541c37aef07d4ea9a"},
"timestamp": {"$date": "2015-04-21T10:59:49.000Z"},
"city": "Roma",
"population": [
{"age": 90,"count": 5}
]
}
I'd like to retrieve all the cities, but for each one only the latest timestamp:
{
"city": "Roma",
"population": [
{"age": 90,"count": 5}
]
},
{
"city": "Madrid",
"population": [
{"age": 90,"count": 10},
{"age": 75,"count": 2343},
{"age": 50,"count": 500},
{"age": 70,"count": 5000}
]
}
I have tried something like this answer, but I don't know how to "unwind" the populations after getting the latest timestamp for each city:
db.collection('population').aggregate([
{ $unwind: '$population' },
{ $group: { _id: '$city', timestamp: { $max: '$timestamp' } } },
{ $sort: { _id : -1 } }
], function(err, results) {
res.send(results)
});
The following aggregation pipeline will give you the desired result. The first stage orders the documents by the timestamp field (descending), and the next $group stage groups the ordered documents by the city field. Within the $group stage you can capture each group's whole first document through the $$ROOT system variable: the $first operator returns the value that results from applying the $$ROOT expression to the first document in each group of documents that share the same city key, which, thanks to the preceding sort, is the document with the latest timestamp. The final stage projects the fields from the previous stage into the desired shape:
db.population.aggregate([
{
"$sort": { "timestamp": -1 }
},
{
"$group": {
"_id": "$city",
"doc": { "$first": "$$ROOT" }
}
},
{
"$project": {
"_id": 0,
"city": "$_id",
"population": "$doc.population"
}
}
]);
Output:
/* 0 */
{
"result" : [
{
"city" : "Madrid",
"population" : [
{
"age" : 90,
"count" : 10
},
{
"age" : 75,
"count" : 2343
},
{
"age" : 50,
"count" : 500
},
{
"age" : 70,
"count" : 5000
}
]
},
{
"city" : "Roma",
"population" : [
{
"age" : 90,
"count" : 5
}
]
}
],
"ok" : 1
}
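As an aside, if you're on MongoDB 5.2 or newer, the $top accumulator can replace the $sort + $first pair; a minimal sketch against the same collection:

db.population.aggregate([
    {
        "$group": {
            "_id": "$city",
            // $top takes the output expression from the first document
            // per group under the given sort order (newest timestamp first)
            "population": {
                "$top": {
                    "sortBy": { "timestamp": -1 },
                    "output": "$population"
                }
            }
        }
    },
    { "$project": { "_id": 0, "city": "$_id", "population": 1 } }
]);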
I think that you want to use $project instead of $unwind. Note that $max alone only gives you the timestamp and discards the population array, so sort by timestamp first and keep the latest document per city with $first; the $project stage can then pull the population out of it:
db.collection('population').aggregate([{
    $sort: { timestamp: -1 }
}, {
    $group: {
        _id: '$city',
        timestamp: {$max: '$timestamp'},
        doc: {$first: '$$ROOT'} // latest document per city, thanks to the sort
    }
}, {
    $project: {
        population: '$doc.population'
    }
}, {
    $sort: {
        _id : -1
    }
}], function(err, results) {
    res.send(results)
});
I use this to sort on any timestamp field in an aggregation; here I'm sorting by the document's latest update time. If you need to, you can group the results afterwards. You can learn more about [aggregate sorting here.][1]
aggregate.push({ $sort: { updated_at: -1 } });
What I do is build the aggregation in blocks: I push each stage into an array and then execute them all together. I find it easier to debug if something is not working properly.
[1]: https://www.mongodb.com/docs/manual/reference/operator/aggregation/sort/
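For example, a minimal sketch of that block pattern (the $match filter here is illustrative):

const aggregate = [];
// each stage is pushed as its own block, so stages can be logged,
// reordered, or commented out individually while debugging
aggregate.push({ $match: { city: 'Roma' } });
aggregate.push({ $sort: { updated_at: -1 } });
db.collection('population').aggregate(aggregate, function(err, results) {
    res.send(results);
});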
I have a large collection of documents that look as follows:
{ "_id": "5a760191813a54000b8475f1", "orders": [{ "row": "3", "seat": "11" }, { "row": "3", "seat": "12" }], "product_id": "5a7628bedbcc42000aa7f614" },
{ "_id": "5a75f6f17abe45000a3ba05e", "orders": [{ "row": "3", "seat": "12" }, { "row": "3", "seat": "13" }], "product_id": "5a7628bedbcc42000aa7f614" },
{ "_id": "5a75ebdf813a54000b8475e7", "orders": [{ "row": "5", "seat": "16" }, { "row": "5", "seat": "15" }], "product_id": "5a75f711dbcc42000c459efc" }
I need to be able to find any documents where the product_id and items in the orders array are duplicates. I can't quite seem to wrap my head around accomplishing this. Any pointers?
I don't know exactly what output you want, but this collects the information about the duplicates; you may also want to $unwind the duplicates (see the sketch after the results below).
Each result document contains:
product_id
order (the order that was found duplicated)
duplicates (the _ids of the documents that contain that duplicate order)
For your data it would print:
[{
"duplicates": [
"5a760191813a54000b8475f1",
"5a75f6f17abe45000a3ba05e"
],
"order": {
"row": "3",
"seat": "12"
},
"product_id": "5a7628bedbcc42000aa7f614"
}]
Query
(run it on your driver, MongoPlayground doesn't keep the order of fields and can show wrong results)
aggregate(
[{"$unwind" : {"path" : "$orders"}},
{
"$group" : {
"_id" : {
"orders" : "$orders",
"product_id" : "$product_id"
},
"duplicates" : {
"$push" : "$_id"
}
}
},
{"$match" : {"$expr" : {"$gt" : [ {"$size" : "$duplicates"}, 1 ]}}},
{
"$project" : {
"_id" : 0,
"order" : "$_id.orders",
"product_id" : "$_id.product_id",
"duplicates" : 1
}
}
])
Data (I added a few more documents)
[
{
"_id": "5a760191813a54000b8475f1",
"orders": [
{
"row": "3",
"seat": "11"
},
{
"row": "3",
"seat": "12"
}
],
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"_id": "5a75f6f17abe45000a3ba05g",
"orders": [
{
"row": "3",
"seat": "12"
},
{
"row": "3",
"seat": "13"
}
],
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"_id": "5a75f6f17abe45000a3ba05e",
"orders": [
{
"row": "3",
"seat": "12"
},
{
"row": "3",
"seat": "13"
}
],
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"_id": "5a75ebdf813a54000b8475e7",
"orders": [
{
"row": "5",
"seat": "16"
},
{
"row": "5",
"seat": "15"
}
],
"product_id": "5a75f711dbcc42000c459efc"
}
]
Results
[{
"duplicates": [
"5a75f6f17abe45000a3ba05g",
"5a75f6f17abe45000a3ba05e"
],
"order": {
"row": "3",
"seat": "13"
},
"product_id": "5a7628bedbcc42000aa7f614"
},
{
"duplicates": [
"5a760191813a54000b8475f1",
"5a75f6f17abe45000a3ba05g",
"5a75f6f17abe45000a3ba05e"
],
"order": {
"row": "3",
"seat": "12"
},
"product_id": "5a7628bedbcc42000aa7f614"
}]
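If you want one result document per duplicated _id instead, the extra $unwind mentioned above is a single additional stage at the end of the pipeline:

// appended as a final stage: emits one document per _id in "duplicates"
{"$unwind" : {"path" : "$duplicates"}}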
You could use the query below: $unwind the orders array, $group by order and product while collecting the matching ids and a count, keep the groups whose count is greater than 1, $lookup to pull in the matching documents by id, and $replaceRoot to flatten the documents.
db.collection.aggregate([
{
"$unwind": "$orders"
},
{
"$group": {
"_id": {
"order": "$orders",
"product_id": "$product_id"
},
"count": {
"$sum": 1
},
"doc_ids": {
"$push": "$_id"
}
}
},
{
"$match": {
"count": {
"$gt": 1
}
}
},
{
"$lookup": {
"from": "collection",
"localField": "doc_ids",
"foreignField": "_id",
"as": "documents"
}
},
{
"$unwind": "$documents"
},
{
"$replaceRoot": {
"newRoot": "$documents"
}
}
])
https://mongoplayground.net/p/YbztEGttUMx
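On MongoDB 4.2 or newer, the last two stages can also be written with $replaceWith, which is shorthand for $replaceRoot; a sketch:

{
  "$unwind": "$documents"
},
{
  // same as { "$replaceRoot": { "newRoot": "$documents" } }
  "$replaceWith": "$documents"
}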
While this can be done purely in Mongo, I do not recommend it, as it is very memory inefficient: you basically have to hold the entire collection in memory the entire time while you manipulate it.
I will however show the pipeline, because we will reuse it in the second, more scalable approach.
We want to $group on orders and product_id, however there are two issues standing in our way:
The orders field might not be sorted the same way in all documents. Because Mongo does not support "nested" sorting, we have to $unwind the array, $sort it, and restore the original structure (mind you, you're sorting the entire collection in memory here). This step, one of the pain points of this pipeline, can be skipped if you can ensure sort order is maintained in the orders array.
Mongo is inconsistent when $grouping on an array of objects. Full disclosure: I'm not entirely sure what's going on in there, but I'm guessing there are some "shortcuts" taken for efficiency which affect the stability somehow. So our approach will be to convert these objects into a string (concatenating the "row" and "seat" together).
db.collection.aggregate([
{
"$unwind": "$orders"
},
{
$sort: {
"orders.row": 1,
"orders.seat": 1
}
},
{
$group: {
_id: "$_id",
tmpOrders: {
$push: {
$concat: [
"$orders.row",
"$orders.seat"
]
}
},
product_id: {
$first: "$product_id"
}
}
},
{
$group: {
_id: {
orders: "$tmpOrders",
product: "$product_id"
},
dupIds: {
$push: "$_id"
}
}
},
{
$match: {
"dupIds.0": {
$exists: true
}
}
},
{
$project: {
_id: 0,
dups: "$dupIds",
}
}
])
Mongo Playground
Now as I said, this approach is not scalable and will take a very long time to run on large collections. So I recommend utilizing indexes, iterating over the distinct product_id values, and executing the pipeline separately for each one.
// wraps the native Promise, not required.
import Bluebird = require('bluebird');
// very fast with index.
const productIds = await collection.distinct('product_id')
await Bluebird.map(productIds, async (productId) => {
const dups = await collection.aggregate([
{
$match: {
product_id: productId
}
}
... same pipeline ...
]).toArray() // aggregate() returns a cursor; materialize it so dups.length works
if (dups.length) {
// logic required.
}
// can control concurrency based on db workload.
}, { concurrency: 5})
Make sure with this approach you have an index built on product_id so it will work efficiently.
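A minimal sketch of that one-time index creation (run it once, e.g. at deploy time):

// without this index, the $match on product_id scans the whole collection
await collection.createIndex({ product_id: 1 });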
I'm trying to get the average rating for a product, plus the count of each rating, and also return the actual ratings, using pagination to limit the amount returned without affecting the average or the counts.
So I'm trying to achieve something like this.
This is my rating collection:
{
    "productId": "3",
    "userid" : 5,
    "rating" : 5,
    "comment": "this is nice"
},
{
    "productId": "3",
    "userid" : 2,
    "rating" : 4,
    "comment": "this is very nice"
}
and this is the end result I want
{
    "_id" : 1,
    "avgRating" : 3.6,
    "counts" : [
        {
            "rating" : 5,
            "count" : 8
        },
        {
            "rating" : 3,
            "count" : 2
        },
        {
            "rating" : 4,
            "count" : 4
        },
        {
            "rating" : 1,
            "count" : 4
        }
    ],
    "ratings" : [
        {
            "productId" : "3",
            "userid" : 5,
            "rating" : 5,
            "comment" : "this is nice"
        },
        {
            "productId" : "3",
            "userid" : 2,
            "rating" : 4,
            "comment" : "this is very nice"
        },
        {
            "productId" : "3",
            "userid" : 12,
            "rating" : 4,
            "comment" : "this is okay"
        }
    ]
}
I have this so far, which gives me the count for each rating:
db.votes.aggregate([
{ $match: { postId: {$in: [1,2]} } },
{
$group: { _id: { post: "$postId", rating: "$vote" }, count: { $sum: 1 } }
},
{
$group: {
_id: "$_id.post",
counts: { $push: { rating: "$_id.rating", count: "$count" } }
}
}
])
You're not far off, we just have to adjust some things:
db.votes.aggregate([
{
$match:
{
postId: {$in: [1, 2]}
}
},
{
$group: {
_id: {post: "$postId", rating: "$vote"},
count: {$sum: 1},
reviews: {$push : "$$ROOT" } //keep the original document
}
},
{
$group: {
_id: "$_id.post",
counts: {$push: {rating: "$_id.rating", count: "$count"}},
reviews: {$push: "$reviews"},
totalItemCount: {$sum: "$count"}, //for avg calculation
totalRating: {$sum: {$multiply: ["$_id.rating", "$count"]}} //for avg calculation: weight each rating by its count
}
},
{
$project: {
_id: "$_id",
avgRating: {$divide: ["$totalRating", "$totalItemCount"]},
counts: "$counts",
reviews: {
$slice: [
{
$reduce: {
input: "$reviews",
initialValue: [],
in: { $concatArrays: ["$$value", "$$this"] }
}
},
0, //skip
10 //limit
]
}
}
}
])
Note that I preserved the current pipeline structure for clarity; however, I feel that a pipeline built around $facet might be more efficient, as we won't have to hold the entire collection in memory while grouping.
We'll split the work in two: one facet with the current pipeline minus the review section, and one with just $skip and $limit stages.
EDIT:
$facet version:
db.votes.aggregate([
{
"$match": {
"postId": {"$in": [1, 2]}
}
},
{
"$facet": {
"numbers": [
{
"$group": {
"_id": {
"post": "$postId",
"rating": "$vote"
},
"count": {
"$sum": 1.0
}
}
},
{
"$group": {
"_id": "$_id.post",
"counts": {
"$push": {
"rating": "$_id.rating",
"count": "$count"
}
},
"totalItemCount": {
"$sum": "$count"
},
"totalRating": {
"$sum": "$_id.rating"
}
}
}
],
"reviews": [
{
"$skip": 0.0
},
{
"$limit": 10.0
}
]
}
},
{
"$unwind": "$numbers"
},
{
"$project": {
"_id": "$numbers._id",
"reviews": "$reviews",
"avgRating": {"$divide": ["$numbers.totalRating", "$numbers.totalItemCount"]},
"counts": "$numbers.counts"
}
}
]);
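If the pagination values come from the request, they can be injected when the pipeline is built; a small sketch (page and pageSize are hypothetical request parameters):

// hypothetical pagination parameters, e.g. parsed from the query string
const page = 0;
const pageSize = 10;
const reviewsFacet = [
    { "$skip": page * pageSize },
    { "$limit": pageSize }
];
// use reviewsFacet as the "reviews" entry of the $facet stage above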
I have a collection in my MongoDB database that stores durations for people who are in groups; it looks like this:
[{
"_id": "5c378eecd11e570240a9b0ac",
"state": "DRAFT",
"groupId": "5c378eebd11e570240a9ae49",
"personId": "5c378eebd11e570240a9aee1",
"date": "2019-01-07T00:00:00.000Z",
"duration": 480,
"__v": 0
},
{
"_id": "5c378eecd11e570240a9b0bb",
"state": "DRAFT",
"groupId": "5c378eebd11e570240a9ae58",
"personId": "5c378eebd11e570240a9aeac",
"date": "2019-01-07T00:00:00.000Z",
"duration": 480,
"__v": 0
},
{
"_id": "5c378eecd11e570240a9b0c5",
"state": "DRAFT",
"groupId": "5c378eebd11e570240a9ae3e",
"personId": "5c378eebd11e570240a9aef6",
"date": "2019-01-07T00:00:00.000Z",
"duration": 480,
"__v": 0
}]
I would like to be able to run an aggregate query which returns a collection of personIds with their durations grouped per day and the corresponding groupId, which would look like this:
[{
"personId": "5c378eebd11e570240a9aee1",
"time": [{
"date": "2019-01-07T00:00:00.000Z",
"entries": [{
"groupId": "5c378eebd11e570240a9ae49",
"duration": 480,
"state": "DRAFT"
}]
}]
}, {
"personId": "5c378eebd11e570240a9aeac",
"time": [{
"date": "2019-01-07T00:00:00.000Z",
"entries": [{
"groupId": "5c378eebd11e570240a9ae58",
"duration": 480,
"state": "DRAFT"
}]
}]
}, {
"personId": "5c378eebd11e570240a9aef6",
"time": [{
"date": "2019-01-07T00:00:00.000Z",
"entries": [{
"groupId": "5c378eebd11e570240a9ae3e",
"duration": 480,
"state": "DRAFT"
}]
}]
}]
So far, I have written the following aggregation (I'm using Mongoose, hence the syntax):
Time.aggregate()
.match({ date: { $gte: new Date(start), $lte: new Date(end) } })
.group({
_id: '$personId',
time: { $push: { date: '$date', duration: '$duration', state: '$state' } },
})
.project({ _id: false, personId: '$_id', time: '$time' })
Which returns the following:
[{
"personId": "5c378eebd11e570240a9aed1",
"time": [{
"date": "2019-01-11T00:00:00.000Z",
"duration": 480,
"state": "DRAFT"
}, {
"date": "2019-01-11T00:00:00.000Z",
"duration": 480,
"state": "DRAFT"
}
// ...
]
}]
Hopefully you can see that the durations are being grouped by personId, but I've not been able to figure out how to apply another grouping to the time array, as the dates are duplicated whenever a personId has more than one duration for a given date.
Is it possible to group by an ID, push to an array, and then group the values in that array within the aggregation, or will my application need to map/reduce the results instead?
I would suggest running two $group operations in a row:
db.time.aggregate({
// first, group all entries by personId and date
$group: {
_id: {
personId: "$personId",
date: "$date"
},
entries: {
$push: {
groupId: "$groupId",
duration: "$duration",
state: "$state"
}
}
}
}, {
// then, group previously aggregated entries by personId
$group: {
_id: "$_id.personId",
time: {
$push: {
date: "$_id.date",
entries: "$entries"
}
}
}
}, {
// finally, rename _id to personId
$project: {
_id: 0,
personId: "$_id",
time: "$time"
}
})
In Mongoose it should look something like this:
Time.aggregate()
.match({
date: {
$gte: new Date(start),
$lte: new Date(end)
}
})
.group({
_id: {
personId: '$personId',
date: '$date'
},
entries: {
$push: {
groupId: '$groupId',
duration: '$duration',
state: '$state'
}
}
})
.group({
_id: '$_id.personId',
time: {
$push: {
date: '$_id.date',
entries: '$entries'
}
}
})
.project({
_id: false,
personId: '$_id',
time: '$time'
})
db.getCollection("dummyCollection").aggregate(
[
{
"$group" : {
"_id" : "$personId",
"time" : {
"$push" : {
"date" : "$date",
"duration" : "$duration",
"state" : "$state"
}
}
}
},
{
"$project" : {
"_id" : false,
"personId" : "$_id",
"time" : "$time"
}
},
{
"$unwind" : "$time"
},
{
"$group" : {
"_id" : "$time.date",
"time" : {
"$addToSet" : "$time"
}
}
}
]
);
Use $addToSet, which returns an array of all unique values that result from applying an expression to each document in a group of documents that share the same group-by key.
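As a tiny standalone illustration against the same collection, this collects each person's distinct states, dropping repeats:

db.getCollection("dummyCollection").aggregate([
    // $addToSet keeps each value once per group, unlike $push
    { "$group": { "_id": "$personId", "states": { "$addToSet": "$state" } } }
]);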
I have a data sample that looks like this:
"diagnostics" : {
"_ID" : "554bbf7b761e06f02fef3561",
"tests" : [
{
"_id" : "59d678064e4645ec562a37e2",
"name" : "RBC",
},
{
"_id" : "59d678064e4645ec562a37e1",
"name" : "Calcium",
}
]
}
I want to get every distinct _ID together with a count for each test name, which would look something like this:
{
    "_ID" : "554bbf7b761e06f02fef3561",
    "tests" : [
        {"name" : "Calcium", "count" : (count of Calcium)},
        {"name" : "RBC", "count" : (count of RBC)}
    ]
}
Keep in mind that tests sits inside diagnostics and can contain any number of name entries (one, two, or more), and I want an individual count for each distinct name. This is what I have tried:
db.collection('transactions').aggregate([
{ $unwind : '$diagnostics.tests' },
{ $group : {
_id: {
"Test_Name" : '$diagnostics.tests.name',
"ID" : '$diagnostics._id'
},
test_count: { $sum: 1 }
}
}
])
and I am getting a result like this:
[
{
"_id": {
"Test_Name": "Fasting Blood Sugar",
"ID": "554bbf7b761e06f02fef3561"
},
"test_count": 76
},
{
"_id": {
"Test_Name": "Fasting Blood Sugar",
"ID": "566726c35dc18d13242fffcc"
},
"test_count": 1
},
{
"_id": {
"Test_Name": "CBC - 7 Part",
"ID": "566726c35dc18d13242fffcc"
},
"test_count": 1
},
{
"_id": {
"Test_Name": "RBC",
"ID": "554bbf7b761e06f02fef3561"
},
"test_count": 1
},
{
"_id": {
"Test_Name": "Fasting Blood Sugar",
"ID": "5a2c9edfe0d0ec71aef1e526"
},
"test_count": 6
},
{
"_id": {
"Test_Name": "Calcium",
"ID": "554bbf7b761e06f02fef3561"
},
"test_count": 77
}
]
Can anybody help me with the query?
You need to use multiple $group stages here.
First $unwind the tests, then $group by the test "name" together with the diagnostics "_ID", then $group again by the "_ID" alone to rebuild the tests array. For the test count you can take the $size of the "tests" array.
db.collection.aggregate([
{ "$unwind": "$diagnostics.tests" },
{ "$group": {
"_id": {
"_id": "$diagnostics.tests.name",
"diagnosticID": "$diagnostics._ID"
},
"count": { "$sum": 1 }
}},
{ "$group": {
"_id": {
"_ID": "$_id.diagnosticID"
},
"tests": {
"$push": {
"name": "$_id._id",
"count": "$count"
}
}
}},
{ "$project": {
"diagnostics._ID": "$_id._ID",
"diagnostics.tests": "$tests",
"_id": 0,
"testCount": { "$size": "$tests" }
}}
])
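For the single sample document in the question, this produces something like the following (the order of the tests array is not guaranteed):

{
    "diagnostics" : {
        "_ID" : "554bbf7b761e06f02fef3561",
        "tests" : [
            { "name" : "RBC", "count" : 1 },
            { "name" : "Calcium", "count" : 1 }
        ]
    },
    "testCount" : 2
}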
Given a collection (name: users) with the following structure:
{
"_id" : ObjectId("57653dcc533304a40ac504fc"),
"username" : "XYZ",
"followers" : [
{
"count" : 31,
"ts" : ISODate("2016-06-17T18:30:00.996Z")
},
{
"count" : 31,
"ts" : ISODate("2016-06-18T18:30:00.288Z")
}
]
}
I want to query this collection by the username field, with each ts returned in 'yyyy-mm-dd' format.
Expected Output:
{
"_id" : ObjectId("57653dcc533304a40ac504fc"),
"username" : "XYZ",
"followers" : [
{
"count" : 31,
"date" : "2016-06-17"
},
{
"count" : 31,
"date" : "2016-06-18"
}
]
}
I have tried something like this:
db.users.aggregate([
{$match:{"username":"xyz"}},
{$project:{ "followers":{"count":1,
"date":"$followers.ts.toISOString().slice(0,10).replace(/-/g,'-')"
}}
}
])
But it doesn't seem to be working. Can anyone please help?
Thanks much.
Consider running an aggregation pipeline that first flattens the followers list with $unwind, then projects the new field using the $dateToString operator, and finally regroups the flattened docs to get your desired result.
Those three operations map onto the stages of the following pipeline:
db.users.aggregate([
{ "$match": { "username": "xyz" } },
{ "$unwind": "$followers" },
{
"$project": {
"username": 1,
"count": "$followers.count",
"date": { "$dateToString": { "format": "%Y-%m-%d", "date": "$followers.ts" } }
}
},
{
"$group": {
"_id": "$_id",
"username": { "$first": "$username" },
"followers": { "$push": {
"count": "$count",
"date": "$date"
}}
}
}
])
With MongoDB 3.4 and newer, you can use the new $addFields pipeline step together with $map to create the array field without the need to unwind and group:
db.users.aggregate([
{ "$match": { "username": "xyz" } },
{
"$addFields": {
"followers": {
"$map": {
"input": "$followers",
"as": "follower",
"in": {
"count": "$$follower.count",
"date": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$$follower.ts"
}
}
}
}
}
}
}
])
The best and easiest way to do this is to transform each element in the array with the $map operator. Of course, in the "in" expression you need to use $dateToString to convert your "date" to a string using a format specifier.
db.coll.aggregate(
[
{ "$match": { "username": "XYZ" } },
{ "$project": {
"username": 1,
"followers": {
"$map": {
"input": "$followers",
"as": "f",
"in": {
"count": "$$f.count",
"date": {
"$dateToString": {
"format": "%Y-%m-%d",
"date": "$$f.ts"
}
}
}
}
}
}}
]
)
which produces:
{
"_id" : ObjectId("57653dcc533304a40ac504fc"),
"username" : "XYZ",
"followers" : [
{
"count" : 31,
"date" : "2016-06-17"
},
{
"count" : 31,
"date" : "2016-06-18"
}
]
}