Mongodb: index on subdocument array - node.js

So i have this structure of a document:
{
_id: "123abc",
mainProps: [
{
"countrycode": "US"
},
{
"yearfounded": "2011"
},
{
"city": "New York"
},
...
],
otherProps: [{}, {}, ...]
}
I have an index set like this:
db.companies.ensureIndex({mainProps: 1})
The task is to create a webform for searching in these documents. Fields in the form are not fixed and can be added. Basically I don't know which fields the user would like to filter on, so I can't set a proper compound index. The database will hold more than 20 million documents; right now it has about 10 million.
The problem is that my index doesn't work, or works wrong.
See some examples.
This query has no index at all.
db.companies.find({'mainProps.yearfounded': '2012'}).explain()
This query uses index and is fine.
db.companies.find({mainProps:{'yearfounded': '2012'}}).explain()
And something like this hangs (if I remove the explain() ) and I don't know whether it's executing or what's happening.
db.companies.find(
{$or: [
{ mainProps: {foundedyear: '2012'}},
{ mainProps: {foundedyear: '2011'}},
]}
).explain()
For the last query explain I got something like this.
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "leadsbase.companies",
"indexFilterSet" : false,
"parsedQuery" : {
"$or" : [
{
"mainProps" : {
"$eq" : {
"foundedyear" : "2012"
}
}
},
{
"mainProps" : {
"$eq" : {
"foundedyear" : "2011"
}
}
}
]
},
"winningPlan" : {
"stage" : "SUBPLAN",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"mainProps" : 1
},
"indexName" : "mainProps_1",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 1,
"direction" : "forward",
"indexBounds" : {
"mainProps" : [
"[{ foundedyear: \"2011\
" }, { foundedyear: \"2011\" }]",
"[{ foundedyear: \"2012\
" }, { foundedyear: \"2012\" }]"
]
}
}
}
},
"rejectedPlans" : [ ]
},
"serverInfo" : {
"host" : "vm1",
"port" : 27017,
"version" : "3.2.8",
"gitVersion" : "ed70e33130c977bda0024c125b56d159573dbag0"
},
"ok" : 1
}
So as far as I understand index is present, but for some reason not working.
How should I structure my fields or how should I set indexes for this?

createIndex() will create indexes on collections whereas ensureIndex() creates an index on the specified field if the index does not already exist.
So your second query works while the first query fails. Try deleting your index with dropIndex(), and then rebuild the index with createIndex().
One way to check the performance of your index scan is to look at "executionStats":
db.collection.explain("executionStats").find( <your query>)
and then, from the result, check these two fields:
executionStats.totalKeysExamined, executionStats.totalDocsExamined
In most cases, if your index is good, both should be the same number. You can read more in the documentation; the output has this shape:
"executionStats" : {
"executionSuccess" : <boolean>,
"nReturned" : <int>,
"executionTimeMillis" : <int>,
"totalKeysExamined" : <int>, // this is your index keys
"totalDocsExamined" : <int>, // this is total docs examined
"executionStages" : {
"stage" : <STAGE1>
"nReturned" : <int>,
"executionTimeMillisEstimate" : <int>,
"works" : <int>,
"advanced" : <int>,
"needTime" : <int>,
"needYield" : <int>,
"isEOF" : <boolean>,
...
"inputStage" : {
"stage" : <STAGE2>,
...
"nReturned" : <int>,
"executionTimeMillisEstimate" : <int>,
"keysExamined" : <int>,
"docsExamined" : <int>,
...
"inputStage" : {
...
}
}
},

Related

Mongo update to many query

Does anyone know how to change a property value in Mongo?
from
I want to change JSON.units[0].services[0].label from "etiket controle" to "AuditLabel".
I tried this:
db.getCollection('assortiment').updateMany({"units.$[].services.$[].label": "etiket controle"},{ "$set": { "units.$[].services.$[].label": "AuditLabel" }})
but without success, because the filter doesn't select anything.
{
"_id" : "764",
"meta" : {
"groupsId" : "764",
"type" : "DRYGR"
},
"units" : [
{
"unit" : "BASE_UNIT_OR_EACH",
"gtin" : "08711728556206",
"services" : [
{
"label" : "etiket controle",
"collection" : "gtins"
}
]
}
]
}
to
{
"_id" : "764",
"meta" : {
"groupsId" : "764",
"type" : "DRYGR"
},
"units" : [
{
"unit" : "BASE_UNIT_OR_EACH",
"gtin" : "08711728556206",
"services" : [
{
"label" : "AuditLabel",
"collection" : "gtins"
}
]
}
]
}
Try this code; it works for me:
db.getCollection('assortiment').updateMany(
{
"units.services.label": "etiket controle"
},
{
"$set":
{ "units.$[].services.$[].label": "AuditLabel" }
}
)

High load on simple Mongo find with indices

I have a mongoDB I'm querying using NodeJS (running mongoose).
In this particular case I'm querying a bunch of collections and piping the data as CSV into archiverjs to create a zip file. An incoming request triggers a query through mongoose using a mongo cursor; the cursor is piped through a pipeline that ends in archiverjs and, ultimately, in the HTTP response delivering the zip file to the user.
async function getSortedQueryCursor(...) {
...
const query = MODEL_LOOKUP[fileType]
.find(reducer)
.sort({ [idString]: 'asc' });
return query.cursor();
}
async function getData(...) {
const cursor = await getSortedQueryCursor(...);
return cursor
.pipe(filter1Stream)
.pipe(filter2Stream)
.pipe(filter3Stream)
.pipe(csvStringifyStream);
}
router.post('/:scenarioId', async (request, response) => {
...
const archive = Archiver(...);
archive.pipe(response);
const result = await getData(...);
archive.append(stream, { name: filepath });
return archive.finalize();
}
As soon as a particular collection is in the game (the collection holds roughly 40 million documents) the query lasts very long (>15s) and I can see the mongo process on 100% CPU during that time. Even more surprising as the result set is empty (no documents matching the query).
It's a rather simple query:
items.find({ scenarioId: 'ckqf5ulg38gu208eecxlf95fc' }, { sort: { dataId: 1 } })
I have indices on scenarioId and dataId. If I run the query on the shell it returns in 30ms.
An explain() results in:
[
{
"queryPlanner": {
"plannerVersion": 1,
"namespace": "data.items",
"indexFilterSet": false,
"parsedQuery": {
"scenarioId": {
"$eq": "ckqf5ulg38gu208eecxlf95fc"
}
},
"winningPlan": {
"stage": "SORT",
"sortPattern": {
"itemId": 1
},
"memLimit": 104857600,
"type": "simple",
"inputStage": {
"stage": "FETCH",
"inputStage": {
"stage": "IXSCAN",
"keyPattern": {
"scenarioId": 1
},
"indexName": "scenarioId_1",
"isMultiKey": false,
"multiKeyPaths": {
"scenarioId": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"scenarioId": [
"[\"ckqf5ulg38gu208eecxlf95fc\", \"ckqf5ulg38gu208eecxlf95fc\"]"
]
}
}
}
},
"rejectedPlans": [
...
]
},
"executionStats": {
"executionSuccess": true,
"nReturned": 0,
"executionTimeMillis": 0,
"totalKeysExamined": 0,
"totalDocsExamined": 0,
"executionStages": {
"stage": "SORT",
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"works": 3,
"advanced": 0,
"needTime": 1,
"needYield": 0,
"saveState": 0,
"restoreState": 0,
"isEOF": 1,
"sortPattern": {
"dataId": 1
},
"memLimit": 104857600,
"type": "simple",
"totalDataSizeSorted": 0,
"usedDisk": false,
"inputStage": {
"stage": "FETCH",
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"works": 1,
"advanced": 0,
"needTime": 0,
"needYield": 0,
"saveState": 0,
"restoreState": 0,
"isEOF": 1,
"docsExamined": 0,
"alreadyHasObj": 0,
"inputStage": {
"stage": "IXSCAN",
"nReturned": 0,
"executionTimeMillisEstimate": 0,
"works": 1,
"advanced": 0,
"needTime": 0,
"needYield": 0,
"saveState": 0,
"restoreState": 0,
"isEOF": 1,
"keyPattern": {
"scenarioId": 1
},
"indexName": "scenarioId_1",
"isMultiKey": false,
"multiKeyPaths": {
"scenarioId": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"scenarioId": [
"[\"ckqf5ulg38gu208eecxlf95fc\", \"ckqf5ulg38gu208eecxlf95fc\"]"
]
},
"keysExamined": 0,
"seeks": 1,
"dupsTested": 0,
"dupsDropped": 0
}
}
},
...
},
"serverInfo": {
...
"version": "4.4.6",
"gitVersion": "72e66213c2c3eab37d9358d5e78ad7f5c1d0d0d7"
},
...
}
]
It tells me (I'm not very experienced in interpreting those results) that the query is quite cheap: "executionTimeMillisEstimate": 0, as it's not running a document scan "docsExamined": 0,.
Next I connected to the mongo server and ran db.currentOp({"secs_running": {$gte: 5}}) to get some information from this side:
{
"type" : "op",
...
"clientMetadata" : {
"driver" : {
"name" : "nodejs|Mongoose",
"version" : "3.6.5"
},
"os" : {
"type" : "Linux",
"name" : "linux",
"architecture" : "x64",
"version" : "5.8.0-50-generic"
},
"platform" : "'Node.js v14.17.0, LE (unified)",
"version" : "3.6.5|5.12.3"
},
"active" : true,
"secs_running" : NumberLong(16),
"microsecs_running" : NumberLong(16661409),
"op" : "query",
"ns" : "data.items",
"command" : {
"find" : "items",
"filter" : {
"scenarioId" : "ckqf5ulg38gu208eecxlf95fc"
},
"sort" : {
"itemId" : 1
},
"projection" : {
},
"returnKey" : false,
"showRecordId" : false,
"lsid" : {
"id" : UUID("be3ce18b-5365-4680-b734-543d06418301")
},
"$clusterTime" : {
"clusterTime" : Timestamp(1625498044, 1),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : 0
}
},
"$db" : "data",
"$readPreference" : {
"mode" : "primaryPreferred"
}
},
"numYields" : 14701,
"locks" : {
"ReplicationStateTransition" : "w",
"Global" : "r",
"Database" : "r",
"Collection" : "r"
},
"waitingForLock" : false,
"lockStats" : {
"ReplicationStateTransition" : {
"acquireCount" : {
"w" : NumberLong(14702)
}
},
"Global" : {
"acquireCount" : {
"r" : NumberLong(14702)
}
},
"Database" : {
"acquireCount" : {
"r" : NumberLong(14702)
}
},
"Collection" : {
"acquireCount" : {
"r" : NumberLong(14702)
}
},
"Mutex" : {
"acquireCount" : {
"r" : NumberLong(1)
}
}
},
"waitingForFlowControl" : false,
"flowControlStats" : {
}
}
Any ideas how to improve the performance or find the bottleneck in my application? As the load is high on mongo side and no documents are found/passed to the application I guess it's mongo having trouble ...
EDIT: I've logged the whole process from DB side using db.setProfilingLevel(2) and db.system.profile.find().pretty(). Here we can see that the whole collection (or am I misinterpreting "docsExamined" : 39612167?) is queried:
{
"op" : "query",
"ns" : "data.items",
"command" : {
"find" : "items",
"filter" : {
"scenarioId" : "ckqf5ulg38gu208eecxlf95fc"
},
"sort" : {
"dataId" : 1
},
"projection" : {
},
...
"$db" : "data",
"$readPreference" : {
"mode" : "primaryPreferred"
}
},
"keysExamined" : 39612167,
"docsExamined" : 39612167,
"cursorExhausted" : true,
"numYield" : 39613,
"nreturned" : 0,
"queryHash" : "B7F40289",
"planCacheKey" : "BADED068",
"locks" : {
"ReplicationStateTransition" : {
"acquireCount" : {
"w" : NumberLong(39615)
}
},
"Global" : {
"acquireCount" : {
"r" : NumberLong(39615)
}
},
"Database" : {
"acquireCount" : {
"r" : NumberLong(39614)
}
},
"Collection" : {
"acquireCount" : {
"r" : NumberLong(39614)
}
},
"Mutex" : {
"acquireCount" : {
"r" : NumberLong(1)
}
}
},
"flowControl" : {
},
"storage" : {
},
"responseLength" : 242,
"protocol" : "op_msg",
"millis" : 48401,
"planSummary" : "IXSCAN { dataId: 1 }",
"execStats" : {
"stage" : "CACHED_PLAN",
"nReturned" : 0,
"executionTimeMillisEstimate" : 48401,
"works" : 1,
"advanced" : 0,
"needTime" : 0,
"needYield" : 0,
"saveState" : 39613,
"restoreState" : 39613,
"isEOF" : 1,
"inputStage" : {
"stage" : "FETCH",
"filter" : {
"scenarioId" : {
"$eq" : "ckqf5ulg38gu208eecxlf95fc"
}
},
"nReturned" : 0,
"executionTimeMillisEstimate" : 6270,
"works" : 39612168,
"advanced" : 0,
"needTime" : 39612167,
"needYield" : 0,
"saveState" : 39613,
"restoreState" : 39613,
"isEOF" : 1,
"docsExamined" : 39612167,
"alreadyHasObj" : 0,
"inputStage" : {
"stage" : "IXSCAN",
"nReturned" : 39612167,
"executionTimeMillisEstimate" : 2151,
"works" : 39612168,
"advanced" : 39612167,
"needTime" : 0,
"needYield" : 0,
"saveState" : 39613,
"restoreState" : 39613,
"isEOF" : 1,
"keyPattern" : {
"dataId" : 1
},
"indexName" : "dataId_1",
"isMultiKey" : false,
"multiKeyPaths" : {
"dataId" : [ ]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"dataId" : [
"[MinKey, MaxKey]"
]
},
"keysExamined" : 39612167,
"seeks" : 1,
"dupsTested" : 0,
"dupsDropped" : 0
}
}
}
(As usual) it seems like the indices are not properly set. I've created a new (secondary?) index:
{
"dataId" : 1,
"scenarioId": 1
}
and now the query returns within milliseconds ...
EDIT: What still makes me wonder is that the shell query returned in milliseconds and the mongoose query took ages. Even though the queries seem to be identical (from my point of view) mongo treats them differently.

Why is sort functionality not using the index I created (even with the use of $hint)

I have created an ascending and a descending index for the field a:
db.getCollection('objectlists').createIndex( { a: 1 } )
db.getCollection('objectlists').createIndex( { a: -1 } )
When I use this index in the find functionality, it works perfectly even on large amount of data
db.getCollection('objectlists').find({a: {$gt: 0}}) --> returns instantly.
However, when I use it for sort such as:
db.getCollection('objectlists').find().sort({a: 1}), I get:
Error: error: {
"ok" : 0,
"errmsg" : "Executor error during find command :: caused by :: Sort operation used more than the maximum 33554432 bytes of RAM. Add an index, or specify a smaller limit.",
"code" : 96,
"codeName" : "OperationFailed"
}
I have even tried adding hint:
db.getCollection('objectlists').find().sort({a: 1}).hint({a: 1});
But I end up getting the same error. I also tried using ensureIndex() before running this query, but that did not help either. What is the issue? Am I misunderstanding how index sorting works?
The output of db.getCollection('objectlists').find().sort({a: 1}).explain() is
{
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "App.objectlists",
"indexFilterSet" : false,
"parsedQuery" : {},
"winningPlan" : {
"stage" : "SORT",
"sortPattern" : {
"a" : 1.0
},
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "FETCH",
"inputStage" : {
"stage" : "IXSCAN",
"keyPattern" : {
"a" : 1
},
"indexName" : "a_1",
"isMultiKey" : true,
"multiKeyPaths" : {
"a" : [
"a"
]
},
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : 2,
"direction" : "forward",
"indexBounds" : {
"a" : [
"[MinKey, MaxKey]"
]
}
}
}
}
},
"rejectedPlans" : []

Need to apply two groups in sequence, where the second group operates on the result of the first group

I want to group my data by the factoryId field. Each factory will have multiple orders, so within each factory I want to group again by orderId, as each order can contain multiple items. Below is an example of my data, the result I need, and the first group-by I tried.
{
"_id" : ObjectId("5b3e270c42d8004cea382e87"),
"factoryId" : ObjectId("5aa76190cef23a1561b8056c"),
"productId" : ObjectId("5aa78c66cef23a1561b80893"),
"orderId" : ObjectId("5b3e270c42d8004cea382e86"),
"generatedOrderId" : "3985-166770-4554",
"productName" : "Lakme Lotion"
},
{
"_id" : ObjectId("5b3e270c42d8004cea382e88"),
"factoryId" : ObjectId("5b39aed32832f72062e51c23"),
"productId" : ObjectId("5b3cb96139cec8341df52c4b"),
"orderId" : ObjectId("5b3e270c42d8004cea382e86"),
"generatedOrderId" : "3985-166770-4554",
"productName" : "Coke"
},
{
"_id" : ObjectId("5b3e27b07fe0d94d62b76b2a"),
"factoryId" : ObjectId("5aa76190cef23a1561b8057c"),
"productId" : ObjectId("5ac21075ac347a5fbf355028"),
"orderId" : ObjectId("5b3e27b07fe0d94d62b76b27"),
"generatedOrderId" : "3985-755507-7484",
"productName" : "Spoon"
}
And I want result as:
{
"factoryId":ObjectId("5aa76190cef23a1561b8057c"),
"orders":[
{
"orderId":ObjectId("5b3e270c42d8004cea382e86"),
"items":[
{
"productName":"Lakme Lotion"
},
{
"productName":"Coke"
}
]
}
]
}
Can anyone help me with this?. Any help is appreciated.
I tried and It worked for me. Sorry
db.getCollection("transactions").aggregate(
[
{
"$group" : {
"_id" : "$orderId",
"items" : {
"$push" : "$$ROOT"
}
}
},
{
"$project" : {
"orderId" : "$_id",
"items" : "$items",
"_id" : 0
}
},
{
"$unwind" : {
"path" : "$items",
"preserveNullAndEmptyArrays" : false
}
},
{
"$group" : {
"_id" : "$items.factoryId",
"orders" : {
"$push" : "$$ROOT"
}
}
},
{
"$project" : {
"factoryId" : "$_id",
"orders" : "$orders",
"_id" : 0
}
}
]
);

How can I check if my aggregate query is good or bad?

How can I check if my aggregate query is good or bad? I already use "explain" but it's not specific.
Here's the output of my aggregate "explain".
{
"waitedMS" : NumberLong(0),
"stages" : [
{
"$cursor" : {
"query" : {
"keys.license_id" : ObjectId("580eeb7fb79bec95648775a2"),
"deleted.status" : {
"$ne" : 1
}
},
"queryPlanner" : {
"plannerVersion" : 1,
"namespace" : "serpentsmsapp.conversations",
"indexFilterSet" : false,
"parsedQuery" : {
"$and" : [
{
"keys.license_id" : {
"$eq" : ObjectId("580eeb7fb79bec95648775a2")
}
},
{
"$not" : {
"deleted.status" : {
"$eq" : 1
}
}
}
]
},
"winningPlan" : {
"stage" : "COLLSCAN",
"filter" : {
"$and" : [
{
"keys.license_id" : {
"$eq" : ObjectId("580eeb7fb79bec95648775a2")
}
},
{
"$not" : {
"deleted.status" : {
"$eq" : 1
}
}
}
]
},
"direction" : "forward"
},
"rejectedPlans" : [ ]
}
}
},
{
"$sort" : {
"sortKey" : {
"_id" : 1,
"keys.license_id" : 1,
"status.deleted" : 1
}
}
},
{
"$lookup" : {
"from" : "conversation_messages",
"as" : "cmf",
"localField" : "_id",
"foreignField" : "keys.conv_id",
"unwinding" : {
"preserveNullAndEmptyArrays" : false
}
}
},
{
"$match" : {
"cmf.deleted.status" : 0
}
},
{
"$sort" : {
"sortKey" : {
"cmf.deleted.status" : -1
}
}
},
{
"$group" : {
"_id" : {
"id" : "$_id",
"number" : "$number",
"mode" : "$mode",
"keys" : "$keys",
"ports" : "$ports",
"user_assign" : "$user_assign",
"spam" : "$spam"
},
"cm_field" : {
"$last" : {
"id" : "$cmf._id",
"message" : "$cmf.message",
"ports" : "$cmf.ports",
"mode" : "$cmf.mode",
"keys" : "$cmf.keys",
"status" : "$cmf.status",
"date" : "$cmf.updated"
}
},
"counts" : {
"$sum" : "$cmf.status"
},
"sms_mode" : {
"$addToSet" : "$cmf.mode"
}
}
},
{
"$sort" : {
"sortKey" : {
"cm_field.date" : -1
},
"limit" : NumberLong(5000)
}
},
{
"$group" : {
"_id" : "$_id",
"cm_field" : {
"$last" : "$cm_field"
},
"counts" : {
"$first" : "$counts"
},
"sms_mode" : {
"$first" : "$sms_mode"
}
}
}
],
"ok" : 1
}
How can I see how many docs are scanned before performing my query?
In order to solve your problem I suggest you the following plan
Learn how explain works.
A "good or bad" approach is not an engineering approach; it always depends on many factors (requirements, hardware, etc.). You should define those requirements before deciding whether the query is performant enough.
For your specific explain output: "COLLSCAN" means the query scanned every document in the collection to produce the result. This is usually a warning sign, and you should think about adding proper indexes.

Resources