Is there a limit to simultaneous Mongo inserts - node.js

I have a collection of Codes that I am populating from a CSV. There are a total of 1.5M codes, quite a few. The CSV is readily parsed into an object of the form:
codes = [
{code:'abc'},
{code:'123'},
etc
]
I initially tried writing this to Mongo in one insert, like so
Code.collection.insert(codes)
(using Mongoose to compose the query directly).
However this failed silently. Assuming some kind of hidden memory issue I began chunking my code, and found that Mongo 2.6 (running locally on my 16Gb Macbook, no replica sets) would accept around 1000 codes in a single insert.
Is this expected behaviour, and if so, is there any rationale to this number?

Try inserting using the Bulk Operation Methods, in particular you would need the db.collection.initializeOrderedBulkOp() method which can be exposed within Mongoose using the Model.collection accessor object. Thus, in the above you could restructure your inserts to do a bulk update as follows:
// Queue one update per code on an ordered bulk operation and flush it every
// 1000 queued operations so that no single request grows too large.
var bulk = Code.collection.initializeOrderedBulkOp(),
    counter = 0;

codes.forEach(function(obj) {
    bulk.find({'code': {'$ne': null}}/* some search */)
        .update({'$set': {'code': obj.code}});
    counter++;
    if (counter % 1000 === 0) {
        bulk.execute(function(err, result) {
            if (err) console.error(err);
        });
        // Start a fresh batch right away: forEach keeps running synchronously,
        // so waiting for the execute callback would queue the next operations
        // onto a batch that has already been sent.
        bulk = Code.collection.initializeOrderedBulkOp();
    }
});

// Flush whatever is left over from the last, partially filled batch.
if (counter % 1000 !== 0) {
    bulk.execute(function(err, result) {
        if (err) console.error(err);
        // get stats
    });
}

Related

How do I iterate over an object containing queries and execute them all

I have an object that looks like this:
// Checks to run: each entry pairs a label (name) with the SQL text (query).
let queries = [
{
name: "checkUsers",
query: 'select * from users where inactive = 1'
},
{
name: "checkSubscriptions",
query: 'select * from subscriptions where invalid = 1'
}
]
I am making an AWS Lambda function that will iterate these queries, and if any of them returns a value, I will send an email.
I have come up with this pseudo code:
for (let prop in queries) {
const result = await mysqlConnector.runQuery(prop.query).catch(async error => {
// handle error in query
});
if (result.length < 0){
// send email
}
}
return;
I am wondering is this ideal approach? I need to iterate all the object queries.
I don't see anything wrong with what you are trying to achieve but there are few changes you could do
Try to use Promise.all if you can. This will speed up the overall process as things will execute in parallel. It will depend on number of queries as well.
Try to leverage executing multiple statements in one query. This way you make a single call and can then add the logic to tell the individual results apart. Check here

Cosmos DB - Deleting a document

How can I delete an individual record from Cosmos DB?
I can select using SQL syntax:
-- Select every document in collection1 whose _ts value is greater than 0.
SELECT *
FROM collection1
WHERE (collection1._ts > 0)
And sure enough all documents (analogous to rows?) are returned
However this doesn't work when I attempt to delete
-- NOTE(review): this is the statement reported as not working; the query
-- language is described as providing SELECT only, not UPDATE or DELETE.
DELETE
FROM collection1
WHERE (collection1._ts > 0)
How do I achieve that?
The DocumentDB API's SQL is specifically for querying. That is, it only provides SELECT, not UPDATE or DELETE.
Those operations are fully supported, but require REST (or SDK) calls. For example, with .net, you'd call DeleteDocumentAsync() or ReplaceDocumentAsync(), and in node.js, this would be a call to deleteDocument() or replaceDocument().
In your particular scenario, you could run your SELECT to identify documents for deletion, then make "delete" calls, one per document (or, for efficiency and transactionality, pass an array of documents to delete, into a stored procedure).
The easiest way is probably by using Azure Storage Explorer. After connecting you can drill down to a container of choice, select a document and then delete it. You can find additional tools for Cosmos DB on https://gotcosmos.com/tools.
Another option to consider is the time to live (TTL). You can turn this on for a collection and then set an expiration for the documents. The documents will be cleaned up automatically for you as they expire.
Create a stored procedure with the following code:
/**
 * Cosmos DB stored procedure that bulk deletes the documents matched by a query.
 * The procedure may have to be executed several times: it stops as soon as a
 * request is no longer accepted and reports that more work is pending.
 *
 * @function
 * @param {string} query - Query selecting the documents to delete
 *   (e.g. "SELECT c._self FROM c WHERE c.founded_year = 2008"). Only `_self`
 *   is read from each result, so prefer SELECT c._self over SELECT *.
 * @returns {Object.<number, boolean>} The response body is set to an object with:
 *   deleted - count of documents deleted by this execution
 *   continuation - true if the procedure should be executed again, false once
 *   the query returns no documents and no continuation token.
 */
function bulkDeleteStoredProcedure(query) {
    var container = getContext().getCollection();
    var containerLink = container.getSelfLink();
    var sprocResponse = getContext().getResponse();
    var progress = {
        deleted: 0,
        continuation: true
    };

    // Validate input.
    if (!query) throw new Error("The query is undefined or null.");

    tryQueryAndDelete();

    // Publishes the current counters as the response body.
    function publishProgress() {
        sprocResponse.setBody(progress);
    }

    // Runs the query (optionally resuming from a continuation token) and hands
    // the outcome to onQueryResult.
    function tryQueryAndDelete(continuation) {
        var accepted = container.queryDocuments(
            containerLink,
            query,
            {continuation: continuation},
            onQueryResult
        );
        // The request was not accepted: report continuation: true.
        if (!accepted) {
            publishProgress();
        }
    }

    // Decides what to do with one page of query results.
    function onQueryResult(err, retrievedDocs, responseOptions) {
        if (err) throw err;

        if (retrievedDocs.length > 0) {
            // Delete as soon as documents come back; tryDelete() re-queries
            // once the page is exhausted, so writes take priority over paging.
            tryDelete(retrievedDocs);
            return;
        }

        if (responseOptions.continuation) {
            // Empty page, but the query is not finished: repeat it with the token.
            tryQueryAndDelete(responseOptions.continuation);
            return;
        }

        // No documents and no token: everything has been deleted.
        progress.continuation = false;
        publishProgress();
    }

    // Deletes the documents of the given array one at a time, front first,
    // and queries for more once the array is empty.
    function tryDelete(documents) {
        if (!(documents.length > 0)) {
            tryQueryAndDelete();
            return;
        }

        var accepted = container.deleteDocument(documents[0]._self, {}, function (err, responseOptions) {
            if (err) throw err;
            progress.deleted++;
            documents.shift();
            // Continue with the next document in the array.
            tryDelete(documents);
        });
        // The request was not accepted: report continuation: true.
        if (!accepted) {
            publishProgress();
        }
    }
}
And execute it using your partition key (example: null) and a query to select the documents (example: SELECT c._self FROM c to delete all).
Based on Delete Documents from CosmosDB based on condition through Query Explorer
Here is an example of how to use bulkDeleteStoredProcedure using .net Cosmos SDK V3.
ContinuationFlag has to be used because of the execution bounds.
/// <summary>
/// Runs the bulk-delete stored procedure repeatedly until it reports that no
/// further execution is needed, and returns the total number of deleted documents.
/// </summary>
/// <param name="query">Query selecting the documents to delete; passed to the stored procedure as its only argument.</param>
/// <param name="partitionKey">Partition key value the stored procedure is executed against.</param>
/// <returns>The sum of the <c>Deleted</c> counts reported by all executions.</returns>
private async Task<int> ExecuteSpBulkDelete(string query, string partitionKey)
{
    var continuationFlag = true;
    var totalDeleted = 0;

    // One execution may stop early because of the execution bounds, so keep
    // calling until the procedure reports continuation == false.
    while (continuationFlag)
    {
        StoredProcedureExecuteResponse<BulkDeleteResponse> result = await _container.Scripts.ExecuteStoredProcedureAsync<BulkDeleteResponse>(
            "spBulkDelete", // your sproc name
            new PartitionKey(partitionKey), // pk value
            new[] { query }); // the method parameter; no variable named `sql` exists here

        var response = result.Resource;
        continuationFlag = response.Continuation;
        var deleted = response.Deleted;
        totalDeleted += deleted;
        Console.WriteLine($"Deleted {deleted} documents ({totalDeleted} total, more: {continuationFlag}, used {result.RequestCharge}RUs)");
    }

    return totalDeleted;
}
and response model:
/// <summary>
/// Response model for the bulk-delete stored procedure's response body.
/// </summary>
public class BulkDeleteResponse
{
/// <summary>Bound to the JSON property "deleted".</summary>
[JsonProperty("deleted")]
public int Deleted { get; set; }
/// <summary>Bound to the JSON property "continuation".</summary>
[JsonProperty("continuation")]
public bool Continuation { get; set; }
}

Limit number of mongodb insertions

Is there a way to set a fixed limit on the number of documents that can be inserted via a bulk insert in mongodb using the node.js client?
I am inserting a number of documents into a collection that has a unique index on fieldA via a bulk insert. Some of the inserts will fail due to fieldA being non-unique, so I can't know how many will be inserted beforehand, but I want to limit the nInserted so that the total of these documents never goes over 5000.
All I can think to do is to run the full insert and if nInserted brings the total above 5000 I remove the n last inserted documents such that the total is 5000 but this seems a bit silly.
The ordered bulk insert is almost right but I don't want it to stop on the first index conflict but keep going if there is still room (ie < 5000 total).
Here's an example of what I'm trying to achieve:
// Count the matching documents first, then bulk insert only if the total is
// still below 5000.
db.myCol.count({foo: val}, function(countErr, existing) {
    var roomLeft = 5000 - existing;
    if (!(roomLeft > 0)) {
        return;
    }

    var batch = db.myCol.initializeUnorderedBulkOp();
    toInsert.forEach(function(doc) {
        batch.insert(doc);
    });

    // Goal: make sure no more than roomLeft documents are inserted.
    batch.execute(function(execErr, outcome) {
        // currently, I would just insert all and
        // then remove the overflow with another db action
        // if nInserted + count > 5000
    });
});
Currently there is no way to tell the Bulk API to stop inserting any records once the limit of successful inserts has been reached.
One way of doing it in the client side,
1. Feed the Bulk API at most n (5000 in this case) documents at a time.
2. If any error occurred during the insert, bulk insert the remaining documents.
3. Do it recursively.
You can further add logic to process only the remaining number of records if remaining < max.
Modified code:
var toInsert = [/* ... */]; // documents to be inserted.
var max = 5000; // max records for Bulk insert; also used as the cap on matching documents.

/**
 * Bulk inserts the window toInsert[start, end), shrunk so that it never holds
 * more documents than there is room for under the cap `max`.
 * When the insert reports an error (e.g. some documents were rejected), the
 * function calls itself with the next window so the freed room can be filled.
 *
 * @param {number} start - Index of the first document to insert (inclusive).
 * @param {number} end - Index at which the window stops (exclusive).
 */
function execBulk(start, end) {
    db.myCol.count({foo: 'bar'}, function(err, count) {
        if (err) {
            // Without a count the remaining room is unknown; do not insert blindly.
            console.log(err);
            return;
        }

        var remaining = max - count;
        if (remaining <= 0 || start >= toInsert.length) {
            return;
        }

        // Never queue more documents than there is room for, nor past the end.
        var batchEnd = Math.min(end, start + remaining, toInsert.length);
        var batch = toInsert.slice(start, batchEnd);
        if (batch.length === 0) {
            return; // empty window (end <= start): nothing to execute
        }

        var bulk = db.myCol.initializeUnorderedBulkOp();
        batch.forEach(function(item) {
            bulk.insert(item);
        });

        // insert the records
        bulk.execute(function(err, result) {
            if (err) {
                console.log(err);
                // Some documents were rejected: try the next window.
                // slice() excludes its end index, so the next window of up to
                // `max` documents is [batchEnd, batchEnd + max).
                execBulk(batchEnd, batchEnd + max);
            } else {
                console.log(result);
            }
        });
    });
}
Invoking the function:
// Start with the first window of documents: indices 0 up to (not including) max.
execBulk(0,max);

Does mongoose store all objects returned from query in memory?

If I query using mongoose and the result set coming back is 1 million records, are all those records stored in memory?
Can I iterate the results such that a cursor is used? Is this done automagically? If so is there a certain way I need to iterate the results?
// Query with an empty filter; the callback's second argument, `reports`, is
// iterated below with forEach and by index. `err` is not checked here.
Reports.find({}, function(err, reports) {
// Are all million reports stored in memory?
reports.forEach(function(report) {
// iterate through reports
});
// another way to iterate
for (var i = 0; i < reports.length; i++) {
var report = reports[i];
}
});

Creating incrementing numbers with mongoDB

We have an order system where every order has an id. For accounting purposes we need a way to generate invoices with incrementing numbers. What is the best way to do this without using an SQL database?
We are using node to implement the application.
http://www.mongodb.org/display/DOCS/How+to+Make+an+Auto+Incrementing+Field
The first approach is keeping counters in a side document:
One can keep a counter of the current _id in a side document, in a
collection dedicated to counters. Then use FindAndModify to atomically
obtain an id and increment the counter.
The other approach is to loop optimistically and handle dup key error code of 11000 by continuing and incrementing the id for the edge case of collisions. That works well unless there's high concurrency writes to a specific collection.
One can do it with an optimistic concurrency "insert if not present"
loop.
But be aware of the warning on that page:
Generally in MongoDB, one does not use an auto-increment pattern for
_id's (or other fields), as this does not scale up well on large database clusters. Instead one typically uses Object IDs.
Other things to consider:
Timestamp - unique long but not incrementing (base on epoch)
Hybrid Approach - apps don't necessarily have to pick one storage option.
Come up with your id mechanism based on things like customer, date/time parts etc... that you generate and handle collisions for. Depending on the scheme, collisions can be much less likely. Not necessarily incrementing but is unique and has a well defined readable pattern.
I did not find any working solution, so I implemented the "optimistic loop" in node.js to get auto-incrementing integer ID fields. It uses the async module to implement the while loop.
/**
 * Insert the document into targetCollection using an auto-incremented integer
 * _id instead of a generated one ("optimistic loop"): look up the highest
 * integer _id, try to insert with that value + 1, and retry when another
 * writer took the same _id in the meantime.
 *
 * @param {Object} targetCollection - Collection to insert into; its insert() is called directly and find() via findCursor().
 * @param {Object} document - Document to store; its _id property is overwritten.
 * @param {Function} callback - Called once as callback(err, null) when the loop ends.
 */
function insertDocument(targetCollection, document, callback) {
    var DUPLICATE_KEY = 11000; // dup key error code: the only error worth retrying
    var keepRunning = true;
    // $type 16/18: Integer Values
    var isNumericQuery = {$or : [{"_id" : { $type : 16 }}, {"_id" : { $type : 18 }}]};

    async.whilst(testFunction, mainFunction, afterFinishFunction);

    // Called before each execution of mainFunction(). Works like the stop criteria of a while function.
    function testFunction() {
        return keepRunning;
    }

    // Called each time the testFunction() passes. It is passed a function (next) which must be called after it has completed.
    function mainFunction(next) {
        // cursor.each() may invoke its callback more than once (for the
        // document and again with null at the end); only the first invocation
        // may trigger an insert, so that next() is called exactly once per pass.
        var handled = false;

        findCursor(targetCollection, findCursorCallback, isNumericQuery, { _id: 1 });

        function findCursorCallback(cursor) {
            // Highest integer _id first, and only that one.
            cursor.sort( { _id: -1 } ).limit(1);
            cursor.each(cursorEachCallback);
        }

        function cursorEachCallback(err, doc) {
            if (handled) {
                return;
            }
            handled = true;

            if (err) {
                console.error("ERROR: " + err);
                next(err); // stop the loop instead of never calling next()
                return;
            }

            // No integer _id yet: start at 1; otherwise continue after the highest one.
            document._id = (doc != null) ? doc._id + 1 : 1;
            targetCollection.insert(document, insertCallback);
        }

        function insertCallback(err, result) {
            if (!err) {
                keepRunning = false;
                next();
                return;
            }

            console.dir(err);
            if (err.code === DUPLICATE_KEY) {
                next(); // the _id was taken in the meantime: look it up again
            } else {
                next(err); // any other failure would repeat forever: give up
            }
        }
    }

    // Called once after the testFunction() fails or an error ended the loop.
    function afterFinishFunction(err) {
        callback(err, null);
    }
}
/**
 * Runs collection.find() and passes the resulting cursor to the callback.
 *
 * @param {Object} collection - Object whose find() method is called.
 * @param {Function} callback - Receives the cursor returned by find().
 * @param {Object} [optQueryObject] - Query criteria, forwarded unchanged.
 * @param {Object} [optProjectionObject] - Projection; an empty object is used when undefined.
 */
function findCursor(collection, callback, optQueryObject, optProjectionObject) {
    var projection = (optProjectionObject === undefined) ? {} : optProjectionObject;
    callback(collection.find(optQueryObject, projection));
}
Call with
// The callback must declare `err`; otherwise reading it throws a ReferenceError.
insertDocument(db.collection(collectionName), documentToSave, function(err) {if(err) console.error(err);});

Resources