insertMany does not work properly while streaming in mongoDB - node.js

I'm using Node.js, Mongoose, and MongoDB on the M0 Sandbox (General) Atlas tier. I want to use streaming for better optimization of resource usage. Here's my code:
let counter = 0;
let batchCounter = 0;
const batchSize = 500;
const tagFollowToBeSaved = [];
let startTime;

USER.find({}, { _id: 1 }).cursor()
  .on('data', (id) => {
    if (counter === 0) {
      startTime = process.hrtime();
    }
    if (counter === batchSize) {
      counter = 0;
      const [s2] = process.hrtime();
      console.log('finding ', batchSize, ' data took: ', s2 - startTime[0]);
      TAG_FOLLOW.insertMany(tagFollowToBeSaved, { ordered: false })
        .then((ok) => {
          batchCounter += 1;
          const [s3] = process.hrtime();
          console.log(`batch ${batchCounter} took: ${s3 - s2}`);
        })
        .catch((error) => {
          console.log('error in inserting following: ', error);
        });
      // deleting array
      tagFollowToBeSaved.splice(0, tagFollowToBeSaved.length);
    }
    else {
      counter += 1;
      tagFollowToBeSaved.push({
        tagId: ObjectId('5e81ba5a5c86d7215cdc9c88'),
        followedBy: id,
      });
    }
  })
  .on('error', (error) => {
    console.log('error while streaming: ', error);
  })
  .on('end', () => {
    console.log('END');
  });
What I'm doing:
First, I read the _ids of about 200k users from the USER collection using the streams API (a cursor).
Then I check whether the number of _ids read has reached the batch size limit; if so, I insert that batch into the TAG_FOLLOW model.
All the console.logs are for analysis purposes and you can ignore them.
However, insertMany does not work properly. Monitoring the database while inserting, I can see it inserts only about 100 documents instead of 500 per batch, without throwing any error, so I'm losing roughly 400 documents per batch, which is unacceptable!
I've changed the batchSize value, but it still loses some documents per batch.
So what is the problem? And what is a better approach for saving a large amount of data in an optimized and reliable way?
Thanks in advance.
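For reference, here is a rough, untested sketch of the same batching idea that avoids two likely sources of loss in the code above: the document that triggers the flush is never pushed, and the shared array is emptied while insertMany may still be reading it. It reuses the names from the question (USER, TAG_FOLLOW, batchSize) and flushes the final partial batch on 'end':

```js
const batch = [];

USER.find({}, { _id: 1 }).cursor()
  .on('data', (doc) => {
    // push first, so the document that fills the batch is not dropped
    batch.push({
      tagId: ObjectId('5e81ba5a5c86d7215cdc9c88'),
      followedBy: doc._id,
    });
    if (batch.length === batchSize) {
      // splice returns a copy and empties `batch`, so insertMany
      // never works on an array that is mutated underneath it
      const toInsert = batch.splice(0, batch.length);
      TAG_FOLLOW.insertMany(toInsert, { ordered: false })
        .catch((error) => console.log('error in inserting following: ', error));
    }
  })
  .on('error', (error) => console.log('error while streaming: ', error))
  .on('end', () => {
    // flush whatever is left over (fewer than batchSize documents)
    if (batch.length) {
      TAG_FOLLOW.insertMany(batch, { ordered: false })
        .catch((error) => console.log('error in inserting following: ', error));
    }
    console.log('END');
  });
```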

Related

How to bulk insert in psql using knex.js?

I've searched a lot, but everything I found is outdated.
I'm trying to bulk insert rows into a table. My approach was like this:
knex('test_table').where({
  user: 'user#example.com',
})
  .then(result => {
    knex.transaction(trx => {
      Bluebird.map(result, data => {
        return trx('main_table')
          .insert(data.insert_row)
      }, { concurrency: 3 })
        .then(trx.commit);
    })
      .then(() => {
        console.log("done bulk insert")
      })
      .catch(err => console.error('bulk insert error: ', err))
  })
This would work if the columns were text or numeric, but I have jsonb columns, and I get this error:
invalid input syntax for type json
How can I solve this problem?
It sounds like some of your json columns don't have their data stringified before being sent to the DB.
Also, that is pretty much the slowest way to insert multiple rows, because you are running one query for each inserted row and using a single connection for the inserts.
The concurrency of 3 only causes the pg driver to buffer those queries before they are sent to the DB through the same transaction as all the others.
Something like this should be pretty efficient (didn't test running the code, so there might be errors):
const rows = await knex('test_table').where({ user: 'user#example.com' });

rows.forEach(row => {
  // make sure that json columns are actually json strings
  row.someColumnWithJson = JSON.stringify(row.someColumnWithJson);
});

await knex.transaction(async trx => {
  const chunk = 200;
  // insert rows in 200 row batches
  for (let i = 0; i < rows.length; i += chunk) {
    const rowsToInsert = rows.slice(i, i + chunk);
    await trx('main_table').insert(rowsToInsert);
  }
});
Also knex.batchInsert might work for you.
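For reference, a minimal sketch (untested) of what that could look like: knex.batchInsert takes the table name, the rows, and a chunk size, and can participate in a transaction via .transacting():

```js
// Same idea as the loop above, but letting knex do the chunking.
// Assumes `rows` already has its json columns stringified.
await knex.transaction(async trx => {
  await knex.batchInsert('main_table', rows, 200).transacting(trx);
});
```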

BatchWrite in AWS dynamo db skipping some items

I am trying to write items to AWS DynamoDB using the Node SDK. The problem I am facing is that when I write batch items to AWS in parallel using threads, some of the items are not written to the database. The number of items written is random: if I run my code 3 times, one run might write 150, the next 200, and the third 135. In addition, when I write the items sequentially without threads, some items are still not written, although fewer go missing; for instance, if the total number of items is 300, then 298 are written. I investigated the problem to see if there are any unprocessed items, but the batchWrite method returns nothing, which suggests that all the items are being processed correctly. Please note that I have OnDemand provisioning for the database in question, so I do not expect any throttling issues. So here is my code.
exports.run = async function() {
  // This is the function which runs first !!!!!
  const data = await getArrayOfObjects();
  console.log("TOTAL PRICE CHANGES")
  console.log(data.length)
  const batchesOfData = makeBatches(data)
  const threads = new Set();
  console.log("**********")
  console.log(batchesOfData.length)
  console.log("**********")
  for(let i = 0; i < batchesOfData.length; i++) {
    console.log("BATCH!!!!!")
    console.log(i)
    console.log(batchesOfData[i].length)
    // Sequential approach
    const response = await compensationHelper.createItems(batchesOfData[i])
    console.log("RESPONSE")
    console.log(response)
    // Parallel approach
    // const workerResult = await runService(batchesOfData[i])
    // console.log("WORKER RESUULT!!!!")
    // console.log(workerResult);
  }
}
exports.updateItemsInBatch = async function(data, tableName) {
  console.log("WRITING DATA")
  console.log(data.length)
  const batchItems = {
    RequestItems: {},
  };
  batchItems.RequestItems[tableName] = data;
  try {
    const result = await documentClient.batchWrite(batchItems).promise();
    console.log("UNPROCESSED ITEMS")
    console.log(result)
    if (result instanceof Error) {
      console.log(`[Error]: ${JSON.stringify(Error)}`);
      throw new Error(result);
    }
    return Promise.resolve(true);
  } catch (err) {
    console.error(`[Error]: ${JSON.stringify(err.message)}`);
    return Promise.reject(new Error(err));
  }
};

exports.convertToAWSCompatibleFormat = function(data) {
  const awsCompatibleData = [];
  data.forEach(record => awsCompatibleData.push({ PutRequest: { Item: record } }));
  return awsCompatibleData;
};
const createItems = async function(itemList) {
  try {
    const objectsList = [];
    for (let index = 0; index < itemList.length; index++) {
      try {
        const itemListObj = itemList[index];
        const ObjToBeInserted = {
          // some data assignments here
        };
        objectsList.push(ObjToBeInserted);
        if (
          objectsList.length >= AWS_BATCH_SIZE ||
          index === itemList.length - 1
        ) {
          const awsCompatiableFormat = convertToAWSCompatibleFormat(
            objectsList
          );
          await updateItemsInBatch(
            awsCompatiableFormat,
            process.env.myTableName
          );
        }
      } catch (error) {
        console.log(`[Error]: ${JSON.stringify(error)}`);
      }
    }
    return Promise.resolve(true);
  } catch (err) {
    return Promise.reject(new Error(err));
  }
};

const makeBatches = products => {
  const productBatches = [];
  let countr = -1;
  for (let index = 0; index < products.length; index++) {
    if (index % AWS_BATCH_SIZE === 0) {
      countr++;
      productBatches[countr] = [];
      if (countr === MAX_BATCHES) {
        break;
      }
    }
    try {
      productBatches[countr].push(products[index]);
    } catch (error) {
      continue;
    }
  }
  return productBatches;
};
async function runService(workerData) {
  return new Promise((resolve, reject) => {
    const worker = new Worker(path.join(__dirname, './worker.js'), { workerData });
    worker.on('message', resolve);
    worker.on('error', reject);
    worker.on('exit', (code) => {
      if (code !== 0)
        reject(new Error(`Worker stopped with exit code ${code}`));
    })
  })
}

// My worker file
'use strict';
const { workerData, parentPort } = require('worker_threads')
const createItems = require('myscripts')
// You can do any heavy stuff here, in a synchronous way
// without blocking the "main thread"
console.log("I AM A NEW THREAD")
createItems(workerData)
// console.log('Going to write tons of content on file '+workerData);
parentPort.postMessage({ fileName: workerData, status: 'Done' })
From boto3 documentation:
If one or more of the following is true, DynamoDB rejects the entire batch write operation:
One or more tables specified in the BatchWriteItem request does not exist.
Primary key attributes specified on an item in the request do not match those in the corresponding table's primary key schema.
You try to perform multiple operations on the same item in the same BatchWriteItem request. For example, you cannot put and delete the same item in the same BatchWriteItem request.
Your request contains at least two items with identical hash and range keys (which essentially is two put operations).
There are more than 25 requests in the batch.
Any individual item in a batch exceeds 400 KB.
The total request size exceeds 16 MB.
To me, it looks like some of this is true. At my job, we also had a problem where one batch contained two items with identical partition and sort keys, so the whole batch was discarded. I know it's not Node.js, but we used boto3's batch_writer(overwrite_by_pkeys=[...]) to overcome that problem: it de-duplicates items with the same partition and sort key within the batch, keeping only the last occurrence. If only a small portion of your data is duplicated and you do not need to keep it, you can use this. BUT if you need to save all of your data, I do not advise using this functionality.
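A rough Node.js equivalent of that idea is to de-duplicate each batch yourself before calling batchWrite, keeping only the last occurrence per key. In this sketch, the attribute names pk and sk are placeholders for your table's partition and sort keys:

```js
// Keep only the last PutRequest per (partition key, sort key) pair,
// mirroring what boto3's batch_writer(overwrite_by_pkeys=['pk', 'sk']) does.
function dedupeByKeys(putRequests) {
  const byKey = new Map();
  for (const req of putRequests) {
    const item = req.PutRequest.Item;
    // later occurrences overwrite earlier ones
    byKey.set(`${item.pk}|${item.sk}`, req);
  }
  return [...byKey.values()];
}
```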
I don't see where you are checking the response for UnprocessedItems. Batch operations will often return a list of items they didn't process. As documented, BatchWriteItem "can write up to 16 MB of data, which can comprise as many as 25 put or delete requests."
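A sketch of what that check could look like with the DocumentClient used above (untested; a production version would add exponential backoff between retries):

```js
// Resubmit whatever batchWrite reports back as unprocessed.
let result = await documentClient.batchWrite(batchItems).promise();
while (result.UnprocessedItems && Object.keys(result.UnprocessedItems).length > 0) {
  result = await documentClient
    .batchWrite({ RequestItems: result.UnprocessedItems })
    .promise();
}
```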
I had a duplicate-keys issue, meaning the partition and sort keys had duplicate values within the batch. In my case, however, this error was not returned from the AWS BatchWrite call when my timestamp had only a coarse fraction of a second, e.g. 2020-02-09T08:02:36.71, which was a bit surprising. I resolved the issue by making my createdAt (sort key) more granular, like this: 2020-02-09T08:02:36.7187, thus making it non-repetitive.

How to bulk insert in Mongoose cursor.eachAsync?

I have tried several solutions to get this working, but all failed. I am reading documents from MongoDB using cursor.eachAsync() and converting some doc fields. I need to move these docs to another collection after conversion. My idea is that after 1000 docs are processed, they should be bulk-inserted into the destination collection. This works fine until the last batch of records, which is smaller than 1000. To phrase the same problem differently: if the number of remaining records is < 1000, they are not inserted.
1. First version - bulk insert after async()
Just like any other code, I expected to have the remaining (< 1000) docs in the bulk object after eachAsync() finishes and to be able to insert them, but I find bulk.length is 0. (I have removed those statements from the code snippet below.)
```js
async function run() {
  await mongoose.connect(dbPath, dbOptions);
  const cursor = events.streamEvents(query, 10);
  let successCounter = 0;
  let bulkBatchSize = 1000;
  let bulkSizeCounter = 0;
  let sourceDocsCount = 80;
  var bulk = eventsConvertedModel.collection.initializeOrderedBulkOp();
  await cursor.eachAsync((doc) => {
    let pPan = new Promise((resolve, reject) => {
      getTokenSwap(doc.panTokenIdentifier, doc._id)
        .then((swap) => {
          resolve(swap);
        });
    });
    let pXml = new Promise((resolve, reject) => {
      let xmlObject;
      getXmlObject(doc)
        .then(getXmlObjectToken)
        .then((newXmlString) => {
          resolve(newXmlString);
        })
        .catch(errFromPromise1 => {
        })
        .catch(error => {
          reject(error);
        });
    });
    Promise.all([pPan, pXml])
      .then(([panSwap, xml]) => {
        doc.panTokenIdentifier = panSwap;
        doc.eventRecordTokenText = xml;
        return doc;
      })
      .then((newDoc) => {
        successCounter++;
        bulkSizeCounter++;
        bulk.insert(newDoc);
        if (bulkSizeCounter % bulkBatchSize == 0) {
          bulk.execute()
            .then(result => {
              bulkSizeCounter = 0;
              let msg = "Conversion- bulk insert =" + result.nInserted;
              console.log(msg);
              bulk = eventsConvertedModel.collection.initializeOrderedBulkOp();
              Promise.resolve();
            })
            .catch(bulkErr => {
              logger.error(bulkErr);
            });
        }
        else {
          Promise.resolve();
        }
      })
      .catch(err => {
        console.log(err);
      });
  });
  console.log("outside-async=" + bulk.length); // always 0
  console.log("run()- Docs converted in this run =" + successCounter);
  process.exit(0);
}
```
2. Second version (track the expected number of iterations and, after all iterations, change the batch size to, say, 10).
Result - the batch size value changes, but it's not reflected in bulk.insert and the records are lost.
3. Same as the 2nd, but insert one record at a time after the bulk inserts are done.
```js
let d = eventsConvertedModel(newDoc);
d.isNew = true;
d._id = mongoose.Types.ObjectId();
d.save().then(saved => {
  console.log(saved._id)
  Promise.resolve();
}).catch(saveFailed => {
  console.log(saveFailed);
  Promise.resolve();
});
```
Result - I was getting a DocumentNotFound error, so I added d.isNew = true. But for some reason only a few records get inserted and many of them get lost.
I have also tried other variations using the expected number of bulk insert iterations. Finally, I changed the code to write to a file (one doc at a time), but I am still wondering if there is any way to make the writes to the DB work.
Dependencies:
Node v8.0.0
Mongoose 5.2.2
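For what it's worth, here is a rough, untested sketch of one way to handle the final partial batch. It reuses the names from the question (cursor, getTokenSwap, getXmlObject, getXmlObjectToken, eventsConvertedModel) but swaps the bulk builder for Model.bulkWrite, and simply flushes whatever is left after eachAsync resolves:

```js
const bulkBatchSize = 1000;
let ops = [];

await cursor.eachAsync(async (doc) => {
  // eachAsync waits for the returned promise before pulling the next doc
  const [panSwap, xml] = await Promise.all([
    getTokenSwap(doc.panTokenIdentifier, doc._id),
    getXmlObject(doc).then(getXmlObjectToken),
  ]);
  doc.panTokenIdentifier = panSwap;
  doc.eventRecordTokenText = xml;
  ops.push({ insertOne: { document: doc.toObject() } });

  if (ops.length >= bulkBatchSize) {
    await eventsConvertedModel.bulkWrite(ops, { ordered: true });
    ops = [];
  }
});

// the last batch is smaller than bulkBatchSize, so flush it explicitly
if (ops.length > 0) {
  await eventsConvertedModel.bulkWrite(ops, { ordered: true });
}
```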

How can I use a cursor.forEach() in MongoDB using Node.js?

I have a huge collection of documents in my DB and I'm wondering how I can run through all the documents and update them, each document with a different value.
The answer depends on the driver you're using. All MongoDB drivers I know have cursor.forEach() implemented one way or another.
Here are some examples:
node-mongodb-native
collection.find(query).forEach(function(doc) {
  // handle
}, function(err) {
  // done or error
});
mongojs
db.collection.find(query).forEach(function(err, doc) {
  // handle
});
monk
collection.find(query, { stream: true })
  .each(function(doc){
    // handle doc
  })
  .error(function(err){
    // handle error
  })
  .success(function(){
    // final callback
  });
mongoose
collection.find(query).stream()
  .on('data', function(doc){
    // handle doc
  })
  .on('error', function(err){
    // handle error
  })
  .on('end', function(){
    // final callback
  });
Updating documents inside of .forEach callback
The only problem with updating documents inside of .forEach callback is that you have no idea when all documents are updated.
To solve this problem you should use some asynchronous control flow solution. Here are some options:
async
promises (when.js, bluebird)
Here is an example of using async, using its queue feature:
var q = async.queue(function (doc, callback) {
  // code for your update
  collection.update({
    _id: doc._id
  }, {
    $set: {hi: 'there'}
  }, {
    w: 1
  }, callback);
}, Infinity);

var cursor = collection.find(query);
cursor.each(function(err, doc) {
  if (err) throw err;
  if (doc) q.push(doc); // dispatching doc to async.queue
});

q.drain = function() {
  if (cursor.isClosed()) {
    console.log('all items have been processed');
    db.close();
  }
}
Using the mongodb driver, and modern NodeJS with async/await, a good solution is to use next():
const collection = db.collection('things')
const cursor = collection.find({
  bla: 42 // find all things where bla is 42
});
let document;
while ((document = await cursor.next())) {
  await collection.findOneAndUpdate({
    _id: document._id
  }, {
    $set: {
      blu: 43
    }
  });
}
This results in only one document at a time being required in memory, as opposed to e.g. the accepted answer, where many documents get sucked into memory, before processing of the documents starts. In cases of "huge collections" (as per the question) this may be important.
If documents are large, this can be improved further by using a projection, so that only those fields of documents that are required are fetched from the database.
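For example, with the 3.x+ Node driver's projection option (assuming only _id and bla are actually needed downstream):

```js
const cursor = collection.find(
  { bla: 42 },
  { projection: { _id: 1, bla: 1 } } // fetch only the fields you actually use
);
```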
var MongoClient = require('mongodb').MongoClient,
  assert = require('assert');

MongoClient.connect('mongodb://localhost:27017/crunchbase', function(err, db) {
  assert.equal(err, null);
  console.log("Successfully connected to MongoDB.");
  var query = {
    "category_code": "biotech"
  };
  db.collection('companies').find(query).toArray(function(err, docs) {
    assert.equal(err, null);
    assert.notEqual(docs.length, 0);
    docs.forEach(function(doc) {
      console.log(doc.name + " is a " + doc.category_code + " company.");
    });
    db.close();
  });
});
Notice that the call to .toArray makes the application fetch the entire dataset.
var MongoClient = require('mongodb').MongoClient,
  assert = require('assert');

MongoClient.connect('mongodb://localhost:27017/crunchbase', function(err, db) {
  assert.equal(err, null);
  console.log("Successfully connected to MongoDB.");
  var query = {
    "category_code": "biotech"
  };
  var cursor = db.collection('companies').find(query);
  cursor.forEach(
    function(doc) {
      console.log(doc.name + " is a " + doc.category_code + " company.");
    },
    function(err) {
      assert.equal(err, null);
      return db.close();
    }
  );
});
Notice that the cursor returned by find() is assigned to var cursor. With this approach, instead of fetching all data into memory and consuming it at once, we're streaming the data to our application. find() can create a cursor immediately because it doesn't actually make a request to the database until we try to use some of the documents it will provide. The point of the cursor is to describe our query. The second parameter to cursor.forEach shows what to do when the cursor is exhausted or an error occurs.
In the initial version of the above code, it was toArray() which forced the database call. It meant we needed ALL the documents and wanted them in an array.
Also, MongoDB returns data in batches: as the application iterates the cursor, the driver requests the next batch of documents from the server.
forEach is better than toArray because we can process documents as they come in, until we reach the end. Contrast this with toArray, where we wait for ALL the documents to be retrieved and the entire array to be built. This means we're not getting any advantage from the fact that the driver and the database system are working together to batch results to your application. Batching is meant to provide efficiency in terms of memory overhead and execution time. Take advantage of it if you can in your application.
None of the previous answers mentions batching the updates. That makes them extremely slow 🐌 - tens or hundreds of times slower than a solution using bulkWrite.
Let's say you want to double the value of a field in each document. Here's how to do that fast 💨 and with fixed memory consumption:
// Double the value of the 'foo' field in all documents
let bulkWrites = [];
const bulkDocumentsSize = 100; // how many documents to write at once
let i = 0;
db.collection.find({ ... }).forEach(doc => {
  i++;
  // Update the document...
  doc.foo = doc.foo * 2;
  // Add the update to an array of bulk operations to execute later
  bulkWrites.push({
    replaceOne: {
      filter: { _id: doc._id },
      replacement: doc,
    },
  });
  // Update the documents and log progress every `bulkDocumentsSize` documents
  if (i % bulkDocumentsSize === 0) {
    db.collection.bulkWrite(bulkWrites);
    bulkWrites = [];
    print(`Updated ${i} documents`);
  }
});
// Flush the last <100 bulk writes
db.collection.bulkWrite(bulkWrites);
And here is an example of using a Mongoose cursor asynchronously with promises:
new Promise(function (resolve, reject) {
  collection.find(query).cursor()
    .on('data', function(doc) {
      // ...
    })
    .on('error', reject)
    .on('end', resolve);
})
.then(function () {
  // ...
});
Reference:
Mongoose cursors
Streams and promises
Leonid's answer is great, but I want to reinforce the importance of using async/promises and to give a different solution with a promises example.
The simplest solution to this problem is to loop with forEach over the documents and call an update. Usually, you don't need to close the db connection after each request, but if you do need to close it, be careful: you must only close it once you are sure that all updates have finished executing.
A common mistake here is to call db.close() after all updates are dispatched, without knowing whether they have completed. If you do that, you'll get errors.
Wrong implementation:
collection.find(query).each(function(err, doc) {
  if (err) throw err;
  if (doc) {
    collection.update(query, update, function(err, updated) {
      // handle
    });
  } else {
    db.close(); // if there is any pending update, it will throw an error there
  }
});
However, since db.close() is also an async operation (its signature has a callback option), you may be lucky and this code may finish without errors. It may work only when you need to update just a few docs in a small collection (so, don't rely on it).
Correct solution:
As a solution with async was already proposed by Leonid, below follows a solution using Q promises.
var Q = require('q');
var client = require('mongodb').MongoClient;
var url = 'mongodb://localhost:27017/test';

client.connect(url, function(err, db) {
  if (err) throw err;
  var promises = [];
  var query = {}; // select all docs
  var collection = db.collection('demo');
  var cursor = collection.find(query);

  // read all docs
  cursor.each(function(err, doc) {
    if (err) throw err;
    if (doc) {
      // create a promise to update the doc
      var query = doc;
      var update = { $set: {hi: 'there'} };
      var promise =
        Q.npost(collection, 'update', [query, update])
          .then(function(updated){
            console.log('Updated: ' + updated);
          });
      promises.push(promise);
    } else {
      // close the connection after executing all promises
      Q.all(promises)
        .then(function() {
          if (cursor.isClosed()) {
            console.log('all items have been processed');
            db.close();
          }
        })
        .fail(console.error);
    }
  });
});
The node-mongodb-native driver now supports an endCallback parameter to cursor.forEach, so you can handle the moment AFTER the whole iteration; refer to the official documentation for details: http://mongodb.github.io/node-mongodb-native/2.2/api/Cursor.html#forEach.
Also note that .each is deprecated in the Node.js native driver now.
You can now use (in an async function, of course):
for await (let doc of collection.find(query)) {
await updateDoc(doc);
}
// all done
which nicely serializes all updates.
Let's assume that we have the following MongoDB data in place.
Database name: users
Collection name: jobs
===========================
Documents
{ "_id" : ObjectId("1"), "job" : "Security", "name" : "Jack", "age" : 35 }
{ "_id" : ObjectId("2"), "job" : "Development", "name" : "Tito" }
{ "_id" : ObjectId("3"), "job" : "Design", "name" : "Ben", "age" : 45}
{ "_id" : ObjectId("4"), "job" : "Programming", "name" : "John", "age" : 25 }
{ "_id" : ObjectId("5"), "job" : "IT", "name" : "ricko", "age" : 45 }
==========================
This code:
var MongoClient = require('mongodb').MongoClient;
var dbURL = 'mongodb://localhost/users';

MongoClient.connect(dbURL, (err, db) => {
  if (err) {
    throw err;
  } else {
    console.log('Connection successful');
    var dataBase = db.db();
    // loop with forEach
    dataBase.collection('jobs').find().forEach(function(myDoc) {
      console.log('There is a job called :' + myDoc.job + ' in Database');
    });
  }
});
I looked for a solution with good performance and ended up creating a mix of what I found, which I think works well:
/**
 * This method will read the documents from the cursor in batches and invoke the callback
 * for each batch in parallel.
 * IT IS STRONGLY RECOMMENDED TO CREATE THE CURSOR WITH A batchSize OPTION THAT MATCHES
 * THE VALUE OF batchSize. This way the performance benefits are maxed out, since
 * the mongo instance will send into our process memory the same number of documents
 * that we handle concurrently each time, so no memory space is wasted
 * and the memory usage is limited.
 *
 * Example of usage:
 * const cursor = await collection.aggregate([
 *     {...}, ...],
 *     {
 *       cursor: {batchSize: BATCH_SIZE} // Limiting memory use
 *     });
 * DbUtil.concurrentCursorBatchProcessing(cursor, BATCH_SIZE, async (doc) => ...)
 * @param cursor - A cursor to batch process on.
 * We can get this from our collection.js API by either using aggregateCursor/findCursor
 * @param batchSize - The batch size, should match the batchSize of the cursor option.
 * @param callback - Callback that should be async, will be called in parallel for each batch.
 * @return {Promise<void>}
 */
static async concurrentCursorBatchProcessing(cursor, batchSize, callback) {
  let doc;
  const docsBatch = [];

  while ((doc = await cursor.next())) {
    docsBatch.push(doc);
    if (docsBatch.length >= batchSize) {
      await PromiseUtils.concurrentPromiseAll(docsBatch, async (currDoc) => {
        return callback(currDoc);
      });

      // Emptying the batch array
      docsBatch.splice(0, docsBatch.length);
    }
  }

  // Checking if there is a last batch remaining since it was smaller than batchSize
  if (docsBatch.length > 0) {
    await PromiseUtils.concurrentPromiseAll(docsBatch, async (currDoc) => {
      return callback(currDoc);
    });
  }
}
An example of usage for reading many big documents and updating them:
const cursor = await collection.aggregate([
  {
    ...
  }
], {
  cursor: {batchSize: BATCH_SIZE}, // Limiting memory use
  allowDiskUse: true
});

const bulkUpdates = [];

await DbUtil.concurrentCursorBatchProcessing(cursor, BATCH_SIZE, async (doc: any) => {
  const update: any = {
    updateOne: {
      filter: {
        ...
      },
      update: {
        ...
      }
    }
  };
  bulkUpdates.push(update);

  // Updating if we read too many docs, to clear space in memory
  await this.bulkWriteIfNeeded(bulkUpdates, collection);
});

// Making sure we updated everything
await this.bulkWriteIfNeeded(bulkUpdates, collection, true);
...
private async bulkWriteIfNeeded(
    bulkUpdates: any[], collection: any,
    forceUpdate = false, flushBatchSize = BATCH_SIZE) {
  if (bulkUpdates.length >= flushBatchSize || forceUpdate) {
    // concurrentPromiseChunked is a method that loops over an array in a concurrent way using lodash.chunk and Promise.map
    await PromiseUtils.concurrentPromiseChunked(bulkUpdates, (updatesChunk: any) => {
      return collection.bulkWrite(updatesChunk);
    });

    // Emptying the array
    bulkUpdates.splice(0, bulkUpdates.length);
  }
}

Mongoose (mongodb) batch insert?

Does Mongoose v3.6+ support batch inserts now? I've searched for a few minutes but anything matching this query is a couple of years old and the answer was an unequivocal no.
Edit:
For future reference, the answer is to use Model.create(). create() accepts an array as its first argument, so you can pass your documents to be inserted as an array.
See Model.create() documentation
Model.create() vs Model.collection.insert(): a faster approach
Model.create() is a bad way to do inserts if you are dealing with a very large bulk: it will be very slow. In that case you should use Model.collection.insert, which performs much better. Depending on the size of the bulk, Model.create() may even crash! I tried with a million documents, with no luck; using Model.collection.insert it took just a few seconds.
Model.collection.insert(docs, options, callback)
docs is the array of documents to be inserted;
options is an optional configuration object - see the docs
callback(err, docs) will be called after all documents get saved or an error occurs. On success, docs is the array of persisted documents.
As Mongoose's author points out here, this method will bypass any validation procedures and access the Mongo driver directly. It's a trade-off you have to make since you're handling a large amount of data, otherwise you wouldn't be able to insert it to your database at all (remember we're talking hundreds of thousands of documents here).
A simple example
var Potato = mongoose.model('Potato', PotatoSchema);
var potatoBag = [/* a humongous amount of potato objects */];

Potato.collection.insert(potatoBag, onInsert);

function onInsert(err, docs) {
  if (err) {
    // TODO: handle error
  } else {
    console.info('%d potatoes were successfully stored.', docs.length);
  }
}
Update 2019-06-22: although insert() can still be used just fine, it's been deprecated in favor of insertMany(). The parameters are exactly the same, so you can just use it as a drop-in replacement and everything should work just fine (well, the return value is a bit different, but you're probably not using it anyway).
Reference
Mongo documentation
Aaron Heckman on Google Groups discussing bulk inserts
Mongoose 4.4.0 now supports bulk insert
Mongoose 4.4.0 introduces true bulk insert with the model method .insertMany(). It is way faster than looping over .create() or providing it with an array.
Usage:
var rawDocuments = [/* ... */];

Book.insertMany(rawDocuments)
  .then(function(mongooseDocuments) {
    /* ... */
  })
  .catch(function(err) {
    /* Error handling */
  });
Or
Book.insertMany(rawDocuments, function (err, mongooseDocuments) { /* Your callback function... */ });
You can track it on:
https://github.com/Automattic/mongoose/issues/723
https://github.com/Automattic/mongoose/blob/1887e72694829b62f4e3547283783cebbe66b46b/lib/model.js#L1774
Indeed, you can use Mongoose's "create" method; it can take several documents (or an array of them), as in this example:
Candy.create({ candy: 'jelly bean' }, { candy: 'snickers' }, function (err, jellybean, snickers) {
});
The callback function receives the inserted documents.
You do not always know how many items have to be inserted (with a fixed argument length as above), so you can loop through them:
var insertedDocs = [];
for (var i = 1; i < arguments.length; ++i) {
  insertedDocs.push(arguments[i]);
}
Update: A better solution
A better solution would be to use Candy.collection.insert() instead of Candy.create() - used in the example above - because it's faster (create() calls Model.save() on each item, so it's slower).
See the Mongo documentation for more information:
http://docs.mongodb.org/manual/reference/method/db.collection.insert/
(thanks to arcseldon for pointing this out)
Here are both ways of saving data, with insertMany and with save:
1) Mongoose: save an array of documents in bulk with insertMany
/* write mongoose schema model and export this */
var Potato = mongoose.model('Potato', PotatoSchema);

/* write this api in routes directory */
router.post('/addDocuments', function (req, res) {
  const data = [/* array of object which data need to save in db */];

  Potato.insertMany(data)
    .then((result) => {
      console.log("result ", result);
      res.status(200).json({'success': 'new documents added!', 'data': result});
    })
    .catch(err => {
      console.error("error ", err);
      res.status(400).json({err});
    });
})
2) Mongoose: save an array of documents with .save()
These documents will be saved in parallel.
/* write mongoose schema model and export this */
var Potato = mongoose.model('Potato', PotatoSchema);

/* write this api in routes directory */
router.post('/addDocuments', function (req, res) {
  const saveData = [];
  const data = [/* array of object which data need to save in db */];

  data.map((item) => {
    console.log(item);
    var potato = new Potato(item);
    potato.save()
      .then((result) => {
        console.log(result);
        saveData.push(result);
        if (saveData.length === data.length) {
          res.status(200).json({'success': 'new documents added!', 'data': saveData});
        }
      })
      .catch((err) => {
        console.error(err);
        res.status(500).json({err});
      });
  });
})
You can perform bulk inserts using Mongoose, as in the highest-scored answer. But the example cannot work as is; it should be:
/* a humongous amount of potatos */
var potatoBag = [{name: 'potato1'}, {name: 'potato2'}];

var Potato = mongoose.model('Potato', PotatoSchema);
Potato.collection.insert(potatoBag, onInsert);

function onInsert(err, docs) {
  if (err) {
    // TODO: handle error
  } else {
    console.info('%d potatoes were successfully stored.', docs.length);
  }
}
Don't use schema instances for the bulk insert; you should use plain JavaScript objects.
It seems that with Mongoose there is a limit when inserting more than 1000 documents using
Potato.collection.insert(potatoBag, onInsert);
You can use:
var bulk = Model.collection.initializeOrderedBulkOp();
async.each(users, function (user, callback) {
  bulk.insert(hash);
  callback(); // signal completion to async.each
}, function (err) {
  var bulkStart = Date.now();
  bulk.execute(function(err, res) {
    if (err) console.log(" gameResult.js > err ", err);
    console.log(" gameResult.js > BULK TIME ", Date.now() - bulkStart);
    console.log(" gameResult.js > BULK INSERT ", res.nInserted);
  });
});
But this is almost twice as fast when testing with 10000 documents:
function fastInsert(arrOfResults) {
  var startTime = Date.now();
  var count = 0;
  var c = Math.round(arrOfResults.length / 990);
  var fakeArr = [];
  fakeArr.length = c;
  var docsSaved = 0;

  async.each(fakeArr, function (item, callback) {
    var sliced = arrOfResults.slice(count, count + 999);
    console.log(sliced.length);
    count = count + 999;
    if (sliced.length != 0) {
      GameResultModel.collection.insert(sliced, function (err, docs) {
        docsSaved += docs.ops.length;
        callback();
      });
    } else {
      callback();
    }
  }, function (err) {
    console.log(" gameResult.js > BULK INSERT AMOUNT: ", arrOfResults.length, "docsSaved ", docsSaved, " DIFF TIME:", Date.now() - startTime);
  });
}
You can perform a bulk insert using the mongoDB shell by passing an array of documents to insert:
db.collection.insert([{values}, {values}, {values}, {values}]);
Sharing working and relevant code from our project:
//documentsArray is the list of sampleCollection objects
sampleCollection.insertMany(documentsArray)
  .then((res) => {
    console.log("insert sampleCollection result ", res);
  })
  .catch(err => {
    console.log("bulk insert sampleCollection error ", err);
  });
