How to insert millions of documents into MongoDB with Node.js - node.js

var MongoClient = require('mongodb').MongoClient;
var util = require('util');

MongoClient.connect(url, function(err, db) {
    var batch = db.collection("chatmessage").initializeOrderedBulkOp();
    //var batch = db.collection("chatmessage").initializeUnorderedBulkOp();
    var messageNum = 0;
    var chatmessage = null;
    var count = 0;
    for (var i = 0; i < 300; i++) {
        messageNum = getMessage(); // value from 1~500000
        for (var j = 0; j < messageNum; j++) {
            count++;
            chatmessage = generateChatMessage();
            batch.insert(chatmessage);
            if (count >= 1000) {
                count = 0;
                batch.execute(function(err) {
                    console.log(err);
                    batch = db.collection("chatmessage").initializeOrderedBulkOp();
                    console.log("batch execute" + util.inspect(process.memoryUsage()));
                    //db.close();
                });
            }
        }
        console.log("execute one chatroom" + util.inspect(process.memoryUsage()));
    }
    if (count > 0) {
        batch.execute(function(err) {
            console.log(err);
            batch = db.collection("chatmessage").initializeOrderedBulkOp();
        });
    }
});
I need to populate millions of messages into MongoDB with Node.js, using the Bulk API to do the inserts in batches, but I have some questions about the code.
The bulk execute method runs asynchronously. When inserting 100,000 documents I don't see any bulk execute finish until all of the code has finished executing; only then is "batch execute" printed.
When the value of messageNum is large, around 50,000, the process runs out of memory:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - process out of memory
The variables are all defined outside the loop and batch.execute has been run, so I don't understand why this happens.
When the record count is large, the RSS increases rapidly and never decreases, as if it were not managed by the V8 engine; it keeps growing until it reaches my computer's memory size. This is related to the DB: when I remove the DB operation there is no problem. I guess the batch.execute() method holds this memory but can't release it, even with db.close():
{ rss: 1449750528, heapTotal: 1091999056, heapUsed: 922237384 }
-------------------------------------------UPDATE 1------------------------------
I have taken several heap dump snapshots with the heapdump package.
The root cause is that batch.execute is called asynchronously: it never runs until all of the other code has executed, as I mentioned in my first question. (I also find this strange, because even though batch.execute() is asynchronous it should run independently, not be held up by the main loop; yet I have not found the documents written to the DB, and the log line in the callback is never printed.)
So all of the documents that need to be inserted into Mongo stay in memory and cause the issue.
@joeytwiddle, I found that you have a similar opinion on this problem,
from this bulk-upsert-in-mongodb.
I have not found a way to configure bulk.execute() to run synchronously.
Does anyone have an idea how to solve this problem?

I was also getting this error, but when I used this code it worked fine. Here is an example of a simple insertMany operation using a generator and the co module:
var MongoClient = require('mongodb').MongoClient,
    co = require('co'),
    test = require('assert');

co(function*() {
    var db = yield MongoClient.connect('mongodb://localhost:27017/test');
    // Get the collection
    var col = db.collection('insert_many_with_generators');
    var r = yield col.insertMany([{a:1}, {a:2}]);
    test.equal(2, r.insertedCount);
    // Finish up test
    db.close();
});
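The same idea addresses the memory problem in the question: yield makes each batch finish before the next one is built, so documents never pile up in RAM faster than the driver can write them. Below is a minimal sketch (not tested against the original setup, and not the original code) applying it to the question's loop; url, getMessage() and generateChatMessage() are the question's own placeholders, and the 1000-document batch size is kept.
var MongoClient = require('mongodb').MongoClient,
    co = require('co'),
    util = require('util');

co(function*() {
    var db = yield MongoClient.connect(url);
    var col = db.collection('chatmessage');
    for (var i = 0; i < 300; i++) {
        var messageNum = getMessage();
        var docs = [];
        for (var j = 0; j < messageNum; j++) {
            docs.push(generateChatMessage());
            if (docs.length >= 1000) {
                // Wait for this batch to be written before building the next one,
                // so memory use stays at roughly one batch of documents.
                yield col.insertMany(docs);
                docs = [];
            }
        }
        if (docs.length > 0) {
            yield col.insertMany(docs);
        }
        console.log("chatroom " + i + " " + util.inspect(process.memoryUsage()));
    }
    db.close();
}).catch(function(err) {
    console.log(err);
});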

Related

Opening Maxmind db in Nodejs

I am trying to open the MaxMind open-source database in my Node.js application. My application receives a list of IP addresses from a Java application and then returns the latitude and longitude corresponding to each IP. I have successfully done this synchronously, but I want to do it asynchronously to make things a little faster. I have written code for this, but the application gets killed every time. I am guessing that the reason might be the simultaneous opening of the same database (I might be wrong :D). I am posting the code below. Please take a look at it and make some suggestions on where I am going wrong. Thanks!!!
app.post('/endPoint', function(req, res) {
    var obj = req.body;
    var list = [];
    var ipList = obj.ipList;
    for (var i = 0; i < ipList.length; i++) {
        var ip = ipList[i];
        //console.log(i);
        maxmind.open('./GeoLite2-City.mmdb', function(err, cityLookup) {
            if (err) throw err;
            console.log("open database");
            var city = cityLookup.get(ip);
            if (city != null) {
                var cordinates = {'latitude': city.location.latitude, 'longitude': city.location.longitude};
                //console.log(cordinates);
                list.push(cordinates);
            }
            if (list.length == ipList.length) {
                res.json({finalMap: list});
            }
        });
    }
});
You should open the database only once, and reuse it.
The easiest solution would be to synchronously open the database at the top of your file:
const maxmind = require('maxmind');
const cityLookup = maxmind.openSync('./GeoLite2-City.mmdb');
Reading it asynchronously wouldn't speed things up a whole lot, and because loading the database is done only once (during app startup), I don't think it's a big deal that it may temporarily block the event loop for a few seconds.
And use the cityLookup function in your request handler:
app.post('/endPoint', function(req, res) {
    ...
    let city = cityLookup.get(ip);
    ...
});
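For completeness, here is a minimal sketch of the whole handler with the database preloaded. This is an illustration only, reusing the question's file path, route, and latitude/longitude extraction; since the lookup is synchronous once the database is in memory, the loop needs no callbacks and the response can be sent at the end.
const maxmind = require('maxmind');
const cityLookup = maxmind.openSync('./GeoLite2-City.mmdb'); // load once at startup

app.post('/endPoint', function(req, res) {
    const list = [];
    for (const ip of req.body.ipList) {
        const city = cityLookup.get(ip); // synchronous lookup
        if (city != null) {
            list.push({
                latitude: city.location.latitude,
                longitude: city.location.longitude
            });
        }
    }
    res.json({finalMap: list});
});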

Nodejs, not waiting for Redis query to complete before continuing with execution

Using Node.js, I need to load three files dynamically with a require() function by fetching the file path from Cassandra. From each file I need to fetch data that is in Redis and do some validation before loading the next file from Cassandra. The issue is that before the validation logic executes and provides a result, the next file starts loading in parallel. The validation result comes after the second file has loaded, which shouldn't happen: the second file should load only after the first file's validation is complete, and only if the validation result is a success. Please help me: how do I pause or wait for Redis to complete the query in Node.js?
node.js
"use strict";
var express = require('express');
var cassandra = require('cassandra-driver');
var app = express();
var Promise = require('bluebird');
var redis = Promise.promisifyAll(require('redis'));
var redisClient = redis.createClient(6379, '127.0.0.1');
var client = new cassandra.Client({contactPoints: ['127.0.0.1'], keyspace: 'poc'});
client.execute("SELECT file FROM testqry1", function (err, result) {
if (!err){
if ( result.rows.length > 0 ) {
for(var i=0; i< result.rows.length; i++){
var filePath=result.rows[i].get('file');
var newdat=Promise.promisifyAll(require(filePath));
var res = newdat(redisClient);
console.log('res:::'+res);
if (res=='failed'){
return;
}
}
} else {
console.log("No results");
}
}
});
file1.js
var crypto = require('crypto');
var redisValue = '';

module.exports = function(redisclient) {
    redisclient.hmgetAsync("testdata", "text1").then(function(redisValue) {
        console.log('value from redis::' + redisValue);
    }).then(function() {
        var hashedUserID = crypto.createHmac('sha256', 'sample')
            .update('helloworld')
            .digest('hex');
        function disp(value) {
            console.log('value::' + value);
        }
        disp(hashedUserID);
        console.log('redisValue::' + redisValue);
        if (hashedUserID == 'e043e7e68058c8a4cd686db38f01771bd7a04b8bb9a658d3cb40d0be45935094') {
            redata = 'true';
        } else {
            redata = 'false';
        }
        console.log('redata::' + redata);
    });
};
file2.js & file3.js have the same content:
var result1 = '';

module.exports = function(redisclient) {
    redisclient.hmget("testdata", "text1", function(err, redisValue) {
        console.log('redisValue2 == %s', redisValue);
        if (redisValue == 'test value') {
            result1 = "success";
        } else {
            result1 = "failed";
        }
    });
    return result1;
};
Output :
res:::undefined
res:::
res:::
value from redis::test data here
value::e043e7e68058c8a4cd686db38f01771bd7a04b8bb9a658d3cb40d0be45935094
redisValue::
redata::true
redisValue2 == test data here
redisValue3 == hello world test data
You say that file2/3 are "same content" but they aren't in one critical area. Per Bluebird's documentation for promisifyAll (see http://bluebirdjs.com/docs/api/promise.promisifyall.html), this feature creates an ...Async version of each core function in the Redis client. You call hmgetAsync in your first case, but you only call hmget in your others.
This is important because you're using an async pattern but with a non-async code structure. In file2/3 you set result1 inside an async callback, but then return it below each call before the call could possibly have returned.
You have two choices:
1: You can convert file2/3/etc to a fully traditional pattern by passing in a callback in addition to the redis client:
module.exports = function(redisclient, callback){
Instead of returning result1, you would then call the callback with this value:
if (redisValue == 'test value') {
    callback(null, "success");
} else {
    callback("failed", null);
}
2: You could convert file2/3/..N to be Promise-based, in which case you do not need to promisifyAll(require(...)) them - you can simply require() them. Such a pattern might look like:
module.exports = function(redisclient) {
    return redisclient.hmgetAsync("testdata", "text1");
};
This is a much simpler and cleaner option, and if you keep going with it you can see that you could probably even eliminate the require() and simply do the hmgetAsync in file1 with appropriate data returned by Cassandra. But it's hard to know without seeing your specific application needs. In any event, Promise-based patterns are generally much shorter and cleaner, but not always better - there IS a moderate performance overhead for using them. It's your call which way you go - either will work.
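As an illustration of option 2 (a sketch only, dropped into the question's main file, which already defines client, redisClient, and Bluebird's Promise; it assumes every required module returns a promise as shown above, and reuses the question's 'test value' check), Bluebird's Promise.each can then drive the files one at a time and stop on the first failure:
client.execute("SELECT file FROM testqry1", function (err, result) {
    if (err) return console.error(err);
    if (result.rows.length === 0) return console.log("No results");

    // Run the validations serially; Promise.each waits for the promise
    // returned by each module before loading the next file.
    Promise.each(result.rows, function (row) {
        var check = require(row.get('file'));
        return check(redisClient).then(function (redisValue) {
            if (redisValue != 'test value') {
                throw new Error('failed'); // stops the remaining files
            }
        });
    }).then(function () {
        console.log('all validations passed');
    }).catch(function (err) {
        console.log('validation failed: ' + err.message);
    });
});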

Azure documentdb bulk insert using stored procedure

Hi, I am using 16 collections to insert around 3-4 million JSON objects, ranging from 5-10k per object. I am using a stored procedure to insert these documents. I have 22 Capacity Units.
function bulkImport(docs) {
    var collection = getContext().getCollection();
    var collectionLink = collection.getSelfLink();
    // The count of imported docs, also used as current doc index.
    var count = 0;
    // Validate input.
    if (!docs) throw new Error("The array is undefined or null.");
    var docsLength = docs.length;
    if (docsLength == 0) {
        getContext().getResponse().setBody(0);
    }
    // Call the CRUD API to create a document.
    tryCreateOrUpdate(docs[count], callback);
    // Note that there are 2 exit conditions:
    // 1) The createDocument request was not accepted.
    //    In this case the callback will not be called, we just call setBody and we are done.
    // 2) The callback was called docs.length times.
    //    In this case all documents were created and we don't need to call tryCreate anymore. Just call setBody and we are done.
    function tryCreateOrUpdate(doc, callback) {
        var isAccepted = true;
        var isFound = collection.queryDocuments(collectionLink, 'SELECT * FROM root r WHERE r.id = "' + doc.id + '"', function (err, feed, options) {
            if (err) throw err;
            if (!feed || !feed.length) {
                isAccepted = collection.createDocument(collectionLink, doc, callback);
            } else {
                // The metadata document.
                var existingDoc = feed[0];
                isAccepted = collection.replaceDocument(existingDoc._self, doc, callback);
            }
        });
        // If the request was accepted, callback will be called.
        // Otherwise report current count back to the client,
        // which will call the script again with remaining set of docs.
        // This condition will happen when this stored procedure has been running too long
        // and is about to get cancelled by the server. This will allow the calling client
        // to resume this batch from the point we got to before isAccepted was set to false.
        if (!isFound && !isAccepted) getContext().getResponse().setBody(count);
    }
    // This is called when collection.createDocument is done and the document has been persisted.
    function callback(err, doc, options) {
        if (err) throw err;
        // One more document has been inserted, increment the count.
        count++;
        if (count >= docsLength) {
            // If we have created all documents, we are done. Just set the response.
            getContext().getResponse().setBody(count);
        } else {
            // Create next document.
            tryCreateOrUpdate(docs[count], callback);
        }
    }
}
My C# code looks like this:
public async Task<int> Add(List<JobDTO> entities)
{
    int currentCount = 0;
    int documentCount = entities.Count;

    while (currentCount < documentCount)
    {
        string argsJson = JsonConvert.SerializeObject(entities.Skip(currentCount).ToArray());
        var args = new dynamic[] { JsonConvert.DeserializeObject<dynamic[]>(argsJson) };

        // 6. execute the batch.
        StoredProcedureResponse<int> scriptResult = await DocumentDBRepository.Client.ExecuteStoredProcedureAsync<int>(sproc.SelfLink, args);

        // 7. Prepare for next batch.
        int currentlyInserted = scriptResult.Response;
        currentCount += currentlyInserted;
    }

    return currentCount;
}
The problem I am facing is that, out of the 400k documents I try to insert, at times some documents get missed without any error being raised.
The application is a worker role deployed to the cloud.
If I increase the number of threads or instances inserting into DocumentDB, the number of missed documents is much higher.
How can I figure out what the problem is? Thanks in advance.
I found that when trying this code I would get an error at docs.length which stated that length was undefined.
function bulkImport(docs) {
    var collection = getContext().getCollection();
    var collectionLink = collection.getSelfLink();
    // The count of imported docs, also used as current doc index.
    var count = 0;
    // Validate input.
    if (!docs) throw new Error("The array is undefined or null.");
    var docsLength = docs.length; // length is undefined
}
After many tests (I could not find anything in the Azure documentation) I realized that I could not pass an array as was suggested; the parameter had to be an object. I had to modify the batch code like this in order for it to run.
I also found I could not simply pass an array of documents in the DocumentDB Script Explorer (Input box) either, even though the placeholder help text says you can.
This code worked for me:
// pseudo object for reference only
docObject = {
    "items": [{doc}, {doc}, {doc}]
}

function bulkImport(docObject) {
    var context = getContext();
    var collection = context.getCollection();
    var collectionLink = collection.getSelfLink();
    var count = 0;

    // Check input
    if (!docObject.items || !docObject.items.length) throw new Error("invalid document input parameter or undefined.");
    var docs = docObject.items;
    var docsLength = docs.length;
    if (docsLength == 0) {
        context.getResponse().setBody(0);
    }

    // Call the function to create a document.
    tryCreateOrUpdate(docs[count], callback);

    // Obviously I have truncated this function. The above code should help you understand what has to change.
}
Hopefully Azure documentation will catch up or become easier to find if I missed it.
I'll also be placing a bug report for the Script Explorer in hopes that the Azurites will update.
It's important to note that stored procedures have bounded execution, in which all operations must complete within the server-specified request timeout duration. If an operation does not complete within that time limit, the transaction is automatically rolled back. In order to simplify development around time limits, all CRUD (Create, Read, Update, and Delete) operations return a Boolean value that represents whether the operation will complete. This Boolean value can be used as a signal to wrap up execution and to implement a continuation-based model to resume execution (this is illustrated in our code samples below).
The bulk-insert stored procedure provided above implements the continuation model by returning the number of documents successfully created. This is noted in the stored procedure's comments:
// If the request was accepted, callback will be called.
// Otherwise report current count back to the client,
// which will call the script again with remaining set of docs.
// This condition will happen when this stored procedure has been running too long
// and is about to get cancelled by the server. This will allow the calling client
// to resume this batch from the point we got to before isAccepted was set to false
if (!isFound && !isAccepted) getContext().getResponse().setBody(count);
If the output document count is less than the input document count, you will need to re-run the stored procedure with the remaining set of documents.
Since May 2018 there is a new Batch SDK for Cosmos DB. There is a GitHub repo to get you started.
I have been able to import 100,000 records in 9 seconds, and using Azure Batch to fan out the inserts, I have done 19 million records in 1m15s. This was on a 1.66 million RU/s collection, which you can obviously scale down after the import.

NodeJS with arangojs and sync: Everything after .sync() ignored?

I want to use NodeJS to read 60k records from a MySQL database and write them to a ArangoDB database. I will later use ArangoDB's aggregation features etc. to process my dataset.
Coming from PHP, where a script usually runs synchronously, and because I believe it makes sense here, my initial (naive) attempt was to make my Node.js script run synchronously too. However, it doesn't work as expected:
I print to the console, call a function via .sync() to connect to the ArangoDB server and print all existing databases, then print to the console again. But everything below the sync call to my ArangoDB function is completely ignored (it does not print to the console again, nor does it seem to execute anything else).
What am I overlooking? Does .done() in the function called via .sync() cause trouble?
var mysql = require('node-mysql');
var arango = require('arangojs');
//var sync = require('node-sync'); // Wrong one!
var sync = require('sync');

function test_arango_query() {
    var db = arango.Connection("http://localhost:8529");
    db.database.list().done(function(res) {
        console.log("Databases: %j", res);
    });
    return "something?";
}

sync(function() {
    console.log("sync?");
    var result = test_arango_query.sync();
    console.log("done."); // DOES NOT PRINT, NEVER EXECUTED?!
    return result;
}, function(err, result) {
    if (err) console.error(err);
    console.log(result);
});
Your function test_arango_query doesn't use a callback. sync only works with functions that take a callback: it needs to know when the data is ready in order to return it from .sync(). If your function never calls the callback, sync can never return a result.
Update your function to call a callback function when you want it to return:
function test_arango_query(callback) {
    var db = arango.Connection("http://localhost:8529");
    db.database.list().done(function(res) {
        console.log("Databases: %j", res);
        callback(null, 'something'); // node-style callback: (err, result)
    });
}
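With the callback in place, the sync() wrapper from the question resumes as expected. This is just the question's own calling code again, annotated with what now happens (assuming the corrected function above):
sync(function() {
    console.log("sync?");
    var result = test_arango_query.sync(); // blocks this fiber until the callback fires
    console.log("done.");                  // now prints
    return result;
}, function(err, result) {
    if (err) console.error(err);
    console.log(result);                   // logs 'something'
});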

Node ram usage keep increasing

I have a page built with Node that receives 15 requests/second.
And my function is like this:
var somepage = function(req, res) {
    res.send(200);
    call_mongo_to_save_some_data(req.somedata);
};

var call_mongo_to_save_some_data = function(data) {
    var needToSave = {};
    needToSave.val1 = data.val1;
    needToSave.val2 = data.val2;
    needToSave.val3 = data.val3;
    needToSave.val4 = data.val4;
    needToSave.val5 = data.val5;
    var db = mongoskin();
    db.collection.insert(needToSave).success(function() {
        db.close();
    }).fail(function(err) { throw err; });
};
So you can see I do something after I send the response. I do this to reduce the response time, so the client isn't waiting while I save something in Mongo.
But after I launched the page, I found that the RAM usage keeps increasing. I did some research saying that res.write clears the output buffer, and in my code I do something after res.write (res.send), so I'm not sure whether that is the reason or whether it's some other issue.
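For what it's worth, the snippet above opens a new mongoskin connection on every request and only closes it when the insert succeeds. A common pattern is to open one connection at startup and reuse it for every request, and to log rather than throw inside the insert callback. The sketch below shows that pattern with the plain mongodb driver rather than mongoskin; the URL, the collection name, and the assumption that connection churn is what is holding the memory are mine, not from the question.
var MongoClient = require('mongodb').MongoClient;
var db = null;

// Open a single connection at startup and reuse it for every request.
MongoClient.connect('mongodb://localhost:27017/mydb', function(err, database) {
    if (err) throw err;
    db = database;
});

var somepage = function(req, res) {
    res.send(200);
    if (!db) return; // connection not ready yet
    db.collection('somedata').insertOne({
        val1: req.somedata.val1,
        val2: req.somedata.val2,
        val3: req.somedata.val3
    }, function(err) {
        if (err) console.error(err); // don't throw from an async callback
    });
};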
