NodeJS Out of memory, using for loop with asynchronous function - node.js

exports.updateFullCentralRecordSheet = function (req, _id, type) {
    FullCentralRecordSheet.remove({_ExternalParty: _id, centralRecordType: type, centralSheetType: "Central Sheet"}, function (err) {
        if (err) {
            saveErrorLog(req, err);
        }
        let query = {"structure.externalPartyRelationships": {$elemMatch: {_ExternalParty: _id}}, disabled: {$mod: [2, 0]}, initialized: true, profitLossType: type};
        let fullCentralRecordSheetObjects = [];
        ProfitLossSheet.find(query).sort({profitLossDate: 1}).lean().exec(function (err, profitLossSheetObjects) {
            if (err) {
                saveErrorLog(req, err);
            }
            async.each(profitLossSheetObjects, function (profitLossSheetObject, callback) {
                /// HEAVY COMPUTATION HERE
                callback();
            }, function (err) {
                if (err) {
                    saveErrorLog(req, err);
                } else {
                    query = {centralRecordMode: {$in: ["Payment In", "Payment Out", "Transfer", "General Out"]}, disabled: {$mod: [2, 0]}, centralRecordType: {$in: ["Split", type]}, _ExternalParty: _id, status: {$ne: "Reject"}};
                    CentralRecordSheet.find(query).lean().exec(function (err, centralRecordSheetObjects) {
                        if (err) {
                            saveErrorLog(req, err);
                        }
                        _.each(centralRecordSheetObjects, function (centralRecordSheetObject) {
                            // SOME MORE PROCESSING
                        });
                        fullCentralRecordSheetObjects = _.sortBy(fullCentralRecordSheetObjects, function (fullCentralRecordSheetObject) {
                            return new Date(fullCentralRecordSheetObject.centralRecordDate).getTime();
                        });
                        let runningBalance = 0;
                        _.each(fullCentralRecordSheetObjects, function (fullCentralRecordSheetObject) {
                            runningBalance = runningBalance - fullCentralRecordSheetObject.paymentIn.total + fullCentralRecordSheetObject.paymentOut.total + fullCentralRecordSheetObject.moneyIn.total - fullCentralRecordSheetObject.moneyOut.total + fullCentralRecordSheetObject.transferIn.total - fullCentralRecordSheetObject.transferOut.total;
                            fullCentralRecordSheetObject.balance = runningBalance;
                            const newFullCentralSheetRecordObject = new FullCentralRecordSheet(fullCentralRecordSheetObject);
                            newFullCentralSheetRecordObject.save(); // Asynchronous save
                        });
                    });
                }
            });
        });
    });
};
This is my code to process some data and save it to the database. As you can see, there is some computation involved in each async iteration, and after the loop there is final processing of the data. It works fine if I pass in one _id at a time. However, when I try to do the task like this:
exports.refreshFullCentralRecordSheetObjects = function (req, next) {
    ExternalParty.find().exec(function (err, externalPartyObjects) {
        if (err) {
            utils.saveErrorLog(req, err);
            return next(err, null, [req.__(err.message)], []);
        }
        _.each(externalPartyObjects, function (externalPartyObject) {
            updateFullCentralRecordSheet(req, externalPartyObject._id, "Malay");
            updateFullCentralRecordSheet(req, externalPartyObject._id, "Thai");
        });
        return next(err, null, ["Ddd"], ["Ddd"]);
    });
};
I have about 273 objects to loop through. This causes a fatal out-of-memory error. I tried increasing --max-old-space-size=16000 but it still crashes. I used Task Manager to track the memory of the node.exe process and it goes over 8 GB.
I am not sure why increasing the limit to 16 GB does not help; it still crashes at around 8 GB (according to Task Manager). Another thing: when I only process 10 records instead of 273, Task Manager reports that about 500 MB is in use. This 500 MB does not disappear unless I make another request to the server. I find this very odd: why doesn't Node.js garbage-collect after it is done processing the 10 records? Those 10 records were processed and saved to the database successfully, yet the memory usage remains unchanged in Task Manager.
I tried using async.forEachLimit, turning my update function into an asynchronous one, and playing around with process.nextTick(), but I still get the fatal memory error. What can I do to make sure this runs?

Another thing is when I try to only process 10 records instead of 273,
task manager report that it is using about 500 MB. This 500 MB will
not disappear unless I make another request to the server. I find this
very odd because why isn't NodeJS garbage collect after it is done
with processing 10 records? Those 10 records successfully processed
and saved to database however the memory usage remain unchanged in
task manager.
That's normal; Node's GC is lazy (a GC pass is a synchronous operation that blocks the event loop, so deferring it is a good thing).
Try paginating the query?
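For example, a minimal sketch of that idea, assuming updateFullCentralRecordSheet is refactored to accept a completion callback that it calls once its final save has finished (an assumption; the original has no callback), could process the parties one at a time instead of kicking off all 273 × 2 updates at once:
// sketch only: updateFullCentralRecordSheet(req, _id, type, done) is assumed
// to call done() once all of its saves have completed
exports.refreshFullCentralRecordSheetObjects = function (req, next) {
    ExternalParty.find().lean().exec(function (err, externalPartyObjects) {
        if (err) {
            utils.saveErrorLog(req, err);
            return next(err, null, [req.__(err.message)], []);
        }
        // process one party (and one type) at a time so only one batch of
        // documents is held in memory at any moment
        async.eachSeries(externalPartyObjects, function (externalPartyObject, done) {
            updateFullCentralRecordSheet(req, externalPartyObject._id, "Malay", function () {
                updateFullCentralRecordSheet(req, externalPartyObject._id, "Thai", done);
            });
        }, function (err) {
            return next(err, null, ["Ddd"], ["Ddd"]);
        });
    });
};
This bounds the working set to one party's documents at a time, at the cost of running the updates sequentially.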

Related

Make multi threaded 1 million inserts to MongoDb using Node JS scripts

I have an isolated sync server that pulls a tab-delimited text file from an external FTP server and updates (saves) it to MongoDB after processing.
My code looks like this:
// this function pulls the file from the external ftp server
async function upstreamFile() {
    try {
        let pythonProcess = spawn('python3', [configVar.ftpInbound, '/outbound/Items.txt', configVar.dataFiles.items], {encoding: 'utf8'});
        logger.info('FTP SERVER LOGS...' + '\n' + pythonProcess.stdout);
        await readItemFile();
        logger.info('The process of file is done');
        process.exit();
    } catch (upstreamError) {
        logger.error(upstreamError);
        process.exit();
    }
}
// this function connects to the db and calls the processing function for each row in the text file
async function readItemFile() {
    try {
        logger.info('Reading Items File');
        let dataArray = fs.readFileSync(configVar.dataFiles.items, 'utf8').toString().split('\n');
        logger.info('No of Rows Read', dataArray.length);
        await dbConnect.connectToDB(configVar.db);
        logger.info('Connected to Database', configVar.db);
        while (dataArray.length) {
            await Promise.all(dataArray.splice(0, 5000).map(async (f) => {
                let splitValues = f.split('|');
                await processItemsFile(splitValues);
            }));
            logger.info("Current batch finished processing");
        }
        logger.info("ALL batch finished processing");
    } catch (PromiseError) {
        logger.error(PromiseError);
    }
}
async function processItemsFile(splitValues) {
    try {
        // Processing of the file is done here and I am using 'save' in mongoose to write to the db
        // data is cleaned and assigned to respective fields
        if (!exists) {
            let processedValues = new Products(assignedValues);
            let productDetails = await processedValues.save();
        }
        return;
    } catch (error) {
        throw error;
    }
}

upstreamFile();
This takes about 3 hours to process 100,000 rows and update them in the database.
Is there any way I can speed this up? I am very much limited by the hardware: an EC2 Linux instance with 2 cores and 4 GB of RAM.
Should I use worker threads (for example via microjob) to run this on multiple threads? If yes, how would I go about doing it?
Or is this the maximum performance?
Note: I can't do a bulk update in MongoDB because I rely on Mongoose pre hooks that are triggered on save.
You can always try a bulk write using updateOne operations.
I would also consider using a read stream (fs.createReadStream) instead of readFileSync.
With that event-driven approach you could push, let's say, every 100k updates into an array chunk and bulk-update each chunk at once.
You can register a pre('updateOne') hook (instead of a pre('save') hook) for this operation.
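If your cleaning/validation currently lives in a pre('save') hook, a minimal sketch of the equivalent query hook might look like this (the schema name productSchema is an assumption, the question doesn't show it; also note that Model.bulkWrite() bypasses Mongoose middleware, so this only applies if your write path goes through Model.updateOne() queries):
// sketch only, assuming Mongoose 5+ query middleware
productSchema.pre('updateOne', function (next) {
    // `this` is the Query here, not the document
    const update = this.getUpdate();
    // ...clean/normalize fields on `update`, as the pre('save') hook did...
    next();
});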
I have solved a similar problem (updating 100k CSV rows) with the following solution:
Create a read stream with fs.createReadStream (thanks to that, your application won't consume much heap memory even in the case of huge files).
I'm using the csv-parser npm library to deconstruct the CSV file into separate rows of data:
const fs = require('fs');
const csv = require('csv-parser');

let updates = [];

fs.createReadStream('/filePath')
    .pipe(csv())
    .on('data', row => {
        // ...do anything with the data
        updates.push({
            updateOne: {
                filter: { /* here put the query */ },
                update: [ /* any data you want to update */ ],
                upsert: true /* in my case I want to create the record if it does not exist */
            }
        });
    })
    .on('end', async () => {
        await MyCollection.bulkWrite(updates)
            .catch(err => {
                logger.error(err);
            });
        updates = []; // I just clean up the huge array
    });

I'm trying to add elements to an array in a nested query in mongoose using Node.js

I'm getting an empty array at the end of async.waterfall. I'm not sure why, but this is what my code looks like:
exports.GetJobs = function (req, res) {
    var Jobs = []; ///// Jobs is a global variable
    async.waterfall([
        function (next) {
            // get my alert
            UserAlertDB.find({User: req.user.id}, function (err, AlertResult) {
                next(null, AlertResult);
            });
        },
        function (AlertResult, next) {
            // You might get an error if you have not created an alert, so AlertResult[0].Words will not exist
            if (AlertResult) { // if AlertResult is not null then query by alert
                JobDB.find({title: new RegExp(AlertResult[0].Words, 'i')}, function (err, JobResults) {
                    if (err) console.log(err);
                    // If the job matches the requirements for the alert then push it to the list
                    JobResults.forEach(function (job) {
                        JobOffer.find({JobID: job._id, JobOfferOwnerID: req.user.id}, function (err, Offers) {
                            if (err) console.log("Error Inside Querying Jobs Result for Alert " + err);
                            if (Offers.length == 0) {
                                console.log("Jobs are : " + JSON.stringify(Jobs)); // when I print the Jobs array here it shows that a job is getting pushed into the array
                                Jobs.push(job);
                            }
                        });
                    });
                    next(err, Jobs); // But Jobs here is empty
                });
            } else {
                next("There is an error", null);
            }
        }
    ], function (err, Jobs) {
        console.log(JSON.stringify(Jobs)); ////// Getting empty Jobs here
        if (err) console.log("Error Inside Get Jobs Match Alert Data in Server : " + err);
        res.json(Jobs); ////// Jobs here is empty
    });
};
As you can see, when I try to send the Jobs array at the end, res.json(Jobs) sends an empty array, even though I have pushed those jobs into the Jobs array.
The problem in your code is that JobResults.forEach is synchronous and you are calling the asynchronous JobOffer.find inside the forEach loop. Therefore, your program doesn't wait for the asynchronous operations to finish and calls next(err, Jobs) immediately. Instead of forEach, use async.each and call next(err, Jobs) only when async.each is finished, as sketched below. I would also advise making sure you check the err value in every callback; for example, here you are passing null even though there might be an error:
// get my alert
UserAlertDB.find({User: req.user.id}, function (err, AlertResult) {
    next(err, AlertResult);
});
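A minimal sketch of the second waterfall step rewritten with async.each (same model and variable names as in the question) might look like this:
function (AlertResult, next) {
    if (!AlertResult || !AlertResult.length) {
        return next("There is an error", null);
    }
    JobDB.find({title: new RegExp(AlertResult[0].Words, 'i')}, function (err, JobResults) {
        if (err) return next(err, null);
        async.each(JobResults, function (job, done) {
            JobOffer.find({JobID: job._id, JobOfferOwnerID: req.user.id}, function (err, Offers) {
                if (err) return done(err);
                if (Offers.length === 0) {
                    Jobs.push(job);
                }
                done();
            });
        }, function (err) {
            next(err, Jobs); // runs only after every JobOffer.find has finished
        });
    });
}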
Hope it helps, let me know if you need any other help with your code.

Node+MongoDB: coll.find().toArray(cb) works for collection A, but never fires cb for collection B?

UPDATE: I've narrowed this down to what appears to be a different issue, and as such have asked a separate question here.
=======
I have a mongoDB instance running on localhost with two collections, "mydocs" (which has ~12,000 documents in it) and "mydoctypes" (which has only 7 documents in it).
I have a standalone NodeJS script which gets a connection to the database and then fires off the following:
myDb.collection('mydoctypes').find().toArray(function(err, results) {
    console.log("Got results.");
    if (err) {
        console.log("err: " + err);
    } else {
        console.log("Got doctypes: " + results.length);
    }
});
The output of that script is:
Got results.
Got doctypes: 7
If I modify the same script to access the 'mydocs' collection instead:
myDb.collection('mydocs').find().toArray(function(err, results) {
    console.log("Got results.");
    if (err) {
        console.log("err: " + err);
    } else {
        console.log("Got docs: " + results.length);
    }
});
I get no output at all. The callback, apparently, never gets fired.
== UPDATE ==
So it looks like the problem was likely too many documents causing toArray() to run out of RAM.
Now, I'm using .each() to iterate, but having a different issue: each() is only running through the first batch (whatever I set batchSize to), and never loading any more documents. The code is this:
myDb.collection('mydocs').find().batchSize(50).each(function(err, item) {
    if (item != null) {
        process.stdout.write(".");
    }
});
Indeed, as noted in the comments, the default MongoDB driver for Node.js returns a cursor. By default a cursor fetches batches of about 101 documents or roughly 1 MB; you can modify this number using the batchSize function. But in order to iterate over your collection you should stream it, as follows:
MongoClient.connect('mongodb://localhost:27017/mydb', function(err, db) {
    var cursor = db.collection('mycollection').find();
    cursor.forEach(
        function(doc) {
            console.log(doc);
        },
        function(err) {
            if (err) {
                console.error(err);
            } else {
                // cursor is exhausted, no more docs to iterate over, so exit
                return db.close();
            }
        });
});
The forEach method applied to the cursor is not the default JavaScript one from Arrays; it takes two callbacks: a cb(doc) that is invoked for each document, and a second cb(err) that catches an error or fires once the cursor is exhausted.
You can use a projection to lower the amount of data returned, e.g. cursor.project({title: 1, name: 1}), and this will significantly reduce the amount of RAM consumed.
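Putting batchSize and the projection together, a minimal sketch against the same collection (assuming the 2.x driver API used above) could look like this:
MongoClient.connect('mongodb://localhost:27017/mydb', function(err, db) {
    if (err) return console.error(err);
    var cursor = db.collection('mydocs')
        .find()
        .project({title: 1, name: 1}) // fetch only the fields you need
        .batchSize(50);               // pull documents in small batches
    cursor.forEach(
        function(doc) {
            process.stdout.write(".");
        },
        function(err) {
            if (err) console.error(err);
            db.close(); // cursor exhausted (or errored), close the connection
        });
});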

how to make this function async in node.js

Here is the situation:
I am new to Node.js. I have a 40 MB file containing a multilevel JSON structure like:
[{},{},{}] — an array of ~7000 objects. Each object has properties, and one of those properties is itself an array of objects.
I wrote a function to read the contents of the file and iterate over them. I succeeded in getting what I wanted in terms of content, but not usability. I thought I had written an async function that would allow Node to serve other web requests while iterating over the array, but that is not the case. I would be very thankful if anyone can point out what I've done wrong and how to rewrite it so I can have a non-blocking iteration. Here's the function that handles the situation:
function getContents(callback) {
    fs.readFile(file, 'utf8', function (err, data) {
        if (err) {
            console.log('Error: ' + err);
            return;
        }
        js = JSON.parse(data);
        callback();
        return;
    });
}

getContents(iterateGlobalArr);

var count = 0;

function iterateGlobalArr() {
    if (count < js.length) {
        innerArr = js.nestedProp;
        //iterate nutrients
        innerArr.forEach(function(e, index) {
            //some simple if condition here
        });
        var schema = {
            //.....get props from forEach iteration
        };
        Model.create(schema, function(err, post) {
            if (err) {
                console.log('\ncreation error\n', err);
                return;
            }
            if (!post) {
                console.log('\nfailed to create post for schema:\n' + schema);
                return;
            }
        });
        count++;
        process.nextTick(iterateGlobalArr);
    }
    else {
        console.log("\nIteration finished");
        next();
    }
}
Just so it is clear how I've tested the above: I open two tabs, one loading this iteration (which takes some time) and a second one hitting another Node route, which does not load until the iteration is over. So essentially I've written blocking code, but I'm not sure how to refactor it. I suspect that because everything is happening in the callback, I am unable to release the event loop to handle another request...
Your code is almost correct. What you are doing is inadvertently adding ALL the items to the very next tick... which still blocks.
The important piece of code is here:
Model.create(schema, function(err, post) {
    if (err) {
        console.log('\ncreation error\n', err);
        return;
    }
    if (!post) {
        console.log('\nfailed to create post for schema:\n' + schema);
        return;
    }
});
// add EVERYTHING to the very same next tick!
count++;
process.nextTick(iterateGlobalArr);
Let's say you are in tick A of the event loop when getContents() runs and count is 0. You enter iterateGlobalArr and you call Model.create. Because Model.create is async, it is returning immediately, causing process.nextTick() to add processing of item 1 to the next tick, let's say B. Then it calls iterateGlobalArr, which does the same thing, adding item 2 to the next tick, which is still B. Then item 3, and so on.
What you need to do is move the count increment and process.nextTick() into the callback of Model.create(). This will make sure the current item is processed before nextTick is invoked... which means next item is actually added to the next tick AFTER the model item has been created... which will give your app time to handle other things in between. The fixed version of iterateGlobalArr is here:
function iterateGlobalArr() {
    if (count < js.length) {
        innerArr = js.nestedProp;
        //iterate nutrients
        innerArr.forEach(function(e, index) {
            //some simple if condition here
        });
        var schema = {
            //.....get props from forEach iteration
        };
        Model.create(schema, function(err, post) {
            // schedule our next item to be processed immediately.
            count++;
            process.nextTick(iterateGlobalArr);
            // then move on to handling this result.
            if (err) {
                console.log('\ncreation error\n', err);
                return;
            }
            if (!post) {
                console.log('\nfailed to create post for schema:\n' + schema);
                return;
            }
        });
    }
    else {
        console.log("\nIteration finished");
        next();
    }
}
Note also that I would strongly suggest passing your js and counter into each call to iterateGlobalArr, as it will make iterateGlobalArr a lot easier to debug, among other things, but that's another story.
Cheers!
Node is single-threaded, so async will only help you if you are relying on another system/subsystem to do the work (a shell script, an external database, a web service, etc.). If you have to do the work in Node, you are going to block while you do it.
It is possible to create one Node process per core. This would block only one of the Node processes and leave the rest to service your requests, but the cluster feature is still listed as experimental: http://nodejs.org/api/cluster.html
A single instance of Node runs in a single thread. To take advantage
of multi-core systems the user will sometimes want to launch a cluster
of Node processes to handle the load.
The cluster module allows you to easily create child processes that
all share server ports.
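A minimal sketch of that pattern (the HTTP handler here is just a placeholder, not something from the original question) looks like this:
const cluster = require('cluster');
const http = require('http');
const numCPUs = require('os').cpus().length;

if (cluster.isMaster) {
    // fork one worker per core; the master only supervises
    for (let i = 0; i < numCPUs; i++) {
        cluster.fork();
    }
    cluster.on('exit', function (worker) {
        console.log('worker ' + worker.process.pid + ' died');
    });
} else {
    // each worker runs the server; they all share port 8000
    http.createServer(function (req, res) {
        res.end('handled by pid ' + process.pid);
    }).listen(8000);
}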

Idiomatic way to wait for multiple callbacks in Node.js

Suppose you need to do some operations that depend on some temp file. Since
we're talking about Node here, those operations are obviously asynchronous.
What is the idiomatic way to wait for all operations to finish in order to
know when the temp file can be deleted?
Here is some code showing what I want to do:
do_something(tmp_file_name, function(err) {});
do_something_other(tmp_file_name, function(err) {});
fs.unlink(tmp_file_name);
But if I write it this way, the third call can be executed before the first two
get a chance to use the file. I need some way to guarantee that the first two
calls already finished (invoked their callbacks) before moving on without nesting
the calls (and making them synchronous in practice).
I thought about using event emitters on the callbacks and registering a counter
as receiver. The counter would receive the finished events and count how many
operations were still pending. When the last one finished, it would delete the
file. But there is the risk of a race condition and I'm not sure this is
usually how this stuff is done.
How do Node people solve this kind of problem?
Update:
Now I would advise having a look at:
Promises
The Promise object is used for deferred and asynchronous computations.
A Promise represents an operation that hasn't completed yet, but is
expected in the future.
A popular promise library is Bluebird. I would advise having a look at why promises.
You should use promises to turn this:
fs.readFile("file.json", function (err, val) {
if (err) {
console.error("unable to read file");
}
else {
try {
val = JSON.parse(val);
console.log(val.success);
}
catch (e) {
console.error("invalid json in file");
}
}
});
Into this:
fs.readFileAsync("file.json").then(JSON.parse).then(function (val) {
console.log(val.success);
})
.catch(SyntaxError, function (e) {
console.error("invalid json in file");
})
.catch(function (e) {
console.error("unable to read file");
});
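Applied to the original temp-file question, a minimal sketch using util.promisify (built into Node since v8, as an alternative to Bluebird's promisify helpers) and Promise.all might look like this, assuming do_something and do_something_other follow the usual error-first callback convention:
const util = require('util');
const fs = require('fs');

const doSomethingAsync = util.promisify(do_something);
const doSomethingOtherAsync = util.promisify(do_something_other);
const unlinkAsync = util.promisify(fs.unlink);

// run both operations in parallel, then delete the temp file
Promise.all([
    doSomethingAsync(tmp_file_name),
    doSomethingOtherAsync(tmp_file_name)
])
    .then(() => unlinkAsync(tmp_file_name))
    .catch((err) => console.error(err));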
generators: For example via co.
Generator based control flow goodness for nodejs and the browser,
using promises, letting you write non-blocking code in a nice-ish way.
var co = require('co');

co(function *() {
    // yield any promise
    var result = yield Promise.resolve(true);
}).catch(onerror);

co(function *() {
    // resolve multiple promises in parallel
    var a = Promise.resolve(1);
    var b = Promise.resolve(2);
    var c = Promise.resolve(3);
    var res = yield [a, b, c];
    console.log(res);
    // => [1, 2, 3]
}).catch(onerror);

// errors can be try/catched
co(function *() {
    try {
        yield Promise.reject(new Error('boom'));
    } catch (err) {
        console.error(err.message); // "boom"
    }
}).catch(onerror);

function onerror(err) {
    // log any uncaught errors
    // co will not throw any errors you do not handle!!!
    // HANDLE ALL YOUR ERRORS!!!
    console.error(err.stack);
}
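Applied to the same temp-file question, a sketch with co (again assuming promisified, error-first-callback versions of the two operations) could be:
const co = require('co');
const util = require('util');
const fs = require('fs');

co(function* () {
    const doSomethingAsync = util.promisify(do_something);
    const doSomethingOtherAsync = util.promisify(do_something_other);
    // yielding an array runs both promises in parallel
    yield [doSomethingAsync(tmp_file_name), doSomethingOtherAsync(tmp_file_name)];
    // both callbacks have fired, so the temp file can go
    yield util.promisify(fs.unlink)(tmp_file_name);
}).catch(function (err) {
    console.error(err.stack);
});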
If I understand correctly, I think you should have a look at the very good async library. You should especially look at async.series. Here is a copy of the snippets from its GitHub page:
async.series([
    function(callback) {
        // do some stuff ...
        callback(null, 'one');
    },
    function(callback) {
        // do some more stuff ...
        callback(null, 'two');
    }
],
// optional callback
function(err, results) {
    // results is now equal to ['one', 'two']
});

// an example using an object instead of an array
async.series({
    one: function(callback) {
        setTimeout(function() {
            callback(null, 1);
        }, 200);
    },
    two: function(callback) {
        setTimeout(function() {
            callback(null, 2);
        }, 100);
    }
},
function(err, results) {
    // results is now equal to: {one: 1, two: 2}
});
As a plus this library can also run in the browser.
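For the temp-file case specifically, async.parallel is arguably a closer fit than async.series, since the two operations are independent; a minimal sketch:
const async = require('async');
const fs = require('fs');

async.parallel([
    function (callback) { do_something(tmp_file_name, callback); },
    function (callback) { do_something_other(tmp_file_name, callback); }
], function (err) {
    if (err) return console.error(err);
    // both callbacks have completed, so the file is safe to delete
    fs.unlink(tmp_file_name, function (unlinkErr) {
        if (unlinkErr) console.error(unlinkErr);
    });
});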
The simplest way is to increment an integer counter when you start an async operation and then, in the callback, decrement the counter. Depending on the complexity, the callback could check the counter for zero and then delete the file.
A little more complex would be to maintain a list of objects, and each object would have any attributes that you need to identify the operation (it could even be the function call) as well as a status code. The callbacks would set the status code to completed.
Then you would have a loop that waits (using process.nextTick) and checks whether all tasks are completed. The advantage of this method over the counter is that, if it is possible for all outstanding tasks to complete before all tasks have been issued, the counter technique would cause you to delete the file prematurely.
// simple countdown latch
function CDL(countdown, completion) {
    this.signal = function() {
        if (--countdown < 1) completion();
    };
}

// usage
var latch = new CDL(10, function() {
    console.log("latch.signal() was called 10 times.");
});
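Tied back to the temp-file question, the latch above would be used roughly like this (error handling omitted for brevity):
// two async operations share one latch; the file is deleted only after
// both have signalled completion
var latch = new CDL(2, function() {
    fs.unlink(tmp_file_name);
});

do_something(tmp_file_name, function(err) {
    latch.signal();
});
do_something_other(tmp_file_name, function(err) {
    latch.signal();
});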
There is no "native" solution, but there are a million flow control libraries for node. You might like Step:
Step(
    function() {
        do_something(tmp_file_name, this.parallel());
        do_something_else(tmp_file_name, this.parallel());
    },
    function(err) {
        if (err) throw err;
        fs.unlink(tmp_file_name);
    }
);
Or, as Michael suggested, counters could be a simpler solution. Take a look at this semaphore mock-up. You'd use it like this:
do_something1(file, queue('myqueue'));
do_something2(file, queue('myqueue'));
queue.done('myqueue', function() {
    fs.unlink(file);
});
I'd like to offer another solution that utilizes the speed and efficiency of the programming paradigm at the very core of Node: events.
Everything you can do with Promises or modules designed to manage flow-control, like async, can be accomplished using events and a simple state-machine, which I believe offers a methodology that is, perhaps, easier to understand than other options.
For example, assume you wish to sum the lengths of multiple files in parallel:
const EventEmitter = require('events').EventEmitter;
const fs = require('fs');

// simple event-driven state machine
const sm = new EventEmitter();

// running state
let context = {
    tasks: 0,    // number of total tasks
    active: 0,   // number of active tasks
    results: []  // task results
};

const next = (result) => { // must be called when each task chain completes
    if (result) { // preserve result of task chain
        context.results.push(result);
    }
    // decrement the number of running tasks
    context.active -= 1;
    // when all tasks complete, trigger done state
    if (!context.active) {
        sm.emit('done');
    }
};

// operational states
// start state - initializes context
sm.on('start', (paths) => {
    const len = paths.length;
    console.log(`start: beginning processing of ${len} paths`);
    context.tasks = len;  // total number of tasks
    context.active = len; // number of active tasks
    sm.emit('forEachPath', paths); // go to next state
});

// start processing of each path
sm.on('forEachPath', (paths) => {
    console.log(`forEachPath: starting ${paths.length} process chains`);
    paths.forEach((path) => sm.emit('readPath', path));
});

// read contents from path
sm.on('readPath', (path) => {
    console.log(`  readPath: ${path}`);
    fs.readFile(path, (err, buf) => {
        if (err) {
            sm.emit('error', err);
            return;
        }
        sm.emit('processContent', buf.toString(), path);
    });
});

// compute length of path contents
sm.on('processContent', (str, path) => {
    console.log(`  processContent: ${path}`);
    next(str.length);
});

// when processing is complete
sm.on('done', () => {
    const total = context.results.reduce((sum, n) => sum + n);
    console.log(`The total of ${context.tasks} files is ${total}`);
});

// error state
sm.on('error', (err) => { throw err; });

// ======================================================
// start processing - ok, let's go
// ======================================================
sm.emit('start', ['file1', 'file2', 'file3', 'file4']);
Which will output:
start: beginning processing of 4 paths
forEachPath: starting 4 process chains
readPath: file1
readPath: file2
processContent: file1
readPath: file3
processContent: file2
processContent: file3
readPath: file4
processContent: file4
The total of 4 files is 4021
Note that the ordering of the process chain tasks is dependent upon system load.
You can envision the program flow as:
start -> forEachPath -+-> readPath1 -> processContent1 -+-> done
                      +-> readFile2 -> processContent2 -+
                      +-> readFile3 -> processContent3 -+
                      +-> readFile4 -> processContent4 -+
For reuse, it would be trivial to create a module to support the various flow-control patterns, i.e. series, parallel, batch, while, until, etc.
The simplest solution is to run the do_something* and unlink in sequence as follows:
do_something(tmp_file_name, function(err) {
    do_something_other(tmp_file_name, function(err) {
        fs.unlink(tmp_file_name);
    });
});
Unless, for performance reasons, you want to execute do_something() and do_something_other() in parallel, I suggest keeping it simple and going this way.
Wait.for https://github.com/luciotato/waitfor
using Wait.for:
var wait = require('wait.for');

// ...in a fiber...
wait.for(do_something, tmp_file_name);
wait.for(do_something_other, tmp_file_name);
fs.unlink(tmp_file_name);
With pure Promises it could be a bit more messy, but if you use Deferred Promises then it's not so bad:
Install:
npm install --save @bitbar/deferred-promise
Modify your code:
const DeferredPromise = require('@bitbar/deferred-promise');

const promises = [
    new DeferredPromise(),
    new DeferredPromise()
];

do_something(tmp_file_name, (err) => {
    if (err) {
        promises[0].reject(err);
    } else {
        promises[0].resolve();
    }
});

do_something_other(tmp_file_name, (err) => {
    if (err) {
        promises[1].reject(err);
    } else {
        promises[1].resolve();
    }
});

Promise.all(promises).then( () => {
    fs.unlink(tmp_file_name);
});
