I have some code in node, basically it is making api calls to external service and dump the returned data into a database. But it must have some serious memory leak since the node server will run out of memory in the middle. The AWS instance I am using is 2CPU, 4GB RAM. I spent a lot of time to figure out where is the leak with no luck yet. Below is the code, any hint will be helpful.
function refreshSitesBy5Min(rawData, callback){
var sites = JSON.parse(rawData).data;
if (typeof sites !== 'undefined' && sites.length > 0){
log.info('refreshing sites 5min');
sites.forEach(function(elem, index, array){
db.site_5min.find({siteid: elem.id, ts : moment(elem.ts).format('YYYY-MM-DDThh:mm:ss')}, function(err, found){
if (typeof found === 'undefined' || found == null || found.length == 0){
db.site_5min.save({
siteid : elem.id,
gran : '5min',
ts : moment(elem.ts).format('YYYY-MM-DDThh:mm:ss'),
wh_sum : elem.Wh_sum
}, function(err, inserted){
if (err){
log.error(err);
}
});
}
else{
db.site_5min.save({
id: found.id,
siteid : elem.id,
gran : '5min',
ts : moment(elem.ts).format('YYYY-MM-DDThh:mm:ss'),
wh_sum : elem.Wh_sum
}, function(err, updated){
if (err){
log.error(err);
}
})
}
})
})
}
else{
log.warn('no sites data');
}
callback();
}
and this is the code to call previous method:
function refreshSiteByGran(globalToken, gran, frequency){
log.info('refreshing site for ' + gran + ' table');
// db.site.find({}, function(err, sites){
db.run("select * from site", function(err, sites){
if (err){
log.error(err);
}
if (sites){
function handler(i){
if (i < sites.length){
var thePath = '/v3/sites/' + sites[i].siteid + '/data?fields=Wh_sum&tz=US/Pacific&gran=' + gran;
var end = moment().subtract(1, 'days').format('YYYY-MM-DDThh:mm:ss');
var start;
if (gran === '5min' || gran === 'hourly'){
start = moment(end).subtract(frequency, 'days').format('YYYY-MM-DDThh:mm:ss');
}
else if (gran === 'daily'){
start = moment(end).subtract(frequency, 'days').format('YYYY-MM-DDThh:mm:ss');
}
else if (gran === 'monthly'){
start = moment(end).subtract(frequency, 'months').format('YYYY-MM-DDThh:mm:ss');
}
thePath = thePath + '&start=' + start + '&end=' + end;
log.warn('thePath: ' + thePath);
var options = locusUtil.setOptions(thePath, globalToken.token.access_token);
request(options, function(err, result, body){
if (err){
log.error(err + ' path: ' + thePath);
}
if (body && JSON.parse(body).statusCode == 401){
getLocusToken(function(){
setTimeout(function(){
handler(i);
}, 2000);
})
}
else if (body && JSON.parse(body).statusCode == 200){
var data = JSON.parse(body).data;
// log.info('any data? ' + JSON.stringify(body, null, 4));
if (typeof data !== 'undefined' && data.length > 0){
if (gran === '5min'){
refreshSitesBy5Min(body, function(){
log.info('inserted: ' + data[0].id);
setTimeout(function(){
handler(i+1);
}, 2000);
})
}
if (gran === 'hourly'){
refreshSitesByHourly(body, function(){
log.info('inserted: ' + data[0].id);
setTimeout(function(){
handler(i+1);
}, 2000);
})
}
if (gran === 'daily'){
refreshSitesByDaily(body, function(){
log.info('inserted: ' + data[0].id);
setTimeout(function(){
handler(i+1);
}, 2000);
})
}
if (gran === 'monthly'){
refreshSitesByMonthly(body, function(){
log.info('inserted: ' + data[0].id);
setTimeout(function(){
handler(i+1);
}, 2000);
})
}
}
else{
setTimeout(function(){
handler(i+1);
}, 2000);
}
}
// re-try for concurrency error
else if (body && JSON.parse(body).statusCode == 429){
log.warn('error body ' + JSON.stringify(body));
setTimeout(function(){
handler(i);
}, 2000);
}
// if any other error, just skip
else {
setTimeout(function(){
handler(i+1);
}, 2000);
}
})
}
else{
return;
}
}
handler(0);
}
});
}
I believe the problem is inside this two blocks, I used memwatch to monitor v8 garbage collection, I see usage_trend is increasing fast, so it must have leaks.
This is very simple to solve...
First, get rid of the forEach loop, located here...
sites.forEach(function(elem, index, array){
Instead, create a recursive function that simply passes an index to the following iteration. What this does is create a loop that executes correctly in accordance to the given CPU and memory allotted. No need for process.nextTick() or any of that fancy jazz.
Asynchronous loops are not technically the answer, as they overload systems rather quickly with thousands of queues. Instead, iterate through each record, then only proceed to the next when the current process is finished.
Also, delete the current array index before proceeding to the next.
Eventually, the end of the loop is reached when the index returns "undefined". That is when the callback to the main function refreshSitesBy5Min is summoned.
function refreshSitesBy5Min(rawData, callback) {
var sites = JSON.parse(rawData).data
getSite(0)
function getSite(index) {
// we have reached the end
if(!sites[index])
return callback()
runProcess(sites[index]
// clear up memory after every iteration
delete sites[index]
// done with iteration, move on to the next
getSite(++index)
}
}
Still not done yet...
Big JSON Object
If your JSON object is massive, you will need to stream your JSON and handle tiny chunks at a time.
https://github.com/uhop/stream-json
Big Database Result Set
Your SQL query should utilize limit if you are returning more than 500 records results at a time, even smaller is better. So if your record set being returned is 100,000. Just grab 500 at a time in a recursive function, simply increment the index and multiply it by num_records, in this scenario: 500.
var offset = iter * 500
limit: [offset, 500]
Related
need some help in correcting a loop closure in javascript.
Required flow: MasterGet function is run, data is fetched from a mysql table, for each record fetched a set of 'rules' is run.
Issue faced: if there are two records fetched, while 'rules' are being run for record 1, the record 2 'rules' also get triggered. Need to modify code such at record 2 is checked only after the 'rules' action is completed for record 1.
function MasterGet() {
var countCheckRule = 0;
connection.query(
'SELECT * FROM MASTER',
function selectCb(error, rows, fields) {
if (error) {
console.log('Log 045 - GetData Error: ' + error.message);
return;
}
for (var i = 0; i < rows.length; i++) {
console.log(+ rows[i].INDEX1);
var firstResult = rows[i];
rules.checRules(firstResult, myhttp, function (rules_res) {
firstResult.rules = rules_res;
})
}
});
countCheckRule++;
setTimeout(funcL, 4000);
};
Any guidance will help. Thanks
Added to the issue:
The rules.checrules coding is as below:
exports.checRules = function (A, myhttp, _callback) {
var objrules = { 'rule12': false };
function rule11() {
if (A.NQ > 0 && A.PSQ > 0) {
objrules.rule11 = true;
if (config.execute) {
modifyOrder('S', 'A.BQ', A.TS);
}
} else {
objrules.rule11 = false;
}
}
rule11();
_callback(objrules);
}
So in the loop for 1st record, it checks rule11, and if rule11 is true then it has to execute 'modifyOrder' with the given variables, after 'modifyOrder' is completed, then go back to the loop and check for the 2nd record. If Rule11 is false for the 1st record, then it should automatically go back to the loop and check for the 2nd record.
Currently with the given changes, 2nd record check gets triggered before 'modifyOrder' is complete. Maybe the issue is that the code does not wait for the callback from 'modifyOrder'? Is that the issue? How can I make the code wait till 'modifyOrder' is complete if started.
This code may solve your problem.
This kind of problem accord bez of async nature of js.
function asyncLoop(i, rows, cb) {
if (i < rows.length) {
rules.checRules(rows[i], myhttp, function (rules_res) {
console.log(rules_res);
asyncLoop(i + 1, rows, cb);
//your code
});
} else {
cb();
}
}
function MasterGet() {
connection.query(
'SELECT * FROM MASTER',
function selectCb(error, rows, fields) {
if (error) {
console.log('Log 045 - GetData Error: ' + error.message);
return;
}
asyncLoop(0, rows, () => {
//after async loop complete...
});
});
countCheckRule++;
setTimeout(funcL, 4000);
}
I'm running a series of unit tests (node.js 4.x, aws-sdk, mocha) which load data into a table before each test then clears the table after the test.
I have two tests that are failing because of a ConditionExpression which triggers a ConditionCheckFailedException. But if I increase the read/write capacity they tests pass.
It's my understanding that the SDK handles throttling exceptions and retries them for you so why wouldn't my tests just run slower and pass? Instead it seems as though the tests fail to complete the scan -> batchWriteItem process and so there are records still left in the table when a new tests starts.
I'm told by team members that they've seen similar problems and they just increased the throughput to fix the problem. This doesn't sit right with me. Either I'm doing something wrong and there's a race condition with my tests or there should be a pattern I can implement to make sure that my operations complete when being throttled? I should be able to use throttling metrics to inform when I need to increase throughput but I should still be able to keep retrying until I run out of memory.
Has anyone else run into this and what have you done to handle the problem?
After some debugging I noticed the UnprocessedItems response element. After looking up UnprocessedItems in the docs I realize I should have read more closely. The code below will run a retry loop with a delay (exponential back-off):
var clearEventTable = function (tableName, client, cleared) {
var exclusiveStartKey = null;
var retryCount = 0;
var read = function(query, callback) {
client.scan(query, function (err, page) {
if(err) {
console.log(err);
return callback(err);
}
retryCount = 0;
exclusiveStartKey = page.LastEvaluatedKey || null;
if(page.Count == 0) {
return callback(null, {});
}
if(page.Count < 25 && exclusiveStartKey) {
console.log("read capacity limit reached: " + JSON.stringify(page, null, 2));
}
var keys = _.map(page.Items, function(n) {
return { DeleteRequest: { Key: n } };
});
var batch = {
RequestItems: {},
ReturnConsumedCapacity: "INDEXES",
ReturnItemCollectionMetrics: "SIZE"
};
batch.RequestItems[tableName] = keys;
callback(null, batch);
});
};
var write = function(batch, callback) {
if(batch && batch.RequestItems){
client.batchWriteItem(batch, function(err, result) {
if(err) {
console.log(err);
return callback(err);
}
if(Object.keys(result.UnprocessedItems).length !== 0) {
console.log("Retry batchWriteItem: " + JSON.stringify(result, null, 2));
retryCount++;
var retry = {
RequestItems: result.UnprocessedItems,
ReturnConsumedCapacity: "INDEXES",
ReturnItemCollectionMetrics: "SIZE"
};
// retry with exponential backoff
var delay = retryCount > 0 ? (50 * Math.pow(2, retryCount - 1)) : 0;
setTimeout(write(retry, callback), delay);
return;
}
callback(null, result);
});
} else {
callback(null);
}
};
var params = {
TableName: tableName,
ProjectionExpression: "aggregateId,id",
Limit: 25, // max 25 per batchWriteItem
ConsistentRead: false,
ReturnConsumedCapacity: "TOTAL"
};
async.doWhilst(function (next) {
// retrieve entities
if (exclusiveStartKey)
params.ExclusiveStartKey = exclusiveStartKey;
async.compose(write, read)(params, function (err, result) {
if (err) next(err);
else next(null, result);
});
}, function () {
// test if we need to load more
return exclusiveStartKey !== null;
}, function (err, r) {
// return results
if (err) {
console.log(err);
return cleared(err);
}
return cleared(null);;
});
};
Also take a look at the amount of memory provisioned for the Lambda. Might be too low and hitting the max leads to unpredictable results IMX.
I tried use async.
I have a rout function with a async.waterfall.
The 1st function call an external function and fetch all users in usersData
The 2nd function, via async.each, it call an external function to search infos for each user.
I want pass again usersData with the news values to the 3th function.
In the 3th function, for the moment, I have a async.each and I watch the datas for each user.
My issues
1) In the second function, I don't fetch the information for each user.
2) The 3th function is called before the 2nd and I don't fetch the new data
Thanks
router.post('/launch',function(req,res,next){
async.waterfall([
function(cb){
// fetch the global users
fetchUsers(usersData,cb);
},
function(usersData,cb){
async.each(usersData,
function(userdata,cb){
// fetch other data for each user
calcBalance(userdata, cb);
},function(err){
cb(err,usersData);
});
},
function(usersData,cb){
async.each(usersData,
function(userdata,cb) {
//watch the info with the news data
console.log(' 2 '+ JSON.stringify(userdata));
//console.log(3);
}
);
},
],
function(err,results){
console.log('Fin' + JSON.stringify(results));
res.render('synchros',{launch:'end'},results);
});
res.render('synchros',{launch:'end'});
});
function calcBalance(userData,cb){
var user_id=userData.id,
resultCalcBalance=0,
cats_id=[3,4,6],
tabData={};
async.each(cats_id,function(cat_id,cb){
switch (cat_id) {
case 3:
var comp = "<=";
break;
case 4:
var comp = "<=";
break;
case 6:
var comp = "<";
break;
}// fin du switch
var myquery = "select blabla+
//console.log(calcul_balance);
connectionMysql.query(myquery, function (err, rows, fields,cb) {
if (err) {
console.log('Error ' + err);
cb(err);
}
else if (rows.length != 0) {
if (rows != 0) {
}// end if
else {
}// end else
}); // end connectionMysql
},function(err){
cb(err,userData); // ?? I send the data here
});
cb(null, userData); // ?? I send the data here ??
}
I reindented, fixed some typos, and changed the names of the callbacks. I changed the second async.each to async.map because you're processing an array to get a set of one result per item.
The first problem was in the second to last line. You were calling back too early from calcBalance.
Another potential problem was an ambiguous callback name cb in the second waterfall function (as well as in calcBalance.)
Finally, you never ran the async.each callback in the third waterfall function, and if you calledback out of it, it was accidental.
You still aren't ever reporting success from one database query, so you will need to call done() if it worked. You might also want to use async.map for the database calls, this would let you assemble the results, like done(null, balanceForCategory)
router.post('/launch', function(req, res, next){
async.waterfall([
function(done){
// fetch the global users
fetchUsers(usersData,done);
},
function(usersData,done){
async.map(usersData, function(userdata, done2){
// fetch other data for each user
calcBalance(userdata, done2);
},function(err, results){
done(err,usersData);
});
},
function(usersData, done){
async.each(usersData, function(userdata, done2) {
//watch the info with the news data
console.log(' 2 '+ JSON.stringify(userdata));
//console.log(3);
}, done)
},
],
function(err, results){
// results will be undefined because we called done() from the above async.each
console.log('Fin' + JSON.stringify(results));
res.render('synchros', {launch:'end'}, results);
}); // end of async.each
}); // end of router.post()
function calcBalance(userData, callback){
var user_id=userData.id,
resultCalcBalance=0,
cats_id=[3,4,6],
tabData={};
async.each(cats_id, function(cat_id, done){
switch (cat_id) {
case 3:
var comp = "<=";
break;
case 4:
var comp = "<=";
break;
case 6:
var comp = "<";
break;
}// fin du switch
var myquery = "select blabla";
//console.log(calcul_balance);
connectionMysql.query(myquery, function (err, rows, fields, queryCb) { // what is this queryCb param?
if (err) {
console.log('Error ' + err);
queryCb(err); // This will callback whatever mySql passed in as queryCb
// done(err) // This will callback out of the async.each iterator and immediately the async.each callback
// callback(err) // This will callback out of calcBalance and continue executing
// return callback(err); // This will callback out of calcBalance and stop executing
} else if (rows.length != 0) {
if (rows != 0) {
// Your code might hang here without calling a callback
} else {
// Your code might hang here without calling a callback
}
}); // end connectionMysql
},function(err){
// Inside async.each callback. Either everything worked or something broke
callback(err,userData); // Send the data back out of calcBalance
});
//callback(null, userData); // Running this here will IMMEDIATELY call back before async.each runs
}
The calcBalance function
function calcBalance(userData,callback){
// Ensuite on va calculer les rtt_balances et holiday_balances et yesterday_extra_hours_month
var user_id=userData.id,
resultCalcBalance=0,
cats_id=[3,4,6],
tabData={},
dateJour=moment().format('YYYY-M-D');;
async.each(cats_id,function(cat_id,done){
switch (cat_id) {
case 3:
var comp = "<=";
break;
case 4:
var comp = "<=";
break;
case 6:
var comp = "<";
break;
}// fin du switch
var calcul_balance = "select * from table1"
connectionMysql.query(calcul_balance, function (err, rows, fields,queryCb) {
if (err) {
queryCb(err); // This will callback whatever mySql passed in as queryCb
// done(err) // This will callback out of the async.each iterator and immediately the async.each callback
// callback(err) // This will callback out of calcBalance and continue executing
// return callback(err); // This will callback out of calcBalance and stop executing
console.log('Error ' + err);
queryCb(err);
}
else if (rows.length != 0) {
if (rows != 0) {
// On va chercher les valuers sinon on les laisse à zéro par défaut.
for (var j = 0; j < rows.length; j++) {
if (!isNaN(rows[j].amount) && rows[j].amount != null) {
resultCalcBalance += parseInt(Math.round(rows[j].amount * 100) / 100);
//console.log('ResultCalculBalance 1chiffre ' + parseInt(Math.round(rows[j].amount*100)/100) + ' 2chiffre' + resultCalcBalance);
} else {
resultCalcBalance += 0;
//console.log('ResultCalculBalance 2' + JSON.stringify(rows[j].amount));
}
} // fin du for k
//console.log('Resultat : ' + userData.id + ' ' + cat_id + ' ' + resultCalcBalance);
if (cat_id == 3) userData.holiday_balance = resultCalcBalance;
if (cat_id == 4) userData.rtt_balance = resultCalcBalance;
if (cat_id == 6) userData.yesterday_extra_hours_month = resultCalcBalance;
}// fin du if
else {
if (cat_id == 3) userData.holiday_balance = 0;
if (cat_id == 4) userData.rtt_balance = 0;
if (cat_id == 6) userData.yesterday_extra_hours_month = 0;
}// fin du else
}// de la condition err ou pas
console.log('1 '+JSON.stringify(userData));
});
},function(err){
callback(err,userData);
});
//callback(null, userData); // Running this here will IMMEDIATELY call back before async.each runs
I may be over tired but for the life of me I cannot understand why the following is not working. I am trying to search if a string exists and if it does not, add it to a redis database
options = options || {};
var counter = 1,
client = redis.getClient();
options.name = options.name || '';
if (_.isEmpty(options.name)) {
return callback('Cannot add name. No name supplied');
} else {
options.name = options.name.trim();
}
client.get('mySavedKeys' + options.name, function (err, data) {
if (err) {return callback(err); }
if (!_.isNull(data)) {
console.log('Name found', options.name);
return callback(null, data);
} else {
counter += 1;
console.log('Name not found', options.name);
console.log('ID', counter)
client2.set('mySavedKeys' + options.name, counter, function (err) {
if (err) {return callback(err); }
console.log('Added', options.name);
return callback(null, counter);
});
}
});
If I run an array of names to add using async.each then it seems to run all the 'get' functions and then run the 'set' function so I am getting duplicate insertions.
I'm sure the answer is obvious but I cannot see the problem.
If you use async.eachSeries you would insure that the get/set happen atomically rather than all gets running in parallel.
I am currently trying to iterate through an array of JSON elements, parse and add the data I need into a specially formatted string, and once conditions are met, initiate the uploading of this data.
The problem that I am running into, however, is my variable 'deviceOutString' is being returned as undefined, leaving me with a string of 'undefined' written as many time as there are JSON elements in the array. I know that the return from the 'checkDuplicates' function is correct because before returning the value, the logs show that the value is correct.
I have attached my code below, please let me know if you have any ideas.
Thanks!
Old Code (updated below)
var i=0;
var parsedJson = JSON.parse(storedData) ;
var storedDataSize = parsedJson.length;
console.log('Stored Data Size: '+storedDataSize);
var async = require('async');
async.each(parsedJson, function( subElemJson, callback1) {
async.series([
function(callback){
console.log('dstring: ' + deviceOutString);
console.log('i : ' + i);
var subElemJsonPretty = JSON.stringify(subElemJson,null,0) ;
var date = subElemJson['date'];
deviceOutString += checkDuplicates(subElemJson, deviceOutString);
console.log('theLoop*DString: ' + deviceOutString);
callback(null, 'one');
},
function(callback){
if((i == storedDataSize - 1 || count == 225) && storedDataSize > 0) {
writeDCC(deviceOutString);
count = 0;
makeList();
}
i++;
callback(null, 'two');
setTimeout(function () { callback1(); }, 500);
}
]);
}); }
Updated New Code
function theLoop(storedData) {
var deviceOutString = '<list>';
var temp;
try {
var i=0;
var parsedJson = JSON.parse(storedData) ;
var storedDataSize = parsedJson.length;
console.log('Stored Data Size: '+storedDataSize);
var async = require('async');
var delayed = require('delayed');
async.each(parsedJson, function( subElemJson, callback1) {
async.series([
function(callback){
var subElemJsonPretty = JSON.stringify(subElemJson,null,0) ;
var date = subElemJson.date;
console.log('THIS IS THE DATE: '+date);
temp = checkDuplicates(subElemJson, deviceOutString);
console.log('This is the temp: ' + temp);
callback(null, temp);
}
], function(results){
console.log('*****Results are In*****: ' + results);
deviceOutString =+ temp;
if((i == storedDataSize - 1 || count == 225) && storedDataSize > 0) {
writeDCC(deviceOutString);
count = 0;
deviceOutString = '<list>';
}
i++;
callback1(results);
});
},
function(err){
if( err ) {
console.log('A file failed to process');
} else {
console.log('All files have been processed successfully');
}
});
} catch (error) {
console.info('Exception parsing ' + '\n\n' + error);
return;
}
}
So a few things
1: var date = subElemJson['date']; accessing object properties via array syntax is a bad practice. Nit picky but hey :P try var data = subElemJson.date; instead.
2: deviceOutString isn't defined anywhere in the code you provided.
3: Both async.series and async.each are going to want a callback function for when each is finished. that's the whole point of calling callback(null, 'one'); -- that you pass a value to the "results" array in the final async.series callback. You are calling setTimeout(function() { callback1(); }, 500); in the wrong place (also arbitrarily putting it behind a timeout?).
The proper async.series formatting is thus:
async.series([
function(callback) {
// do stuff
callback(null, someValue);
},
function(callback) {
// do other stuff
callback(null, someOtherValue);
}
], function(results) {
// all the stuffs are done
console.log(results); <-- results is an array containing "someValue" and "someOtherValue" from the iterations above
callback1(results);
});
Also, async.each is in the same boat -- it expects you to pass a "every thing I'm looping through has completed now!" function at the end.
Async docs on .each() (scroll down for docs on .series()): https://github.com/caolan/async#each