Node.js Readable stream slows over time, CPU usage falls - node.js

I am trying to launch a cluster that will stream files (new line delimited JSON) from google cloud storage and transform each row after fetching data from MongoDB. After transforming the row, i want to store it in Google's bigquery - 10000 rows at a time. All of this is working fine but the issue is that the rate at which the streamed files are being processed decreases significantly over time.
I have setup the node application on one server and mongodb on another. Both 8 core machines with 30GB RAM. When the script is executed, initially the CPU usage for the application server and mongodb server is around 70%-75%. After 30 minutes, the CPU usage falls down to 10% and then finally 1%. The application generates no exceptions. I can see the application log and find that it finished processing a few files and took up new files for processing. One execution can be observered below a little later than 3:00PM and a almost upto 5:20PM.
var cluster = require('cluster'),
os = require('os'),
numCPUs = os.cpus().length,
async = require('async'),
fs = require('fs'),
google = require('googleapis'),
bigqueryV2 = google.bigquery('v2'),
gcs = require('#google-cloud/storage')({
projectId: 'someproject',
keyFilename: __dirname + '/auth.json'
}),
dataset = bigquery.dataset('somedataset'),
bucket = gcs.bucket('somebucket.appspot.com'),
JSONStream = require('JSONStream'),
Transform = require('stream').Transform,
MongoClient = require('mongodb').MongoClient,
mongoUrl = 'mongodb://localhost:27017/bigquery',
mDb,
groupA,
groupB;
var rows = [],
rowsLen = 0;
function transformer() {
var t = new Transform({ objectMode: true });
t._transform = function(row, encoding, cb) {
// Get some information from mongodb and attach it to the row
if (row) {
groupA.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.OA_SA': 1, _id: 0 }
}, function(err, a) {
if (err) return cb();
groupB.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.WZ11CD': 1, _id: 0 }
}, function(err, b) {
if (err) return cb();
row.groupA = a ? a.properties.OA_SA : null;
row.groupB = b ? b.properties.WZ11CD : null;
// cache processed rows in memory
rows[rowsLen++] = { json: row };
if (rowsLen >= 10000) {
// batch insert rows in bigquery table
// and free memory
log('inserting 10000')
insertRowsAsStream(rows.splice(0, 10000));
rowsLen = rows.length;
}
cb();
});
});
} else {
cb();
}
};
return t;
}
var log = function(str) {
console.log(str);
}
function insertRowsAsStream(rows, callback) {
bigqueryV2.tabledata.insertAll({
"projectId": 'someproject',
"datasetId": 'somedataset',
"tableId": 'sometable',
"resource": {
"kind": "bigquery#tableDataInsertAllRequest",
"rows": rows
}
}, function(err, res) {
if (res && res.insertErrors && res.insertErrors.length) {
console.log(res.insertErrors[0].errors)
err = err || new Error(JSON.stringify(res.insertErrors));
}
});
}
function startStream(fileName, cb) {
// stream a file from Google cloud storage
var file = bucket.file(fileName),
called = false;
log(`Processing file ${fileName}`);
file.createReadStream()
.on('data', noop)
.on('end', function() {
if (!called) {
called = true;
cb();
}
})
.pipe(JSONStream.parse())
.pipe(transformer())
.on('finish', function() {
log('transformation ended');
if (!called) {
called = true;
cb();
}
});
}
function processFiles(files, cpuIdentifier) {
if (files.length == 0) return;
var fn = [];
for (var i = 0; i < files.length; i++) {
fn.push(function(cb) {
startStream(files.pop(), cb);
});
}
// process 3 files in parallel
async.parallelLimit(fn, 3, function() {
log(`child process ${cpuIdentifier} completed the task`);
fs.appendFile(__dirname + '/complete_count.txt', '1');
});
}
if (cluster.isMaster) {
for (var ii = 0; ii < numCPUs; ii++) {
cluster.fork();
}
} else {
MongoClient.connect(mongoUrl, function(err, db) {
if (err) throw (err);
mDb = db;
groupA = mDb.collection('groupageo');
groupB = mDb.collection('groupbgeo');
processFiles(files, process.pid);
// `files` is an array of file names
// each file is in newline json delimited format
// ["1478854974993/000000000000.json","1478854974993/000000000001.json","1478854974993/000000000002.json","1478854974993/000000000003.json","1478854974993/000000000004.json","1478854974993/000000000005.json"]
});
}

Okay, I have found the culprit! Google APIs Node.js client library makes use of a module called "stream-events" which implements Streams 0.8. Streams 0.8 do not control the rate at which it emits the 'data' event based on the consumer's ability to consume data. The rate controlling feature was introduced in Streams 1.0. So this essentially meant that the readable stream was throwing data at MongoDB at a rate that it was not able to process.
Solution:
I used the 'request' module instead of Google's client library. I supplied a signed URL to the request module which in turn fetched results as a stream which I could pipe into my transformer.
Take Away:
Always check the modules you use for the stream versions they are using.

Related

async.waterfall randomly sorts results

I was writing a nest code, tried using async.waterfall or async.series but I am getting random results every time I refresh. it seems to be because of the queries of the first 2 functions randomly finishing.
first query was sorting to committed_date DESC. but when I add the 2 sub queries. the sort gets distorted.
Step1: Loop landingpages
Step1.1 - fetch details1 repositories
Step1.2 - fetch details2 versions
Step2: build array
db.collection('landingpages').find({is_deleted:{$ne:1}}).sort({committed_date:-1}).limit(10).toArray(function(err, db_results) {
var data_array = [];
var x=1;
if(db_results.length == 0) {
return_data.lps = data_array;
parallel_done();
}else{
async.each(db_results, function (db_results1, cb) {
async.watefall(
[
function(callback) {
//if this is removed or passed as callback(null, ""); the sort is fixed from committed - 1
var data_repo = {};
db.collection('repositories').find({repository_id: repository_id}).toArray(function(err, db_results1) {
if(db_results1.length == 0){
var data_repo = {};
callback(null, data_repo);
}else{
var data_repo = db_results1[0];
callback(null, data_repo);
}
});
},
function(callback) {
//if this is removed or passed as callback(null, ""); the sort is fixed from committed - 1
var data_version = {};
db.collection('versions').find({landingpage_id: landingpage_id}).sort({_id:-1}).limit(1).toArray(function(err, db_results1) {
if(db_results1.length == 0){
var data_version = {};
callback(null, data_version);
}else{
var data_version = db_results1[0];
callback(null, data_version);
}
});
}
],
function (err, data_repo,data_version) {
var document = {
"x": x++,
"landingpage_id": db_results1.landingpage_id,
"repository_id": db_results1.repository_id,
"version_id": data_version.version_id,
"val": db_results1,
"data_repo": data_repo,
"data_version": data_version,
};
data_array.push(document);
if(data_array.length == db_results.length) {
return_data.lps = data_array;
}
}
);
});
}
});

Uploading series of large files in node to a web API

I am trying to create a node script to traverse a folder of large files (mp3s containing 1 hour music sets) and upload them to Mixcloud via their API. It works except in the case where it hits the API rate limit and needs to wait x seconds - I set a timeout and loop back to beginning, but after the set amount of time, the process exits and doesn't log any useful output. Is this due to a bug in my code, or the way I'm running it? I'm currently running it with sudo nohup node index.js > log.log, is this incorrect? my end goal is a script that can be run at the end of every day of our radio station and upload the archived files.
const restler = require('restler');
const fs = require('fs');
const readDir = require('readdir');
const powerOff = require('power-off');
const options = {
folder: 'files',
completefolder: 'complete',
accesstoken: 'xxxxxxxxxxxxxxxxxxxxxxx'
}
// shut down computer
const shutDown = () => {powerOff((err, stderr, stdout) => {
if(!err && !stderr) {
console.log(stdout);
}
})
};
// uploadFile uploads file with restler to mixcloud, if api returns rate limiting object, try again in x seconds
const uploadFile = (folder, filename) => {
const filepath = `./${folder}/${filename}`
fs.stat(`./${folder}/${filename}`, function(err, stats) {
const size = stats.size;
restler.post(`https://api.mixcloud.com/upload/?access_token=${options.accesstoken}`, {
multipart: true,
data: {
"mp3": restler.file(`./${folder}/${filename}`, null, size, null, 'audio/mpeg'),
"name": filename,
// "unlisted": true
// more data can be added here depending on changes in workflow, automate images etc
}
}).on("complete", function(data) {
const returned = JSON.parse(data);
if (returned.error) {
if (returned.error.type == "RateLimitException") {
// try again in x seconds
console.log(`uploading too fast, retrying upload of ${filename}after ${returned.error.retry_after} seconds`);
setTimeout(() => {(folder, filename) => uploadFile}, returned.error.retry_after*1000);
}
else {
console.log('non-rate-limiting error');
console.log(returned);
}
}
else {
console.log('Success!');
console.log(returned);
// move uploaded files into completed folder
fs.rename(`./${folder}/${filename}`, `./${options.completefolder}/${filename}`, (err)=> {
if (err) {
console.log(err)
}
else {
counter += 1;
console.log(counter);
if (counter === files.length) {
console.log('done');
shutDown();
}
}
})
}
});
});
};
// get all mp3s and upload all of them
const files = readDir.readSync(`./${options.folder}`, ['**.mp3'] );
let counter = 0;
for (var i = 0; i < files.length; i++) {
uploadFile(options.folder, files[i])
};

How to perform mass inserts into mongodb using NodeJS

I Have to Insert about 10,00000 documents in mongodb using nodejs.
I'm generating these documents using a for loop storing them into an array before finally inserting them into mongodb.
var codeArray = new Array();
for (var i = 0; i<1000000; i++){
var token = strNpm.generate();
var now = moment().format('YYYYMMDD hhmmss');
var doc1 = {id:token,
Discount_strId:"pending",
Promotion_strCode:token,
Promotion_strStatus:"I",
Promotion_dtmGeneratedDate:now,
User_strLogin:"test",
Promotion_strMode:"S",
Promotion_dtmValidFrom:"pending",
Promotion_dtmValidTill:"pending",
LastModified_dtmStamp:now
};
codeArray.push(doc1);
db.collection('ClPromoCodeMaster').insert(codeArray, function (err, result) {
if (err){
console.log(err);
}else{
console.log('Inserted Records - ', result.ops.length);
}
});
The problem I'm facing is mongo has an inserting limit of 16mb, so I can't insert the entire array at once.
Please suggest most optimum solutions.
The main problem is in the request size and not the document size, but it amounts to the same limitation. Bulk operations and the async library with async.whilst will handle this:
var bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp(),
i = 0;
async.whilst(
function() { return i < 1000000; },
function(callback) {
var token = strNpm.generate();
var now = moment().format('YYYYMMDD hhmmss');
var doc = {
id:token,
Discount_strId:"pending",
Promotion_strCode:token,
Promotion_strStatus:"I",
Promotion_dtmGeneratedDate:now,
User_strLogin:"test",
Promotion_strMode:"S",
Promotion_dtmValidFrom:"pending",
Promotion_dtmValidTill:"pending",
LastModified_dtmStamp:now
};
bulk.insert(doc);
i++;
// Drain every 1000
if ( i % 1000 == 0 ) {
bulk.execute(function(err,response){
bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp();
callback(err);
});
} else {
callback();
}
},
function(err) {
if (err) throw err;
console.log("done");
}
);
I should note that regardless there is an internal limit on bulk operations to 1000 operations per batch. You can submit in larger sizes, but the driver is just going to break these up and still submit in batches of 1000.
The 1000 is a good number to stay at though, since it is already in line with how the request will be handled, as well as being a reasonable number of things to hold in memory before draining the request queue and sending to the server.
For inserting millions of record at a time, Create node.js child process fork with MongoDb bulk api.
Child Process Creation:(index.js)
const {fork} = require("child_process");
let counter = 1;
function createProcess(data){
const worker = fork("./dbOperation");
worker.send(data);
worker.on("message", (msg) => {
console.log("Worker Message :",counter, msg);
counter++;
})
}
function bulkSaveUser(records) {
const singleBatchCount = 10000; // Save 10,000 records per hit
const noOfProcess = Math.ceil(records/singleBatchCount);
let data = {};
console.log("No of Process :", noOfProcess);
for(let index = 1; index <= noOfProcess; index++) {
data.startCount = (index == 1) ? index : (((index - 1) * singleBatchCount) + 1);
data.endCount = index * singleBatchCount;
createProcess(data);
}
}
bulkSaveUser(1500000);
DB Operation (dbOperation.js)
const MongoClient = require('mongodb').MongoClient;
// Collection Name
const collectionName = "";
// DB Connection String
const connString = "";
process.on("message", (msg) => {
console.log("Initialize Child Process", msg)
const {startCount, endCount} = msg;
inputStudents(startCount, endCount);
});
function initConnection() {
return new Promise(function(r, e) {
MongoClient.connect(connString, function(err, db) {
if (err) e(err)
r(db);
});
});
}
function inputStudents(startCount, endCount) {
let bulkData = [];
for(let index = startCount; index <= endCount; index++ ){
var types = ['exam', 'quiz', 'homework', 'homework'];
let scores = []
// and each class has 4 grades
for (j = 0; j < 4; j++) {
scores.push({'type':types[j],'score':Math.random()*100});
}
// there are 500 different classes that they can take
class_id = Math.floor(Math.random()*501); // get a class id between 0 and 500
record = {'student_id':index, 'scores':scores, 'class_id':class_id};
bulkData.push({ insertOne : { "document" : record } })
}
initConnection()
.then((db) => {
const studentDb = db.db("student");
const collection = studentDb.collection(colName)
console.log("Bulk Data :", bulkData.length);
collection.bulkWrite(bulkData, function(err, res) {
if (err) throw err;
//console.log("Connected Successfully",res);
process.send("Saved Successfully");
db.close();
});
})
.catch((err) => { console.log("Err :", err) });
}
Sample project to insert millions of record in mongodb using child process fork

Is it possible to build a dynamic task list in nodejs Async (waterfall, series, etc...)

I am pulling information from some collections in mongo that contain node and edge data. First i must get the node so i can grab its edges. Once i have a list of edges i then go back out and grab more nodes (etc.. based on a depth value). The following code is an loose example of how i am attempting to use async.waterfall and the task list.
Initially i have only a single task but once i make my first query i add to the task array. Unfortunately this does not seem to register with async and it does not continue to process the tasks i am adding.
Is there a better way to do this?
var async = require('async')
var mongoose = require('mongoose')
var _ = requrie('underscore')
var Client = this.Mongo.connect(/*path to mongo*/)
var Node = mongoose.Schema({
id : String,
graph_id : String
})
var Edge = mongoose.Schema({
id : String,
source_id : String,
destination_id : String
})
var Nodes = Client.model('myNode', Node)
var Edges = Client.model('myEdge', Edge)
var funcs = []
var round = 1
var depth = 2
var query = {
node : {
id : '12345'
},
edge : {
id : '12345'
}
}
var addTask = function(Nodes, Edges, query, round, depth) {
return function(callback) {
queryData(Nodes, Edges, query, function(err, node_list) {
if(depth > round) {
round++
function_array.push(addTask(Nodes, Edges, query, round, depth))
}
})
}
}
var queryData = function(Nodes, Edges, query, cb) {
async.waterfall([
function(callback) {
Nodes.find(query.node, function(err, nodes) {
var node_keys = _.map(nodes, function(node) {
return node.id
})
callback(null, nodes, node_keys)
})
},
function(nodes, node_keys, callback) {
query.edge.$or = [ {'source_id' : {$in:node_keys}}, {'destination_id' : {$in:node_keys}} ]
Edges.find(query.edge, function(err, edges) {
var edge_keys = _.map(edges, function(edge) {
if(edge['_doc']['source_id'] != query.node.id) {
return edge['_doc']['source_id']
} else {
return edge['_doc']['destination_id']
}
callback(null, nodes, edges, node_keys, edge_keys)
})
})
}
], function(err, nodes, edges, node_keys, edge_keys) {
// update the results object then...
cb(null, _.uniq(edge_keys)
})
}
var function_array = []
function_array.push(addTask(Nodes, Edges, query, round, depth))
async.waterfall(function_array, function(err) {
Client.disconnect()
//this should have run more than just the initial task but does not
})
--------------------- UPDATE ---------------------------
So after playing around with trying to get Async waterfall or series to do this by adding trailing functions I decided to switch to using async.whilst and am now happy with the solution.
function GraphObject() {
this.function_array = []
}
GraphObject.prototype.doStuff = function() {
this.function_array.push(this.buildFunction(100))
this.runTasks(function(err) {
console.log('done with all tasks')
}
}
GraphObject.prototype.buildFunction = function(times) {
return function(cb) {
if(times != 0) {
this.function_array.push(this.buildFunction(times - 1))
}
cb(null)
}
}
GraphObject.prototype.runTasks = function(cb) {
var tasks_run = 0
async.whilst(
function(){
return this.function_array.length > 0
}.bind(this),
function(callback) {
var func = this.function_array.shift()
func.call(this, function(err) {
tasks_run++
callback(err)
})
}.bind(this),
function(err) {
console.log('runTasks ran '+tasks_run+' tasks')
if(err) {
cb(500)
}
cb(null)
}.bind(this)
)
}
A task in your function_array can only add a new task to the array provided it is NOT the last task in the array.
In your case, your function_array contained only 1 task. That task itself cannot add additional tasks since it's the last task.
The solution is to have 2 tasks in the array. A startTask to bootstrap the process, and a finalTask that is more of a dummy task. In that case,
function_array = [startTask, finalTask];
Then startTask would add taskA, taskB will add task C and eventually
function_array = [startTask, taskA, taskB, taskC, finalTask];
The sample code below that illustrates the concepts.
var async = require('async');
var max = 6;
var nodeTask = function(taskId, value, callback){
var r = Math.floor(Math.random() * 20) + 1;
console.log("From Node Task %d: %d", taskId, r);
// add an edge task
if (taskId < max) {
function_array.splice(function_array.length-1, 0, edgeTask);
}
callback(null, taskId + 1, value + r);
};
var edgeTask = function(taskId, value, callback){
var r = Math.floor(Math.random() * 20) + 1;
console.log("From Edge Task %d: %d", taskId, r);
// add a node task
if (taskId < max) {
function_array.splice(function_array.length-1, 0, nodeTask);
}
callback(null, taskId + 1, value + r);
};
var startTask = function(callback) {
function_array.splice(function_array.length-1, 0, nodeTask);
callback(null, 1, 0);
};
var finalTask = function(taskId, value, callback) {
callback(null, value);
};
var function_array = [startTask, finalTask];
async.waterfall(function_array, function (err, result) {
console.log("Sum is ", result);
});

small nodejs process is eating 113m memory

I am trying to save data from an external API into a mongodb using nodejs. The script feels really lightweight to me, but for some reason it's using a lot of RAM (from top):
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
2626 root 20 0 756m 113m 7148 S 6.5 11.4 3:11.74 nodejs
This is what the script does in psuedo code:
each 5 seconds
fetch 3 JSON lists through an API
for all new items in list
store in mongo
[edit]
The JSON lists are aprox. 10kb each. So I don't think it has to do with keeping that in memory until I processed the items.
[/edit]
the (light) dependencies are:
querystring
https
underscore
mongodb (the native client)
moment
I wrote it as simple functions, when returning they should give back all the memory they used right?
Here is the whole script:
var querystring = require("querystring");
var https = require('https');
var fetch = function(cur, callback) {
cur = cur.toLowerCase().replace('/', '_');
var options = {
host: 'data.fxbtc.com',
path: '/api?op=query_last_trades&count=100&symbol=' + cur,
method: 'GET',
headers: {
'User-Agent': 'Mozilla/4.0 (compatible; node.js client)'
}
};
var req = https.request(options, function(res) {
res.setEncoding('utf8');
var buffer = '';
res.on('data', function(data) {
buffer += data;
});
res.on('end', function() {
try {
var json = JSON.parse(buffer);
} catch (err) {
return callback(err);
}
callback(null, json);
});
});
req.end();
}
var currencies = [
'BTC/CNY',
'LTC/CNY',
'LTC/BTC'
];
var LAST_TRADE = {
'BTC/CNY': 0,
'LTC/CNY': 0,
'LTC/BTC': 0
}
var _ = require('underscore');
var mongo = require('mongodb');
var moment = require('moment');
var init = function(next) {
mongo.connect('mongodb://127.0.0.1:27017/coindata', next);
}
var now = function() {
return moment().format('YYYY-MM-DD HH:mm:ss');
}
console.log(now(), 'STARTING');
setInterval(function() {
console.log(now(), 'alive')
}, 60000)
var collections = {};
var forever = function(err, db) {
if(err) throw err;
_.each(currencies, function(cur, i) {
collections[cur] = db.collection('fxbtc_' + cur);
collections[cur].ensureIndex({fid: 1}, {unique: true}, console.log);
setTimeout(function() {
console.log(now(), 'registering', cur);
setInterval(check(cur), 5 * 1000);
}, i * 1000);
});
}
var check = function(cur) {
return function() {
fetch(cur, function(err, trades) {
if(err) return console.log(now(), 'ERROR-FETCH', err);
trades = _.map(trades.datas, function(trade) {
return {
date: new Date(trade.date * 1000),
price: parseFloat(trade.rate),
amount: parseFloat(trade.vol),
fid: parseInt(trade.ticket)
}
});
trades = _.filter(trades, function(trade) {
return trade.fid > LAST_TRADE[cur];
});
var fids = _.pluck(trades, 'fid');
fids.push(LAST_TRADE[cur]);
LAST_TRADE[cur] = _.max(fids);
if(!trades.length)
return;
console.log(now(), 'storing:', trades.length, 'in', cur);
collections[cur].insert(trades, function(err, docs) {
if(err && err.code !== 11000) console.log(now(), 'ERROR-STORE', err);
});
});
}
}
init(forever);
Are there any obvious memory leaks in this script? How do I go about finding the source of all the used memory?
The project I am working on is polling a lot of different API services (15+) and storing all latest changes.
My initial thought was to write a small script for each different service, which has a loop inside that should stay up forever. The problem (as described above) is that somehow the memory would grow to 40 - 120mb (depending on a couple of things) per service and my system would run out of RAM.
This is how I solved it now:
instead of keeping a process per service alive I rewrote all scripts to run only once and wrote a master script that is responsible for running each script per service script after x amount of time:
var cp = require('child_process');
var moment = require('moment');
var i = 0;
var watch = function(options) {
i++;
setTimeout(function() {
var fid = 0;
setInterval(function() {
var worker = cp.fork('./process_' + options.exchange + '.js');
worker.send(fid);
worker.once('message', function(new_fid) {
fid = new_fid;
worker.kill();
});
}, options.interval);
}, i * 3000);
}
And then I register all different services like so:
watch({exchange: 'bitcurex', interval: +moment.duration(9, 'minutes')});
It has been running for a little over 10 hours or now with little to no memory footprint (I can't find it in top).

Resources