I am trying to save data from an external API into MongoDB using Node.js. The script feels really lightweight to me, but for some reason it's using a lot of RAM (from top):
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
2626 root 20 0 756m 113m 7148 S 6.5 11.4 3:11.74 nodejs
This is what the script does in pseudocode:
each 5 seconds
fetch 3 JSON lists through an API
for all new items in list
store in mongo
[edit]
The JSON lists are approx. 10 KB each, so I don't think it has to do with keeping them in memory until I've processed the items.
[/edit]
the (light) dependencies are:
querystring
https
underscore
mongodb (the native client)
moment
I wrote it as simple functions; when they return, they should give back all the memory they used, right?
Here is the whole script:
var querystring = require("querystring");
var https = require('https');
var fetch = function(cur, callback) {
cur = cur.toLowerCase().replace('/', '_');
var options = {
host: 'data.fxbtc.com',
path: '/api?op=query_last_trades&count=100&symbol=' + cur,
method: 'GET',
headers: {
'User-Agent': 'Mozilla/4.0 (compatible; node.js client)'
}
};
var req = https.request(options, function(res) {
res.setEncoding('utf8');
var buffer = '';
res.on('data', function(data) {
buffer += data;
});
res.on('end', function() {
try {
var json = JSON.parse(buffer);
} catch (err) {
return callback(err);
}
callback(null, json);
});
});
req.end();
}
var currencies = [
'BTC/CNY',
'LTC/CNY',
'LTC/BTC'
];
var LAST_TRADE = {
'BTC/CNY': 0,
'LTC/CNY': 0,
'LTC/BTC': 0
}
var _ = require('underscore');
var mongo = require('mongodb');
var moment = require('moment');
var init = function(next) {
mongo.connect('mongodb://127.0.0.1:27017/coindata', next);
}
var now = function() {
return moment().format('YYYY-MM-DD HH:mm:ss');
}
console.log(now(), 'STARTING');
setInterval(function() {
console.log(now(), 'alive')
}, 60000)
var collections = {};
var forever = function(err, db) {
if(err) throw err;
_.each(currencies, function(cur, i) {
collections[cur] = db.collection('fxbtc_' + cur);
collections[cur].ensureIndex({fid: 1}, {unique: true}, console.log);
setTimeout(function() {
console.log(now(), 'registering', cur);
setInterval(check(cur), 5 * 1000);
}, i * 1000);
});
}
var check = function(cur) {
return function() {
fetch(cur, function(err, trades) {
if(err) return console.log(now(), 'ERROR-FETCH', err);
trades = _.map(trades.datas, function(trade) {
return {
date: new Date(trade.date * 1000),
price: parseFloat(trade.rate),
amount: parseFloat(trade.vol),
fid: parseInt(trade.ticket)
}
});
trades = _.filter(trades, function(trade) {
return trade.fid > LAST_TRADE[cur];
});
var fids = _.pluck(trades, 'fid');
fids.push(LAST_TRADE[cur]);
LAST_TRADE[cur] = _.max(fids);
if(!trades.length)
return;
console.log(now(), 'storing:', trades.length, 'in', cur);
collections[cur].insert(trades, function(err, docs) {
if(err && err.code !== 11000) console.log(now(), 'ERROR-STORE', err);
});
});
}
}
init(forever);
Are there any obvious memory leaks in this script? How do I go about finding the source of all the used memory?
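One low-tech way to narrow this down (a sketch, not part of the original script; it reuses the now() helper defined above) is to log process.memoryUsage() next to the existing 'alive' log. If heapUsed stays flat while RSS is large, V8 has most likely just reserved memory rather than leaked it; if heapUsed keeps climbing, taking heap snapshots is the next step:
var logMemory = function() {
    // print resident set size and V8 heap figures in MB
    var mem = process.memoryUsage();
    console.log(now(), 'memory',
        'rss:', Math.round(mem.rss / 1048576) + 'MB',
        'heapUsed:', Math.round(mem.heapUsed / 1048576) + 'MB',
        'heapTotal:', Math.round(mem.heapTotal / 1048576) + 'MB');
};
setInterval(logMemory, 60000);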
The project I am working on polls a lot of different API services (15+) and stores all the latest changes.
My initial thought was to write a small script for each service, with a loop inside that stays up forever. The problem (as described above) is that somehow the memory would grow to 40-120 MB (depending on a couple of things) per service, and my system would run out of RAM.
This is how I solved it now:
Instead of keeping a process per service alive, I rewrote all the scripts to run only once and wrote a master script that is responsible for running each per-service script after x amount of time:
var cp = require('child_process');
var moment = require('moment');
var i = 0;
var watch = function(options) {
i++;
setTimeout(function() {
var fid = 0;
setInterval(function() {
var worker = cp.fork('./process_' + options.exchange + '.js');
worker.send(fid);
worker.once('message', function(new_fid) {
fid = new_fid;
worker.kill();
});
}, options.interval);
}, i * 3000);
}
And then I register all different services like so:
watch({exchange: 'bitcurex', interval: +moment.duration(9, 'minutes')});
It has been running for a little over 10 hours now with little to no memory footprint (I can't find it in top).
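For completeness, a per-service worker might look roughly like the sketch below. This is not the original code (the worker scripts aren't shown in the post), and fetchNewTrades is a hypothetical stand-in for the service-specific API call. The worker receives the last known fid from the master, does one poll, reports the new fid back, and is then killed:
// process_bitcurex.js -- hypothetical run-once worker
var mongo = require('mongodb');
// stand-in for the real exchange API call (not part of the original post)
var fetchNewTrades = function(sinceFid, callback) {
    callback(null, [], sinceFid);
};
process.on('message', function(fid) {
    mongo.connect('mongodb://127.0.0.1:27017/coindata', function(err, db) {
        if (err) return process.exit(1);
        fetchNewTrades(fid, function(err, trades, newFid) {
            if (err) return process.send(fid);          // keep the old fid on error
            if (!trades.length) return process.send(newFid);
            db.collection('bitcurex_trades').insert(trades, function() {
                process.send(newFid); // the master kills the worker once it receives this
            });
        });
    });
});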
Related
I am trying to launch a cluster that will stream files (newline-delimited JSON) from Google Cloud Storage and transform each row after fetching data from MongoDB. After transforming the row, I want to store it in Google's BigQuery, 10,000 rows at a time. All of this is working fine, but the issue is that the rate at which the streamed files are being processed decreases significantly over time.
I have set up the Node application on one server and MongoDB on another, both 8-core machines with 30 GB RAM. When the script is executed, the CPU usage on the application server and the MongoDB server is initially around 70-75%. After 30 minutes, the CPU usage falls to 10% and then finally to 1%. The application generates no exceptions. I can see from the application log that it finished processing a few files and took up new files for processing. One execution could be observed starting a little after 3:00 PM and running until almost 5:20 PM.
var cluster = require('cluster'),
os = require('os'),
numCPUs = os.cpus().length,
async = require('async'),
fs = require('fs'),
google = require('googleapis'),
bigqueryV2 = google.bigquery('v2'),
gcs = require('@google-cloud/storage')({
projectId: 'someproject',
keyFilename: __dirname + '/auth.json'
}),
dataset = bigquery.dataset('somedataset'),
bucket = gcs.bucket('somebucket.appspot.com'),
JSONStream = require('JSONStream'),
Transform = require('stream').Transform,
MongoClient = require('mongodb').MongoClient,
mongoUrl = 'mongodb://localhost:27017/bigquery',
mDb,
groupA,
groupB;
var rows = [],
rowsLen = 0;
function transformer() {
var t = new Transform({ objectMode: true });
t._transform = function(row, encoding, cb) {
// Get some information from mongodb and attach it to the row
if (row) {
groupA.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.OA_SA': 1, _id: 0 }
}, function(err, a) {
if (err) return cb();
groupB.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.WZ11CD': 1, _id: 0 }
}, function(err, b) {
if (err) return cb();
row.groupA = a ? a.properties.OA_SA : null;
row.groupB = b ? b.properties.WZ11CD : null;
// cache processed rows in memory
rows[rowsLen++] = { json: row };
if (rowsLen >= 10000) {
// batch insert rows in bigquery table
// and free memory
log('inserting 10000')
insertRowsAsStream(rows.splice(0, 10000));
rowsLen = rows.length;
}
cb();
});
});
} else {
cb();
}
};
return t;
}
var log = function(str) {
console.log(str);
}
function insertRowsAsStream(rows, callback) {
bigqueryV2.tabledata.insertAll({
"projectId": 'someproject',
"datasetId": 'somedataset',
"tableId": 'sometable',
"resource": {
"kind": "bigquery#tableDataInsertAllRequest",
"rows": rows
}
}, function(err, res) {
if (res && res.insertErrors && res.insertErrors.length) {
console.log(res.insertErrors[0].errors)
err = err || new Error(JSON.stringify(res.insertErrors));
}
});
}
function startStream(fileName, cb) {
// stream a file from Google cloud storage
var file = bucket.file(fileName),
called = false,
noop = function() {}; // no-op 'data' handler; keeps the raw byte stream flowing
log(`Processing file ${fileName}`);
file.createReadStream()
.on('data', noop)
.on('end', function() {
if (!called) {
called = true;
cb();
}
})
.pipe(JSONStream.parse())
.pipe(transformer())
.on('finish', function() {
log('transformation ended');
if (!called) {
called = true;
cb();
}
});
}
function processFiles(files, cpuIdentifier) {
if (files.length == 0) return;
var fn = [];
for (var i = 0; i < files.length; i++) {
fn.push(function(cb) {
startStream(files.pop(), cb);
});
}
// process 3 files in parallel
async.parallelLimit(fn, 3, function() {
log(`child process ${cpuIdentifier} completed the task`);
fs.appendFile(__dirname + '/complete_count.txt', '1');
});
}
if (cluster.isMaster) {
for (var ii = 0; ii < numCPUs; ii++) {
cluster.fork();
}
} else {
MongoClient.connect(mongoUrl, function(err, db) {
if (err) throw (err);
mDb = db;
groupA = mDb.collection('groupageo');
groupB = mDb.collection('groupbgeo');
processFiles(files, process.pid);
// `files` is an array of file names
// each file is in newline json delimited format
// ["1478854974993/000000000000.json","1478854974993/000000000001.json","1478854974993/000000000002.json","1478854974993/000000000003.json","1478854974993/000000000004.json","1478854974993/000000000005.json"]
});
}
Okay, I have found the culprit! The Google APIs Node.js client library makes use of a module called "stream-events", which implements Streams 0.8. Streams 0.8 does not control the rate at which it emits 'data' events based on the consumer's ability to consume data; the rate-controlling feature was introduced in Streams 1.0. So this essentially meant that the readable stream was pushing data at MongoDB faster than it could process it.
Solution:
I used the 'request' module instead of Google's client library. I supplied a signed URL to the request module which in turn fetched results as a stream which I could pipe into my transformer.
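A rough sketch of what that change looks like (assuming the same bucket, JSONStream and transformer() from the code above; getSignedUrl is part of @google-cloud/storage, and the one-hour expiry here is an arbitrary choice):
var request = require('request');
function startStream(fileName, cb) {
    var file = bucket.file(fileName);
    // ask GCS for a temporary signed URL for this object
    file.getSignedUrl({ action: 'read', expires: Date.now() + 60 * 60 * 1000 }, function(err, signedUrl) {
        if (err) return cb(err);
        request(signedUrl)              // request's streams honour backpressure
            .pipe(JSONStream.parse())
            .pipe(transformer())
            .on('finish', function() {
                log('transformation ended');
                cb();
            });
    });
}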
Takeaway:
Always check the modules you use for the stream versions they are using.
I have the app below and it stalls, and I have no idea why. I suspect I might be using the Bottleneck module the wrong way.
Disclaimer: I am trying to teach myself programming and Node.js with this project. Please help.
Intro
The point of the app is to fetch data missing from documents in a DB by requesting a webpage and parsing it jQuery-style, then saving the returned data to new keys in the document. The database consists of ~92,000 documents. The app uses the bottleneck, cheerio and request modules. I run the app on OS X.
The problem
If I set a limit to the number of requests, such as
var limiter = new bottleneck(5, 0);
The app stalls after the first batch (5 in this case). But why? I suspect something might be wrong with Bottleneck and how it expects my program to work. Something to do with callbacks per Bottleneck "Gotchas" maybe?
If I set no limit, the app kind of works: it fetches webpages and writes to the DB, though slowly and with a lot of errors due to resources being overloaded. This is how I tell Bottleneck not to limit:
var limiter = new bottleneck(0, 0);
These are the kind of errors I get:
{ [Error: getaddrinfo ENOTFOUND www.vestnikverejnychzakazek.cz www.vestnikverejnychzakazek.cz:443]
code: 'ENOTFOUND',
errno: 'ENOTFOUND',
syscall: 'getaddrinfo',
hostname: 'www.vestnikverejnychzakazek.cz',
host: 'www.vestnikverejnychzakazek.cz',
port: 443 }
{ [Error: connect EMFILE 65.52.146.11:443 - Local (undefined:undefined)]
code: 'EMFILE',
errno: 'EMFILE',
syscall: 'connect',
address: '65.52.146.11',
port: 443 }
App code
'use strict';
var express = require('express');
var router = express.Router();
var assert = require('assert');
var mongo = require('mongoskin');
var path = require('path');
var ObjectID = require('mongodb').ObjectID;
var db = mongo.db("mongodb://localhost:27017/zak", {
native_parser: true
});
var database = db.collection("zakazky");
var cheerio = require("cheerio");
var request = require("request");
var fs = require("fs");
var toJs = (path.join(__dirname, '../public/javascripts', 'jquery.min.js'));
var jquery = fs.readFileSync(toJs).toString();
var bottleneck = require("bottleneck");
var limiter = new bottleneck(5, 0);
/* GET home page. */
router.get('/', function(req, res) {
var cursor = database.find();
cursor.each(function(err, data) {
assert.equal(err, null);
if (data != null) {
var vvz = "vestnikverejnychzakazek";
var praha = "zakazky.praha.eu";
var id = data["_id"];
var zdroj = data["zdroj"];
if (zdroj.indexOf(vvz) > -1) {
if ((data["cpv"] == null) || (data["predpokladana_hodnota"] == null)) {
limiter.submit(getCPV, id, zdroj, null);
// getCPV(id, zdroj);
} else {
// console.log("we're good");
return
}
} else if (zdroj.indexOf(praha) > -1) {
// console.log("pha");
}
} else {
// callback();
}
});
var getCPV = function(id, zdroj, callback) {
console.log("CPV started");
var zdroj = zdroj.replace("http://", "https://");
console.log("zdroj: " + zdroj);
var cpv = [];
var retryWrapper = function(retries) {
var retries; // I added this
if (retries === 3) {
return;
} else if (retries === undefined) {
retries = 0;
} else if (retries > 0) {
console.log("trying again");
}
request(zdroj, function(err, resp, data) {
if (err) {
console.log(err);
return retryWrapper(retries + 1);
}
var $ = cheerio.load(data);
var predpokladnaHodnota = $("[id*='Hodnota1_']").first().attr("value");
$("[id*='HlavniSlovnik']").each(function() {
cpv.push(this.attribs.value);
});
// let's check what we've got is actual data
if (cpv.length === 0) {
return
} else {
// send it off
writeCPV(id, "cpv", cpv)
}
if (predpokladnaHodnota == undefined || predpokladnaHodnota == null) {
return
} else {
// send it off
writeCPV(id, "predpokladana_hodnota", predpokladnaHodnota)
}
callback();
});
}; // end of retryWrapper
retryWrapper();
};
var writeCPV = function(id, key, value) {
id = ObjectID(id);
(function() {
console.log("starting DB write 1");
database.update({
"_id": id
}, {
$set: {
[key]: value
}
}, function(err, results) {
if (err) {
console.log("error in Mongo DB: \n------------------------\n" + err);
}
console.log("Mongo success!:\n ----------------------\n" + results);
// callback();
});
})();
};
// send the browser we're done
res.sendStatus(200);
});
// ---------------------
module.exports = router;
Here is a sample document from the DB including the fetched keys:
{
"_id": ObjectId("568d91396912101c1007ab4e"),
"cena": 1636363,
"cena_celkem": 1500000,
"cena_dopocitano": false,
"created": "2015-04-07T13:45:10.420739",
"datum_zadani": "2015-02-16",
"dodavatel": "/api/v1/dodavatel/381836/",
"druh_rizeni": "/api/v1/druh_rizeni/1116/",
"id": 1312587,
"modified": "2015-04-18T14:22:10.765733",
"nazev": "Pohostinství",
"pocet_nabidek": 2,
"podporeno_eu": true,
"popis": "Kurzy v oblasti pohostinství (formou profesní kvalifikace)",
"ramcova_smlouva": true,
"resource_uri": "/api/v1/zakazka/1312587/",
"skupina": "490648-ISVZUS_2011",
"typ_zakazky": "/api/v1/typ_zakazky/193/",
"zadavatel": "/api/v1/zadavatel/131528/",
"zdroj": "http://www.vestnikverejnychzakazek.cz/en/Form/Display/568547",
"zdroj_nazev": "isvzus.cz",
"cpv": ["80000000-4", "80400000-8", "", "", ""],
"predpokladana_hodnota": "1 500 000,00"
}
Sample URL being requested:
http://www.vestnikverejnychzakazek.cz/en/Form/Display/568547
This has been up here for a while, but in case anyone else stumbles upon it, hopefully this will help someone!
The limiter calls getCPV, and getCPV does call the callback at the end of the sequence; however, there are some conditional statements in retryWrapper that allow an early return and, as a result, never call the callback. The limiter's queue piles up waiting for those callbacks to fire, so always make sure the callback gets fired in every scenario.
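Concretely, that means the early returns inside retryWrapper's request callback (the empty cpv case, the missing predpokladnaHodnota case, and the retry give-up case) all need to end with callback(). A sketch of the reworked callback body, keeping the rest of getCPV as it is:
request(zdroj, function(err, resp, data) {
    if (err) {
        console.log(err);
        if (retries >= 2) return callback(); // give up, but still release the limiter slot
        return retryWrapper(retries + 1);
    }
    var $ = cheerio.load(data);
    var predpokladnaHodnota = $("[id*='Hodnota1_']").first().attr("value");
    $("[id*='HlavniSlovnik']").each(function() {
        cpv.push(this.attribs.value);
    });
    if (cpv.length > 0) writeCPV(id, "cpv", cpv);
    if (predpokladnaHodnota != null) writeCPV(id, "predpokladana_hodnota", predpokladnaHodnota);
    callback(); // always fires, whether or not anything was written
});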
I have to insert about 1,000,000 documents into MongoDB using Node.js.
I'm generating these documents in a for loop, storing them in an array before finally inserting them into MongoDB.
var codeArray = new Array();
for (var i = 0; i<1000000; i++){
var token = strNpm.generate();
var now = moment().format('YYYYMMDD hhmmss');
var doc1 = {id:token,
Discount_strId:"pending",
Promotion_strCode:token,
Promotion_strStatus:"I",
Promotion_dtmGeneratedDate:now,
User_strLogin:"test",
Promotion_strMode:"S",
Promotion_dtmValidFrom:"pending",
Promotion_dtmValidTill:"pending",
LastModified_dtmStamp:now
};
codeArray.push(doc1);
}
db.collection('ClPromoCodeMaster').insert(codeArray, function (err, result) {
if (err){
console.log(err);
}else{
console.log('Inserted Records - ', result.ops.length);
}
});
The problem I'm facing is that MongoDB has an insert limit of 16 MB, so I can't insert the entire array at once.
Please suggest the most optimal solution.
The main problem is in the request size and not the document size, but it amounts to the same limitation. Bulk operations and the async library with async.whilst will handle this:
var bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp(),
i = 0;
async.whilst(
function() { return i < 1000000; },
function(callback) {
var token = strNpm.generate();
var now = moment().format('YYYYMMDD hhmmss');
var doc = {
id:token,
Discount_strId:"pending",
Promotion_strCode:token,
Promotion_strStatus:"I",
Promotion_dtmGeneratedDate:now,
User_strLogin:"test",
Promotion_strMode:"S",
Promotion_dtmValidFrom:"pending",
Promotion_dtmValidTill:"pending",
LastModified_dtmStamp:now
};
bulk.insert(doc);
i++;
// Drain every 1000
if ( i % 1000 == 0 ) {
bulk.execute(function(err,response){
bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp();
callback(err);
});
} else {
callback();
}
},
function(err) {
if (err) throw err;
console.log("done");
}
);
I should note that, regardless, there is an internal limit on bulk operations of 1000 operations per batch. You can submit larger sizes, but the driver is just going to break them up and still submit in batches of 1000.
The 1000 is a good number to stay at though, since it is already in line with how the request will be handled, as well as being a reasonable number of things to hold in memory before draining the request queue and sending to the server.
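The same chunking idea with the plain insert API looks roughly like this (a sketch only; docs is assumed to be the pre-built array from the question, db an already-connected database object, and insertMany is the 2.x-and-later driver method):
var chunkSize = 1000;
var insertInChunks = function(db, docs, done) {
    var i = 0;
    // insert one 1000-document slice at a time, waiting for each write to finish
    (function next(err) {
        if (err) return done(err);
        if (i >= docs.length) return done();
        var chunk = docs.slice(i, i + chunkSize);
        i += chunkSize;
        db.collection('ClPromoCodeMaster').insertMany(chunk, { ordered: true }, next);
    })();
};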
For inserting millions of records at a time, create a Node.js child process fork and use the MongoDB bulk API.
Child process creation (index.js):
const {fork} = require("child_process");
let counter = 1;
function createProcess(data){
const worker = fork("./dbOperation");
worker.send(data);
worker.on("message", (msg) => {
console.log("Worker Message :",counter, msg);
counter++;
})
}
function bulkSaveUser(records) {
const singleBatchCount = 10000; // Save 10,000 records per hit
const noOfProcess = Math.ceil(records/singleBatchCount);
let data = {};
console.log("No of Process :", noOfProcess);
for(let index = 1; index <= noOfProcess; index++) {
data.startCount = (index == 1) ? index : (((index - 1) * singleBatchCount) + 1);
data.endCount = index * singleBatchCount;
createProcess(data);
}
}
bulkSaveUser(1500000);
DB operation (dbOperation.js):
const MongoClient = require('mongodb').MongoClient;
// Collection Name
const collectionName = "";
// DB Connection String
const connString = "";
process.on("message", (msg) => {
console.log("Initialize Child Process", msg)
const {startCount, endCount} = msg;
inputStudents(startCount, endCount);
});
function initConnection() {
return new Promise(function(r, e) {
MongoClient.connect(connString, function(err, db) {
if (err) return e(err);
r(db);
});
});
}
function inputStudents(startCount, endCount) {
let bulkData = [];
for(let index = startCount; index <= endCount; index++ ){
var types = ['exam', 'quiz', 'homework', 'homework'];
let scores = []
// and each class has 4 grades
for (let j = 0; j < 4; j++) {
scores.push({'type':types[j],'score':Math.random()*100});
}
// there are 500 different classes that they can take
const class_id = Math.floor(Math.random() * 501); // get a class id between 0 and 500
const record = { 'student_id': index, 'scores': scores, 'class_id': class_id };
bulkData.push({ insertOne : { "document" : record } })
}
initConnection()
.then((db) => {
const studentDb = db.db("student");
const collection = studentDb.collection(collectionName);
console.log("Bulk Data :", bulkData.length);
collection.bulkWrite(bulkData, function(err, res) {
if (err) throw err;
//console.log("Connected Successfully",res);
process.send("Saved Successfully");
db.close();
});
})
.catch((err) => { console.log("Err :", err) });
}
Sample project to insert millions of records into MongoDB using a child process fork.
Given Node's async nature, it is difficult to time a series of web requests. How would I fire off 100 web requests and figure out how long each individual request takes? Knowing the OS will only allow a few concurrent web requests, how do I get the timing for each individual web request, removing the time spent waiting for the other connections to complete? I was hoping the socket event was fired when the request launched, but it seems the socket event is fired after the connection has been established.
var http = require('http');
var urls = [
'/cameron',
'/sara',
'...'
];
// Time a url collection.
function timeUrl(url, callback) {
var options = {
host: 'www.examplesite.com',
port: 80,
path: ''
};
var times = [];
times.push({'text': 'start', 'time':Date.now()});
http.get(options, function(res) {
times.push({'text': 'response', 'time':Date.now()});
var result = '';
res.on('data', function(chunk) {
result += chunk.length ;
// result += chunk;
});
res.on('end', function() {
times.push({'text': 'end', 'time': Date.now(), 'body': result, 'statusCode': res.statusCode}); // ,
callback(times);
});
}).on('error', function(e) {
callback();
console.log("Got error: " + e.message);
times.push({'error':Date.now()});
}).on('socket', function (response) {
times.push({'text': 'socket', 'time':Date.now()});
});
}
for (var i = 0; i < urls.length; i++) {
var url = urls[i];
timeUrl(url, function(times) {
console.log(url);
for (var i = 0; i < times.length; i++) {
console.log(times[i].text, times[i].time - times[1].time , 'ms');
}
console.log('statusCode:', times[times.length -1].statusCode, 'Response Size:', times[times.length -1].body);
console.log('-');
});
}
If you're worried about OS concurrency, just introduce a maximum concurrency (throttling) into your requests instead of trying to guess exactly when the OS has started each request. I'm skipping over some minor details like error handling, and I'm using the excellent async.js library:
var http = require('http')
, async = require('async')
, CONCURRENCY = 5 // edit to fit your OS concurrency limit
, results = {}
, urls = [
'/cameron',
'/sara',
'/...'
];
// Time a url collection.
function timeUrl(url, callback) {
var options = { host: 'www.examplesite.com', port: 80 }
, start = Date.now()
, socket = null;
options.path = url;
http.get(options, function(res) {
var response = Date.now()
, size = 0;
res.on('data', function(chunk) { size += chunk.length; });
res.on('end', function() {
var end = Date.now();
results[url] = { start: start, socket: socket, response: response, end: end, size: size };
callback();
});
}).on('error', function(e) {
results[url] = { start: start, socket: socket, error: Date.now(), stack: e };
callback();
}).on('socket', function () {
socket = Date.now();
});
}
async.forEachLimit(urls, CONCURRENCY, timeUrl, function() {
console.log(JSON.stringify(results));
});
For ease of use in doing what you seem to want to do, I've not seen anything beat Nodetime.
I have a problem with a memory leak in Node.js after adding gzip support. I wrote some code for tests. In the code below I use zlib, and I don't know why I have a memory leak.
How can I improve this code to avoid the memory leak? Can anybody help me?
var
zlib = require('zlib'),
crypto = require('crypto');
var cacheList = {
article: {},
};
var timeoutId1, timeoutId2
console.log('process.pid: '+ process.pid);
clean = function()
{
var time = new Date().getTime();
timeoutId1 = setTimeout(function() { clean() }, 5000);
var countDeleted = 0;
for (id in cacheList.article) {
if (cacheList.article[id] && cacheList.article[id].timeExpire + 5000 < time) {
delete cacheList.article[id];
countDeleted++;
}
}
console.log('deleted: ' + countDeleted);
}
run = function()
{
var time = new Date().getTime();
timeoutId1 = setTimeout(function() { run() }, 5);
var md5 = crypto.createHash('md5');
md5.update('' + time);
var id = md5.digest('hex');
//console.log('id: ' + id);
var text = id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id +id+id+id+id+id+id+id+id;
zlib.gzip(text, function(err, result) {
if (!err) {
cacheList.article[id] = {
timeExpire: time + 10000,
data: text,
datagzip: result,
};
}
});
}
timeoutId1 = setTimeout(function() { run() }, 3000);
timeoutId2 = setTimeout(function() { clean() }, 5000);
Change your last two lines to:
timeoutId1 = run()
timeoutId2 = clean()
Since those functions already call setTimeout on themselves, wrapping them in another setTimeout tells Node to run another instance of each function (which in turn reschedules itself, so now you have it running twice, and so on).