Our company is planning to transition from Redis to Aerospike, but we are seeing some strange issues with missing get requests (only about 35% of them make it back to the callback function).
Here is the code we are testing with:
var cluster = require('cluster');
var numCPUs = require('os').cpus().length;
if (cluster.isMaster)
{
for (var i = 0; i < numCPUs; i++)
{
var worker = cluster.fork();
}
}
else
{
var start = new Date().getTime();
var requests = 0;
var responses = 0;
var aerospike = require('aerospike');
var status = aerospike.status;
var client = aerospike.client({
hosts: [
{ addr: '127.0.0.1', port: 3000 }
]
});
function connect_cb( err, client) {
if (err.code == status.AEROSPIKE_OK) {
console.log("Aerospike Connection Success")
}
}
client.connect(connect_cb)
setInterval(function(){
for(var i=0; i<50; i++)
{
var key = aerospike.key('dexi','toys','floor_'+i);
requests++;
client.get(key, function(err, rec, meta) {
responses++;
if ( err.code == status.AEROSPIKE_OK )
{
}
else
{
console.error('Get Error:', err);
}
});
}
},10);
setInterval(function(){
for(var i=0; i<50; i++)
{
var key = aerospike.key('dexi','toys','floor_'+i);
var rec = {
uid: 1000, // integer data stored in bin called "uid"
name: "user_name", // string data stored in bin called "user_name"
dob: { mm: 12, dd: 29, yy: 1995}, // map data stored (msgpack format) in bin called "dob"
friends: [1001, 1002, 1003]
};
var metadata = {
ttl: 10000,
gen: 0
};
client.put(key, rec, metadata, function(err) {
switch ( err.code ) {
case status.AEROSPIKE_OK:
break;
default:
console.error("Put Error: " + err.message);
process.exitCode = 1;
break;
}
});
}
},10);
setInterval(function(){
var timeSpent = ( new Date().getTime()) - start;
console.log(requests, responses,timeSpent);
},15000);
}
Below is the console output we are seeing:
34400 9306 15098
34150 9250 15080
35050 9330 15087
34150 9235 15092
33250 9310 15120
33950 9249 15090
34650 9298 15101
35000 9400 15102
34700 9300 15166
33150 9399 15181
34500 9300 15193
33850 9292 15207
34400 9250 15162
34100 9360 15212
34050 9250 15171
34100 9348 15159
33800 9250 15118
34300 9309 15189
34050 9300 15152
34250 9405 15181
As you can see, on average, for every ~35k get requests we send, only a small percentage actually come back to the callback. Our Aerospike dashboard reflects the same discrepancy (it only sees about 35% of the gets sent), as its throughput tracks the responses we are getting back.
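One way to check whether gets are actually being lost, rather than simply being issued faster than their callbacks can drain, is to gate each new batch on the previous one completing. Below is a diagnostic variation of the get loop (a sketch, not part of the original test); it reuses the client, status, requests and responses variables from the worker code above:

var inFlight = 0; // gets whose callback has not fired yet
setInterval(function () {
    if (inFlight > 0) return; // previous batch has not drained; skip this tick
    for (var i = 0; i < 50; i++) {
        var key = aerospike.key('dexi', 'toys', 'floor_' + i);
        requests++;
        inFlight++;
        client.get(key, function (err, rec, meta) {
            inFlight--;
            responses++;
            if (err.code !== status.AEROSPIKE_OK) {
                console.error('Get Error:', err);
            }
        });
    }
}, 10);

If requests and responses stay roughly equal with this gating in place, the original numbers reflect a growing callback backlog rather than requests being dropped by the server.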
Related
I am trying to launch a cluster that streams files (newline-delimited JSON) from Google Cloud Storage and transforms each row after fetching data from MongoDB. After transforming a row, I want to store it in Google's BigQuery, 10,000 rows at a time. All of this is working fine, but the issue is that the rate at which the streamed files are processed decreases significantly over time.
I have set up the Node application on one server and MongoDB on another, both 8-core machines with 30 GB of RAM. When the script is executed, CPU usage on the application server and the MongoDB server is initially around 70-75%. After 30 minutes it falls to 10%, and then finally to 1%. The application throws no exceptions, and the application log shows that it finished processing a few files and picked up new files for processing. In one run this could be observed from a little after 3:00 PM until almost 5:20 PM.
var cluster = require('cluster'),
os = require('os'),
numCPUs = os.cpus().length,
async = require('async'),
fs = require('fs'),
google = require('googleapis'),
bigqueryV2 = google.bigquery('v2'),
gcs = require('@google-cloud/storage')({
projectId: 'someproject',
keyFilename: __dirname + '/auth.json'
}),
dataset = bigquery.dataset('somedataset'),
bucket = gcs.bucket('somebucket.appspot.com'),
JSONStream = require('JSONStream'),
Transform = require('stream').Transform,
MongoClient = require('mongodb').MongoClient,
mongoUrl = 'mongodb://localhost:27017/bigquery',
mDb,
groupA,
groupB;
var rows = [],
rowsLen = 0;
function transformer() {
var t = new Transform({ objectMode: true });
t._transform = function(row, encoding, cb) {
// Get some information from mongodb and attach it to the row
if (row) {
groupA.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.OA_SA': 1, _id: 0 }
}, function(err, a) {
if (err) return cb();
groupB.findOne({
'geometry': { $geoIntersects: { $geometry: { type: 'Point', coordinates: [row.lon, row.lat] } } }
}, {
fields: { 'properties.WZ11CD': 1, _id: 0 }
}, function(err, b) {
if (err) return cb();
row.groupA = a ? a.properties.OA_SA : null;
row.groupB = b ? b.properties.WZ11CD : null;
// cache processed rows in memory
rows[rowsLen++] = { json: row };
if (rowsLen >= 10000) {
// batch insert rows in bigquery table
// and free memory
log('inserting 10000')
insertRowsAsStream(rows.splice(0, 10000));
rowsLen = rows.length;
}
cb();
});
});
} else {
cb();
}
};
return t;
}
var log = function(str) {
console.log(str);
}
function insertRowsAsStream(rows, callback) {
bigqueryV2.tabledata.insertAll({
"projectId": 'someproject',
"datasetId": 'somedataset',
"tableId": 'sometable',
"resource": {
"kind": "bigquery#tableDataInsertAllRequest",
"rows": rows
}
}, function(err, res) {
if (res && res.insertErrors && res.insertErrors.length) {
console.log(res.insertErrors[0].errors)
err = err || new Error(JSON.stringify(res.insertErrors));
}
});
}
function startStream(fileName, cb) {
// stream a file from Google cloud storage
var file = bucket.file(fileName),
called = false;
log(`Processing file ${fileName}`);
file.createReadStream()
.on('data', function noop() {})
.on('end', function() {
if (!called) {
called = true;
cb();
}
})
.pipe(JSONStream.parse())
.pipe(transformer())
.on('finish', function() {
log('transformation ended');
if (!called) {
called = true;
cb();
}
});
}
function processFiles(files, cpuIdentifier) {
if (files.length == 0) return;
var fn = [];
for (var i = 0; i < files.length; i++) {
fn.push(function(cb) {
startStream(files.pop(), cb);
});
}
// process 3 files in parallel
async.parallelLimit(fn, 3, function() {
log(`child process ${cpuIdentifier} completed the task`);
fs.appendFile(__dirname + '/complete_count.txt', '1');
});
}
if (cluster.isMaster) {
for (var ii = 0; ii < numCPUs; ii++) {
cluster.fork();
}
} else {
MongoClient.connect(mongoUrl, function(err, db) {
if (err) throw (err);
mDb = db;
groupA = mDb.collection('groupageo');
groupB = mDb.collection('groupbgeo');
processFiles(files, process.pid);
// `files` is an array of file names
// each file is in newline json delimited format
// ["1478854974993/000000000000.json","1478854974993/000000000001.json","1478854974993/000000000002.json","1478854974993/000000000003.json","1478854974993/000000000004.json","1478854974993/000000000005.json"]
});
}
Okay, I have found the culprit! The Google APIs Node.js client library makes use of a module called "stream-events", which implements Streams 0.8. Streams 0.8 streams do not control the rate at which they emit 'data' events based on the consumer's ability to consume the data; that rate-controlling (backpressure) feature was introduced in Streams 1.0. This essentially meant that the readable stream was throwing data at MongoDB faster than it could process it.
Solution:
I used the 'request' module instead of Google's client library: I supply a signed URL to request, which fetches the results as a stream that I can pipe into my transformer.
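A rough sketch of that wiring, reusing bucket, JSONStream and transformer() from the script above (the request dependency, the getSignedUrl options and the one-hour expiry here are illustrative, not taken from the original code):

var request = require('request');

function startStreamViaSignedUrl(fileName, cb) {
    // Ask GCS for a short-lived read URL, then stream it with request,
    // which respects backpressure from the downstream transformer.
    bucket.file(fileName).getSignedUrl({
        action: 'read',
        expires: Date.now() + 60 * 60 * 1000
    }, function (err, url) {
        if (err) return cb(err);
        request(url)
            .on('error', cb)
            .pipe(JSONStream.parse())
            .pipe(transformer())
            .on('finish', function () {
                cb();
            });
    });
}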
Take Away:
Always check the modules you use for the stream versions they are using.
For a school project I'm working on a Node.js application.
I need to submit the locations of random Pokémon. At the moment I'm using dummy data, but I'm already getting errors.
I submit a maximum of 10 locations to MongoDB using Mongoose, but I get the following error:
RangeError: Maximum call stack size exceeded
The data is valid according to the Mongoose schema. The error also appears when I submit just one location (on its own, not wrapped in an array).
{ _id: 5702bcc7698a36c82833a943,
name: 'Pikachu',
pid: 0,
lng: 5.298302246195419,
lat: 51.68774273818562 }.
My code looks like this:
var startDate = new Date();
if (!req.query.lat || !req.query.lng){
res.status(400).json("Your location has not been included");
}
var lat = parseFloat(req.query.lat);
var lng = parseFloat(req.query.lng);
location.find({}).exec(function(err, locations){
if (err){ return next(err); }
var filteredlocations = [];
locations.forEach(function(fetchedLocation) {
distance = calcDistance(fetchedLocation.lat, fetchedLocation.lng, lat, lng);
if (distance < 1000){
filteredlocations.push(fetchedLocation);
}
});
if (filteredlocations.length < 10) {
var toAdd = 1 - filteredlocations.length;
var newlocations = [];
for (var i = 0; i < toAdd; i++){
var randomlocation = randomLocation(lat, lng);
var newlocation = new location({
"lat": randomlocation.lat,
"lng": randomlocation.lng,
"pid": 0,
"name": "Pikachu",
});
newlocations.push(newlocation);
}
console.log(newlocations[0]);
location.collection.insert(newlocations[0], onInsert);
function onInsert(err, docs){
if (err){
console.log("err: " + err);
} else {
filteredlocations.push(docs);
res.json(filteredlocations);
}
}
}
});
I have to insert about 1,000,000 documents into MongoDB using Node.js.
I'm generating these documents in a for loop, storing them in an array, and finally inserting the array into MongoDB.
var codeArray = new Array();
for (var i = 0; i<1000000; i++){
var token = strNpm.generate();
var now = moment().format('YYYYMMDD hhmmss');
var doc1 = {id:token,
Discount_strId:"pending",
Promotion_strCode:token,
Promotion_strStatus:"I",
Promotion_dtmGeneratedDate:now,
User_strLogin:"test",
Promotion_strMode:"S",
Promotion_dtmValidFrom:"pending",
Promotion_dtmValidTill:"pending",
LastModified_dtmStamp:now
};
codeArray.push(doc1);
}
db.collection('ClPromoCodeMaster').insert(codeArray, function (err, result) {
if (err){
console.log(err);
}else{
console.log('Inserted Records - ', result.ops.length);
}
});
The problem I'm facing is that Mongo has a 16 MB limit per insert, so I can't insert the entire array at once.
Please suggest the most efficient solution.
The main problem is the request size, not the document size, but it amounts to the same limitation. Bulk operations, together with the async library's async.whilst, will handle this:
var async = require('async'),
bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp(),
i = 0;
async.whilst(
function() { return i < 1000000; },
function(callback) {
var token = strNpm.generate();
var now = moment().format('YYYYMMDD hhmmss');
var doc = {
id:token,
Discount_strId:"pending",
Promotion_strCode:token,
Promotion_strStatus:"I",
Promotion_dtmGeneratedDate:now,
User_strLogin:"test",
Promotion_strMode:"S",
Promotion_dtmValidFrom:"pending",
Promotion_dtmValidTill:"pending",
LastModified_dtmStamp:now
};
bulk.insert(doc);
i++;
// Drain every 1000
if ( i % 1000 == 0 ) {
bulk.execute(function(err,response){
bulk = db.collection('ClPromoCodeMaster').initializeOrderedBulkOp();
callback(err);
});
} else {
callback();
}
},
function(err) {
if (err) throw err;
console.log("done");
}
);
I should note that regardless, there is an internal limit of 1,000 operations per batch for bulk operations. You can submit larger batches, but the driver will just break them up and still submit them to the server in batches of 1,000.
Staying at 1,000 is a good choice anyway, since it is in line with how the request will be handled and is a reasonable number of documents to hold in memory before draining the request queue and sending to the server.
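If your driver version exposes insertMany, the same chunking idea can also be written without the bulk builder. The following is only a sketch under the same assumptions as the question (a connected db handle plus strNpm and moment); the 1,000-document batch size mirrors the note above:

var BATCH = 1000,
    TOTAL = 1000000;

function insertChunk(start, done) {
    if (start >= TOTAL) return done();
    var docs = [];
    for (var i = start; i < start + BATCH; i++) {
        var token = strNpm.generate();
        var now = moment().format('YYYYMMDD hhmmss');
        docs.push({
            id: token,
            Promotion_strCode: token,
            Promotion_strStatus: "I",
            Promotion_dtmGeneratedDate: now,
            LastModified_dtmStamp: now
            // ...remaining fields as in the question
        });
    }
    db.collection('ClPromoCodeMaster').insertMany(docs, { ordered: false }, function (err) {
        if (err) return done(err);
        insertChunk(start + BATCH, done); // start the next chunk only after this one is acknowledged
    });
}

insertChunk(0, function (err) {
    if (err) throw err;
    console.log("done");
});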
For inserting millions of records at a time, create Node.js child processes with fork and use the MongoDB bulk API.
Child process creation (index.js):
const {fork} = require("child_process");
let counter = 1;
function createProcess(data){
const worker = fork("./dbOperation");
worker.send(data);
worker.on("message", (msg) => {
console.log("Worker Message :",counter, msg);
counter++;
})
}
function bulkSaveUser(records) {
const singleBatchCount = 10000; // Save 10,000 records per hit
const noOfProcess = Math.ceil(records/singleBatchCount);
let data = {};
console.log("No of Process :", noOfProcess);
for(let index = 1; index <= noOfProcess; index++) {
data.startCount = (index == 1) ? index : (((index - 1) * singleBatchCount) + 1);
data.endCount = index * singleBatchCount;
createProcess(data);
}
}
bulkSaveUser(1500000);
DB operation (dbOperation.js):
const MongoClient = require('mongodb').MongoClient;
// Collection Name
const collectionName = "";
// DB Connection String
const connString = "";
process.on("message", (msg) => {
console.log("Initialize Child Process", msg)
const {startCount, endCount} = msg;
inputStudents(startCount, endCount);
});
function initConnection() {
return new Promise(function(r, e) {
MongoClient.connect(connString, function(err, db) {
if (err) e(err)
r(db);
});
});
}
function inputStudents(startCount, endCount) {
let bulkData = [];
for(let index = startCount; index <= endCount; index++ ){
var types = ['exam', 'quiz', 'homework', 'homework'];
let scores = []
// and each class has 4 grades
for (j = 0; j < 4; j++) {
scores.push({'type':types[j],'score':Math.random()*100});
}
// there are 500 different classes that they can take
class_id = Math.floor(Math.random()*501); // get a class id between 0 and 500
record = {'student_id':index, 'scores':scores, 'class_id':class_id};
bulkData.push({ insertOne : { "document" : record } })
}
initConnection()
.then((db) => {
const studentDb = db.db("student");
const collection = studentDb.collection(collectionName);
console.log("Bulk Data :", bulkData.length);
collection.bulkWrite(bulkData, function(err, res) {
if (err) throw err;
//console.log("Connected Successfully",res);
process.send("Saved Successfully");
db.close();
});
})
.catch((err) => { console.log("Err :", err) });
}
Sample project to insert millions of records into MongoDB using a child process fork.
I use the ZeroMQ PUB/SUB pattern.
The system consists of a web server (the publisher), clustered TCP servers (the subscribers), and external applications (clients, which connect to the TCP servers).
A huge number of external clients connect to each TCP server. Every external client has a unique peerId, which I use as the topic on the publisher. For management purposes I send messages to the TCP servers (remove peer, change, etc.). But I also need to send messages from the TCP servers back to the web server (connect, disconnect, error), and I haven't found the right way to do it. Can anybody suggest how to do this correctly?
Update 1
It looks like the ROUTER/DEALER pattern is the most convenient for this.
Some comments about the scripts below.
External clients connect to the TCP servers (a cluster) and send their unique peerId; on the TCP server side, each TCP socket is cached by its peerId. The TCP server then forwards the peerId over its ZeroMQ socket to the web server, which caches the ZeroMQ envelope by peerId. Every n milliseconds the web server sends messages to a random peer (it generates a random peerId). The TCP server receives these messages, looks up the matching TCP socket in its cache, and forwards them to the clients. The clients count the messages they receive and every n milliseconds send the count to their TCP server, which relays it to the web server over the ZeroMQ socket. The web server prints the number of sent and received messages to the console every n milliseconds.
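Before the full test scripts, here is a stripped-down sketch of just the two-way ROUTER/DEALER exchange (the endpoint, identity and frame contents are illustrative): a DEALER on a TCP server can both push events up to the web server and receive management messages addressed to it, which is what PUB/SUB alone does not give you.

var zmq = require('zmq');

// Web server side: the ROUTER receives [envelope, body] and can reply to that envelope.
var router = zmq.socket('router');
router.bind('tcp://127.0.0.1:12345', function (err) {
    if (err) throw err;
});
router.on('message', function (envelope, body) {
    console.log('from TCP server %s: %s', envelope.toString(), body.toString());
    router.send([envelope, 'remove peer']); // routed back to that one DEALER only
});

// TCP server side: the DEALER sends events up and receives management messages down.
var dealer = zmq.socket('dealer');
dealer.identity = 'tcp-server-' + process.pid; // stable identity so the ROUTER can address it
dealer.connect('tcp://127.0.0.1:12345');
dealer.send(['client connected']); // e.g. connect / disconnect / error events
dealer.on('message', function (body) {
    console.log('from web server:', body.toString());
});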
Test js script of server part:
var cluster = require('cluster'),
zmq = require('zmq'),
net = require('net'),
zmqport = 'tcp://127.0.0.1:12345';
var count = 10;
var countPeers = 10000;
var interval = 1;
if (cluster.isMaster) {
for (var i = 0; i < count; i++) cluster.fork({
TCP_SERVER: 1
});
cluster.fork({
WEB_SERVER: 1
});
cluster.on('death', function (worker) {
console.log('worker ' + worker.pid + ' died');
});
} else {
if (process.env.TCP_SERVER) {
var sockets = Object.create(null);
var socket = zmq.socket('dealer');
socket.identity = 'process-' + process.pid;
socket.connect(zmqport);
socket.on('message', function (peerIdBuffer) {
var peerId = peerIdBuffer.toString();
if (typeof sockets[peerId] !== 'undefined') {
var buffer = new Buffer(4);
buffer.writeUInt32BE(1, 0);
sockets[peerId].write(buffer);
}
});
var server = net.createServer(function (tcpsocket) {
tcpsocket.on('data', function (data) {
if (!tcpsocket.peerId) {
var peerId = data.toString();
sockets[peerId] = tcpsocket;
tcpsocket.peerId = peerId;
return socket.send(['id', data]);
}
return socket.send(['count', data]);
});
});
server.listen('13333', '0.0.0.0');
} else {
var countMessagesSended = 0;
var countMessagesReceived = 0;
var socket = zmq.socket('router');
var clients = Object.create(null);
socket.bind(zmqport, function (err) {
if (err) throw err;
setInterval(function () {
for (var i = 0; i < countPeers; i++) {
var topic = Math.floor(Math.random() * countPeers) + '-peer';
if (typeof clients[topic] !== 'undefined') {
countMessagesSended++;
socket.send([clients[topic], topic]);
}
}
}, interval);
});
socket.on('message', function (envelope, messageId, data) {
switch (messageId.toString()) {
case "id":
clients[data.toString()] = envelope.toString();
break;
case "count":
countMessagesReceived += data.readUInt32BE(0);
break;
}
});
setInterval(function () {
console.log('%s messages have been sended, %s - received', countMessagesSended, countMessagesReceived);
countMessagesSended = 0;
countMessagesReceived = 0;
}, 5000);
}
}
Test js script for clients:
var cluster = require('cluster'),
net = require('net');
var count = 10;
if (cluster.isMaster) {
for (var i = 0; i < count; i++) cluster.fork({
CLUSTER: i
});
cluster.on('death', function (worker) {
console.log('worker ' + worker.pid + ' died');
});
} else {
var clientspernode = 1000;
var offset = parseInt(process.env.CLUSTER, 10);
for (var j = (offset) * clientspernode; j < (offset + 1) * clientspernode; j++) {
(function (j) {
var countMessages = 0;
var client = net.connect({
port: 13333,
host: '127.0.0.1'
}, function () {
client.write(j + '-peer');
});
client.on('data', function (buffer) {
countMessages += Math.ceil(buffer.length / 8);
});
client.on('error', function () {
});
setInterval(function () {
var buf = new Buffer(4);
buf.writeUInt32BE(countMessages, 0);
client.write(buf);
countMessages = 0;
}, 5000);
})(j);
}
}
I am trying to save data from an external API into MongoDB using Node.js. The script feels really lightweight to me, but for some reason it's using a lot of RAM (from top):
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
2626 root 20 0 756m 113m 7148 S 6.5 11.4 3:11.74 nodejs
This is what the script does, in pseudocode:
each 5 seconds
fetch 3 JSON lists through an API
for all new items in list
store in mongo
[edit]
The JSON lists are approx. 10 KB each, so I don't think the memory use comes from holding them until the items are processed.
[/edit]
the (light) dependencies are:
querystring
https
underscore
mongodb (the native client)
moment
I wrote it as simple functions; when they return, they should give back all the memory they used, right?
Here is the whole script:
var querystring = require("querystring");
var https = require('https');
var fetch = function(cur, callback) {
cur = cur.toLowerCase().replace('/', '_');
var options = {
host: 'data.fxbtc.com',
path: '/api?op=query_last_trades&count=100&symbol=' + cur,
method: 'GET',
headers: {
'User-Agent': 'Mozilla/4.0 (compatible; node.js client)'
}
};
var req = https.request(options, function(res) {
res.setEncoding('utf8');
var buffer = '';
res.on('data', function(data) {
buffer += data;
});
res.on('end', function() {
try {
var json = JSON.parse(buffer);
} catch (err) {
return callback(err);
}
callback(null, json);
});
});
req.end();
}
var currencies = [
'BTC/CNY',
'LTC/CNY',
'LTC/BTC'
];
var LAST_TRADE = {
'BTC/CNY': 0,
'LTC/CNY': 0,
'LTC/BTC': 0
}
var _ = require('underscore');
var mongo = require('mongodb');
var moment = require('moment');
var init = function(next) {
mongo.connect('mongodb://127.0.0.1:27017/coindata', next);
}
var now = function() {
return moment().format('YYYY-MM-DD HH:mm:ss');
}
console.log(now(), 'STARTING');
setInterval(function() {
console.log(now(), 'alive')
}, 60000)
var collections = {};
var forever = function(err, db) {
if(err) throw err;
_.each(currencies, function(cur, i) {
collections[cur] = db.collection('fxbtc_' + cur);
collections[cur].ensureIndex({fid: 1}, {unique: true}, console.log);
setTimeout(function() {
console.log(now(), 'registering', cur);
setInterval(check(cur), 5 * 1000);
}, i * 1000);
});
}
var check = function(cur) {
return function() {
fetch(cur, function(err, trades) {
if(err) return console.log(now(), 'ERROR-FETCH', err);
trades = _.map(trades.datas, function(trade) {
return {
date: new Date(trade.date * 1000),
price: parseFloat(trade.rate),
amount: parseFloat(trade.vol),
fid: parseInt(trade.ticket)
}
});
trades = _.filter(trades, function(trade) {
return trade.fid > LAST_TRADE[cur];
});
var fids = _.pluck(trades, 'fid');
fids.push(LAST_TRADE[cur]);
LAST_TRADE[cur] = _.max(fids);
if(!trades.length)
return;
console.log(now(), 'storing:', trades.length, 'in', cur);
collections[cur].insert(trades, function(err, docs) {
if(err && err.code !== 11000) console.log(now(), 'ERROR-STORE', err);
});
});
}
}
init(forever);
Are there any obvious memory leaks in this script? How do I go about finding the source of all the used memory?
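One low-effort way to gather evidence before restructuring anything is to log Node's built-in memory counters next to the existing 'alive' heartbeat. A sketch (the one-minute interval and the MB rounding are arbitrary choices):

// Log resident set size and heap usage once a minute.
setInterval(function() {
    var mem = process.memoryUsage();
    console.log(now(), 'memory',
        'rss: ' + Math.round(mem.rss / 1048576) + 'MB',
        'heapUsed: ' + Math.round(mem.heapUsed / 1048576) + 'MB');
}, 60000);

If rss keeps climbing while heapUsed stays flat, the growth is outside the JS heap (buffers, native driver allocations); if heapUsed climbs as well, a heap snapshot will show which objects are being retained.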
The project I am working on polls a lot of different API services (15+) and stores all the latest changes.
My initial thought was to write a small script for each service, with a loop inside that stays up forever. The problem (as described above) is that memory would somehow grow to 40-120 MB per service (depending on a couple of things) and my system would run out of RAM.
This is how I solved it for now:
Instead of keeping a process per service alive, I rewrote each script to run only once and wrote a master script that is responsible for re-running each service's script after x amount of time:
var cp = require('child_process');
var moment = require('moment');
var i = 0;
var watch = function(options) {
i++;
setTimeout(function() {
var fid = 0;
setInterval(function() {
var worker = cp.fork('./process_' + options.exchange + '.js');
worker.send(fid);
worker.once('message', function(new_fid) {
fid = new_fid;
worker.kill();
});
}, options.interval);
}, i * 3000);
}
And then I register all different services like so:
watch({exchange: 'bitcurex', interval: +moment.duration(9, 'minutes')});
It has been running for a little over 10 hours now with little to no memory footprint (I can't even find it in top).