We read an XML file (using xml-stream) with about 500k elements and insert them into MongoDB like this:
xml.on(`endElement: product`, writeDataToDb.bind(this, "product"));
Insert in writeDataToDb(type, obj) looks like this:
collection.insertOne(obj, {w: 1, wtimeout: 15000}).catch((e) => { });
Now when the Mongo connection gets disconnected, the xml stream still reads and the console gets flooded with error messages (can't insert, disconnected, EPIPE broken, ...).
In the docs it says:
When you shut down the mongod process, the driver stops processing operations and keeps buffering them due to bufferMaxEntries being -1 by default meaning buffer all operations.
What does this buffer actually do?
We notice that when we insert data and shut down the mongo server, the operations get buffered. When we bring the mongo server back up, the native driver successfully reconnects and node resumes inserting data, but the documents buffered while mongo was offline do not get inserted.
So I question this buffer and its use.
We are looking for the best way to keep inserts buffered until mongo comes back (within 15000 milliseconds, according to wtimeout) and then insert the buffered documents, or to make use of xml.pause() and xml.resume(), which we tried without success.
Basically we need a little help in how to handle disconnects without data loss or interrupts.

Inserting 500K elements with insertOne() is a very bad idea. You should instead use bulk operations, which allow you to insert many documents in a single request.
(here for example 10000, so it can be done in 50 single requests)
To avoid buffering issue, you can manually handle it:
Disable buffering with bufferMaxEntries: 0
Set reconnect properties: reconnectTries: 30, reconnectInterval: 1000
Create a bulkOperation and feed it with 10000 items
Pause the xml reader. Try to insert the 10000 items. If it fails, retry every 3000ms until it succeeds
You may face some duplicate ID issues if the bulk operation is interrupted during execution, so ignore them (error code: 11000)
here is a sample script :
var fs = require('fs')
var Xml = require('xml-stream')
var MongoClient = require('mongodb').MongoClient
var url = 'mongodb://localhost:27017/test'
MongoClient.connect(url, {
reconnectTries: 30,
reconnectInterval: 1000,
bufferMaxEntries: 0
}, function (err, db) {
if (err != null) {
console.log('connect error: ' + err)
} else {
var collection = db.collection('product')
var bulk = collection.initializeUnorderedBulkOp()
var totalSize = 500001
var size = 0
var fileStream = fs.createReadStream('data.xml')
var xml = new Xml(fileStream)
xml.on('endElement: product', function (product) {
// if we have enough product, save them using bulk insert
if (size % 10000 == 0) {
bulk.execute(function (err, result) {
if (err == null) {
bulk = collection.initializeUnorderedBulkOp()
console.log('doc ' + (size - 10000) + ' : ' + size + ' saved on first try')
} else {
console.log('bulk insert failed: ' + err)
counter = 0
var retryInsert = setInterval(function () {
bulk.execute(function (err, result) {
if (err == null) {
bulk = collection.initializeUnorderedBulkOp()
console.log('doc ' + (size - 10000) + ' : ' + size + ' saved after ' + counter + ' tries')
} else if (err.code === 11000) { // ignore duplicate ID error
bulk = collection.initializeUnorderedBulkOp()
console.log('doc ' + (size - 10000) + ' : ' + size + ' saved after ' + counter + ' tries')
} else {
console.log('failed after first try: ' + counter, 'error: ' + err)
}, 3000) // retry every 3000ms until success
} else if (size === totalSize) {
bulk.execute(function (err, result) {
if (err == null) {
} else {
console.log('bulk insert failed: ' + err)
sample log output:
doc 0 : 10000 saved on first try
doc 10000 : 20000 saved on first try
doc 20000 : 30000 saved on first try
bulk insert failed: MongoError: interrupted at shutdown // mongodb server shutdown
failed after first try: 1 error: MongoError: no connection available for operation and number of stored operation > 0
failed after first try: 2 error: MongoError: no connection available for operation and number of stored operation > 0
failed after first try: 3 error: MongoError: no connection available for operation and number of stored operation > 0
doc 130000 : 140000 saved after 4 tries
doc 140000 : 150000 saved on first try

I don't know specifically about Mongodb driver and this buffer of entries. Maybe it only keeps data in specific scenarios.
So I will answer to this question with a more general approach that can work with any database.
To summarize, you have two problems:
You are not recovering from failed attempts
The XML stream sends data too fast
To handle the first issue, you need to implement a retry algorithm that will ensure that many attempts are made before giving up.
To handle the second issue, you need to implement back pressure on the xml stream. You can do that using the pause method, the resume method and an input buffer.
var Promise = require('bluebird');
var fs = require('fs');
var Xml = require('xml-stream');
var fileStream = fs.createReadStream('myFile.xml');
var xml = new Xml(fileStream);
// simple exponential retry algorithm based on promises
/**
 * Runs a promise-returning task, retrying with exponentially growing delays
 * whenever it rejects.
 *
 * @param {function(): Promise} task - Starts one attempt and returns its promise.
 * @param {number} initialDelay - Delay in milliseconds before the first retry.
 * @param {number} maxDelay - Upper bound in milliseconds for the delay between retries.
 * @param {number} maxRetry - Maximum number of retries after the first attempt.
 * @returns {Promise} Resolves with the first successful result; rejects with the
 *   last error once `maxRetry` retries have failed.
 */
function exponentialRetry(task, initialDelay, maxDelay, maxRetry) {
  let delay = initialDelay;
  let retry = 0;

  // Timer-based wait that only needs the standard Promise constructor, so it
  // works with both native promises and bluebird.
  const wait = function (ms) {
    return new Promise(function (resolve) {
      setTimeout(resolve, ms);
    });
  };

  const attempt = function () {
    return task().catch(function (error) {
      // Count the failed attempt; without this the guard below never fires
      // and a permanently failing task would be retried forever.
      retry += 1;
      if (retry > maxRetry) {
        throw error;
      }
      const pause = wait(delay);
      // Double the delay for the next retry, capped at maxDelay.
      delay = Math.min(delay * 2, maxDelay);
      return pause.then(attempt);
    });
  };

  return attempt();
}
var maxPressure = 100;
var currentPressure = 0;
var suspended = false;
var stopped = false;
var buffer = [];
// handle back pressure by storing incoming tasks in the buffer
// pause the xml stream as soon as we have enough tasks to work on
// resume it when the buffer is empty
/**
 * 'endElement' handler that writes one product to the database while applying
 * back pressure to the xml stream.
 *
 * Relies on the surrounding module-level state: `xml`, `buffer`,
 * `currentPressure`, `maxPressure`, `suspended` and `stopped`, plus the
 * `exponentialRetry` and `writeDataToDb` helpers.
 *
 * @param {Object} product - The parsed element handed over by the xml stream.
 */
function writeXmlDataWithBackPressure(product) {
  // closure used to try to start a task
  const tryStartTask = function () {
    // if we have enough tasks running, pause the xml stream
    if (!stopped && !suspended && currentPressure >= maxPressure) {
      xml.pause();
      suspended = true;
      console.log("stream paused");
    }
    // if we have room to run tasks
    if (currentPressure < maxPressure) {
      if (buffer.length > 0) {
        // if we have a buffered task, start it
        buffer.shift()();
      } else if (!stopped) {
        // if not, resume the xml stream
        try {
          xml.resume();
          suspended = false;
          console.log("stream resumed");
        } catch (e) {
          // the only way to know if you've reached the end of the stream
          // xml.on('end') can be triggered BEFORE all handlers are called
          // probably a bug of xml-stream
          stopped = true;
          console.log("stream end");
        }
      }
    }
  };

  // push the task to the buffer
  buffer.push(function () {
    currentPressure += 1;
    // use exponential retry to ensure we will try this operation 100 times before giving up
    exponentialRetry(function () {
      return writeDataToDb(product);
    }, 100, 2000, 100).catch(function (error) {
      // Retries are exhausted: report the failure instead of leaving an
      // unhandled rejection behind.
      console.log("write failed after retries: " + error);
    }).finally(function () {
      // a task has just finished, let's try to run a new one
      currentPressure -= 1;
      tryStartTask();
    });
  });

  // we've just buffered a task, let's try to run it
  tryStartTask();
}
// write the product to database here :)
function writeDataToDb(product) {
// the following code is here to create random delays and random failures (just for testing)
var timeToWrite = Math.random() * 100;
var failure = Math.random() > 0.5;
return Promise.delay(timeToWrite).then(function() {
if (failure) {
throw new Error();
return null;
xml.on('endElement: product', writeXmlDataWithBackPressure);
Play with it, put some console.log to understand how it behaves.
I hope this will help you to solve your issue :)


Nodejs Mongodb second sort() for first sort() results

format want to appear -
msg - 9
msg - 17
msg - 18
msg - 19
From this function
mongo.connect(mongourl, function (err, db) {
var collection = db.collection('chat')
var stream = collection.find().sort({ _id : -1 }).limit(10).stream();
stream.on('data', function (chatt) {clients[].emit('chat message', chatt.content); });
But this sorting only give me -
msg - 19
msg - 18
msg - 17
msg - 9
Because it is a chat app and latest rows needed to be at bottom for chat format.
And I tried to add new sorting code to result (I am new to node) - and it resorts the whole db again and give earliest 10 rows.
mongo.connect(mongourl, function (err, db) {
var collection = db.collection('chat')
var stream = collection.find().sort({ _id : -1 }).limit(10);
stream=stream.sort({ _id : 1 }).stream();
stream.on('data', function (chatt) {clients[].emit('chat message', chatt.content); });
How can I reverse the result from sort|limit ? Thank you so much.
There are at least two ways to do this. FAIR WARNING: I didn't test this on an ACTUAL MongoDB connection. But this should get the principles across. May the force be with you on your journey with Node!
Using just an array:
mongo.connect(mongourl, function (err, db) {
var collection = db.collection('chat');
// Get the previous 10 initially.
var prev10 = collection.find().sort({ _id : -1 }).limit(10).toArray();
// flush it
let msg = null;
while (msg = prev10.pop()) {
// Remove the last element, which is the first message sent, altering the array in place.
// Send it to the socket
client[].emit('chat_message', msg.content);
Using an array for the initial 10, then stream: THIS IS A LITTLE KLUDGEY because we're attempting to skip what the initial array fetch returned. I'd rather not do this, but hey why not give it a shot?
mongo.connect(mongourl, function (err, db) {
var collection = db.collection('chat');
// Get the previous 10 initially.
var prev10 = collection.find().sort({ _id : -1 }).limit(10).toArray();
// flush it
let msg = null;
let count = 0;
while (msg = prev10.pop()) {
// Remove the last element, which is the first message sent, altering the array in place.
// Send it to the socket
client[].emit('chat_message', msg.content);
// Increment
count ++;
// Set up your stream, skipping the message already retrieved
stream=collection.find({}).sort({ _id : -1 }).skip(count).stream();
stream.on('data', function (chatt) {
// Stream each incoming message individually.
clients[].emit('chat message', chatt.content);
PREFERRED, if you JUST want to use streams and not bother with the initial query: Here, you can return 10, 20, or 100 recent historic messages and it will always adjust to your needs. The delay ensures there are no new records left to gather before the messages are returned in order of appearance, FILO (First-In, Last-Out). MongoDB is essentially turning your FIFO into a FILO because of the way its results are sorted, so you restore the original order by popping elements off the array it returns.
mongo.connect(mongourl, function (err, db) {
var collection = db.collection('chat');
let DATADELAY = 10;
let filoQue = [];
let delayedFlush = null;
// Set up the stream.
var stream = collection.find().sort({ _id : -1 }).limit(HISTORYCOUNT).stream();
// Read data.
stream.on('data', function (chatt) {
// New data show up quickly, so it won't flush...
if (delayedFlush) { clearTimeout(delayedflush); }
// ...but rather fill the FILO queue.
delayedFlush = setTimeout(function() {
// Empty the first 10 in your preferred order 9 -> 19
if (filoQue.length > 0) {
let msg = null;
while (msg = filoQue.pop();) {
// Always remove the last element, which will be the first message sent, altering the array in place.
// Send it to the socket
client[].emit('chat_message', msg.content);
Again, fair warning that I didn't test this on an ACTUAL MongoDB connection. Just tested to make sure the queueing would work for your intents. Let me know if there are bugs and I can edit to help the next.

Best way to migrate GB's of data from MongoDB to Cassandra using nodejs script

I have a big collection in MongoDB. Want to migrate all data by running some business logic nodejs scripts on that data to cassandra. What is the best way to do this ?
I have made a script in which i am getting 5000 documents in a single request from mongo and processing the data and inserting the documents into cassandra. It takes a lot of time after 40-50 iterations. CPU usage shows 100%. is this because of a lot of callbacks happening ? I am new to node js so not able to conclude anything.` var cassandra = require('../models/tracking_cassandra');
var TrackingEvents = require('../models/tracking_mongo_events');
var counter = 0;
var incr = 5000;
var final_counter = 0;
var start_point = function (callback){
TrackingEvents.count(function(err, data){
final_counter = data;
TrackingEvents.getEventsByCounter(counter, function(counter, obj) {
var prevId = obj[0].toObject()._id;
getMessagesFromMongo(prevId, callback);
function getMessagesFromMongo(prevId, callback){
counter = counter + incr;
TrackingEvents.getEventsByCounter(counter, function(counter, obj) {
var nextId = obj[0].toObject()._id;
var start_time = new Date();
TrackingEvents.getEventsBtwIds(prevId, nextId, function ( err, userEvents ) {
if(userEvents.length !== 0){
insert_into_cassandra( userEvents, callback );
console.log('empty data set');
if(counter >= final_counter){
getMessagesFromMongo(nextId, callback);
var insert_into_cassandra = function( events, callback ){
var inserts = 0;
total_documents = total_documents + events.length;
for(var i = 0 ; i< events.length ; i++){
var userEventData = events[i].toObject();
if(typeof userEventData.uid == 'undefined'){
total_nuid ++;
create_cassandra_query( userEventData );
var create_cassandra_query = function ( eventData ) {
delete eventData._id;
delete eventData[0];
delete eventData.appid;
delete eventData.appversion;
var query = "INSERT INTO userwise_events ";
var keys = "(";
var values = "(";
for(var key in eventData){
if(eventData[key] == null || typeof eventData[key] == 'undefined'){
delete eventData[key];
if (eventData.hasOwnProperty(key)) {
keys = keys + key + ', ';
values = values + ':' + key + ', ';
if(key != 'uid' && key!= 'date_time' && key != 'etypeId'){
eventData[key] = String(eventData[key]);
keys = keys.slice(0, -2);
values = values.slice(0, -2);
keys = keys + ")";
values = values + ")";
query = query + keys + " VALUES " + values;
cassandra.trackingCassandraClient.execute(query, eventData, { prepare: true }, function (err, data) {
var start_time = new Date();
start_point(function(res, err){
var end_time = new Date();
var diff = end_time.getTime() - start_time.getTime();
var seconds_diff = diff / 1000;
var totalSec = Math.abs(seconds_diff);
console.log('Total Execution Time : ' + totalSec);
process.on('uncaughtException', function (err) {
console.log('Caught exception: ' + err);
is this because of a lot of callbacks happening ?
There may be no callbacks at all for all I know - it's impossible to tell you what's the problem with your code of which you didn't include even a single line of code.
For such a vague question I can only give you a general advice: make sure you don't have long running for or while loops. And don't ever use a blocking system call anywhere else than on the first tick of the event loop. If you don't know what is the first tick of the event loop then don't use blocking calls at all. Whenever you can, use streams for data - especially if you have lots of it.
A 100% CPU utilization is a bad sign and should never happen for I/O-heavy operation like the one that you are trying to perform. You should easily be able to handle insane amounts of data, especially when you use streams. Having your process max out the CPU for an inherently I/O-bound operation like moving large amounts of data through a network is a sure sign that you're doing something wrong in your code. What exactly is that? That will remain a mystery since you didn't show us even a single line of your code.

nodejs using mongoose error: FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - process out of memory

I have a nodejs program that simply copies a field from one collection to another. I wrote two versions of it: one copies the field naming (a string), the other copies ids (an array of strings). The collection is not large, roughly only 900 forms to be iterated. I can see that it runs and saves some of the forms, but I don't understand why this error occurs as the program continues running:
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - process out of memory
Here is the program:
var mongoose = require('mongoose'),
config = require('../modules/system/node-js/parseConfig'),
schemas = require('../modules/system/node-js/schemas.js'),
cde_schemas = require('../modules/cde/node-js/schemas'),
form_schemas = require('../modules/form/node-js/schemas'),
mongo_cde = require('../modules/cde/node-js/mongo-cde'),
async = require('async');
var mongoUrl = config.mongoUri;
var conn = mongoose.createConnection(mongoUrl);
var DataElement = conn.model('DataElement', cde_schemas.dataElementSchema);
var Form = conn.model('Form', form_schemas.formSchema);
var formCounter = 0;
archived: null
}).exec(function (err, forms) {
if (err) {
async.eachSeries(forms, function (form, doneOneForm) {
console.log("begin " + formCounter + " form id: " + form.tinyId);
var questionCount = 0;
var areYouDone = function () {
if (questionCount === 0) { (err) {
if (err)
else {
console.log('saved form id: ' + form.tinyId);
var formElements = form.formElements;
var getQuestions = function (formElements) {
formElements.forEach(function (fe) {
if (fe.elementType === 'question') {
var cdeTinyId = fe.question.cde.tinyId;
var version = fe.question.cde.version;
DataElement.findOne({tinyId: cdeTinyId, version: version}).exec(function (err, cde) {
if (err) {
console.log('found cde id: ' + cdeTinyId + ' version: ' + version);
if (cde && cde.ids) fe.question.cde.ids = cde.ids;
//if I run this program with this comment instead of above, there is no error, but error happens on the ids which is array of string.
// = cde.naming[0].designation;
else {
console.log("no CDE with id: " + cdeTinyId)
else {
}, function doneAllForms() {
console.log('finished all forms, # form: ' + formCounter);
Without seeing any output from your logging statements, my guess is that you're getting into some kind of infinite recursion. The likely culprit in the code you've shown thus far is the getQuestions(fe.formElements) line.
Either the formElements property refers to itself (or refers to another element in a similar way that creates a circular reference) and possibly the first value is such that fe.elementType !== 'question' so it just keeps calling the function over and over again and none of the forEach()s ever complete.
I suppose a similar thing could also happen if there is no circular references but the link from one set of formElements to the next is long enough to cause problems and causes getQuestions() to be executed at least once for each forEach().
You may want to start with a smaller collection of forms and/or verify that your fe.elementType values and formElements links/references are what they should be.

Node, anything to prevent flooding?

How can I prevent someone from simply doing
while(true){client.emit('i am spammer', true)};
This sure proves to be a problem when someone has the urge to crash my node server!
Like tsrurzl said you need to implement a rate limiter (throttling sockets).
Following code example only works reliably if your socket returns a Buffer (instead of a string). The code example assumes that you will first call addRatingEntry(), and then call evalRating() immediately afterwards. Otherwise you risk a memory leak in the case where evalRating() doesn't get called at all or too late.
// Rate-limiter state. Each entry in `rating` is {timestamp, size}:
// `timestamp` is Date.now() at the moment the entry was recorded and `size`
// is the amount of data it accounts for.
let rating = [];
let limit = 1048576; // limit: maximum number of bytes/characters per interval.
let interval = 1000; // interval: interval in milliseconds.
// Describes a rate limit of 1mb/s

/**
 * Records that `size` bytes/characters have just been received.
 *
 * @param {number} size - Size of the received chunk.
 * @returns {{timestamp: number, size: number}} The entry that was stored.
 */
function addRatingEntry(size) {
  const entry = { timestamp: Date.now(), size };
  rating.push(entry);
  return entry;
}

/**
 * Removes outdated entries, computes the combined size of the remaining ones
 * and compares it with `limit`.
 *
 * The total is in bytes when sizes come from Buffers and in characters when
 * they come from strings; the byte size of a string is variable, so string
 * input is not a reliable measure.
 *
 * @returns {boolean} true if the connection is NOT flooding, false if it
 *   should be disconnected.
 */
function evalRating() {
  const now = Date.now();
  // Keep only the entries recorded within the last `interval` milliseconds.
  rating = rating.filter(function (entry) {
    return now - entry.timestamp < interval;
  });
  // Sum the sizes (not the timestamps) of the entries still in the window.
  const totalSize = rating.reduce(function (sum, entry) {
    return sum + entry.size;
  }, 0);
  return totalSize <= limit;
}
// Assume connection variable already exists and has a readable stream interface
connection.on('data', function (chunk) {
if (evalRating()) {
// Continue processing chunk.
} else {
// Disconnect due to flooding.
You can add extra checks, like checking whether or not the size parameter really is a number etc.
Addendum: Make sure the rating, limit and interval variables are enclosed (in a closure) per connection, and that they don't define a global rate (where each connection manipulates the same rating).
I implemented a little flood function. It is not perfect (see improvements below), but it will disconnect a user when he makes too many requests.
// No more than FLOOD_MAX requests per FLOOD_TIME milliseconds.
const FLOOD_TIME = 10000;
const FLOOD_MAX = 100;

/**
 * Simple per-socket flood protection.
 *
 * `floods` maps a socket id to `{count}`, the number of requests seen from
 * that socket in the current window; `lastFloodClear` is when the window
 * last started.
 */
const flood = {
  floods: {},
  lastFloodClear: new Date(),

  /**
   * Counts one request for `socket` and disconnects it when it goes over
   * FLOOD_MAX requests within the current FLOOD_TIME window.
   *
   * @param {Object} io - The socket.io server; unused here, kept so existing
   *   callers keep working.
   * @param {Object} socket - The socket that issued the request; its `id`
   *   is used as the counter key.
   * @returns {boolean} true if the request may be processed, false if the
   *   socket was disconnected for flooding.
   */
  protect: (io, socket) => {
    // Reset flood protection
    if (Math.abs(new Date() - flood.lastFloodClear) > FLOOD_TIME) {
      flood.floods = {};
      flood.lastFloodClear = new Date();
    }

    if (flood.floods[socket.id] == null) {
      flood.floods[socket.id] = { count: 0 };
    }
    const entry = flood.floods[socket.id];
    entry.count += 1;

    // Disconnect the socket if it went over FLOOD_MAX in FLOOD_TIME
    if (entry.count > FLOOD_MAX) {
      console.log('FLOODPROTECTION ', socket.id);
      socket.disconnect(true);
      return false;
    }
    return true;
  },
};
exports = module.exports = flood;
And then use it like this:
let flood = require('../modules/flood')
// ... init socket io...
socket.on('message', function () {
if(flood.protect(io, socket)){
//do stuff
Improvements would be to add another value next to the count that tracks how often he got disconnected, and then create a ban list and don't let him connect anymore. Also, when a user refreshes the page he gets a new socket.id, so maybe use a unique cookie value here instead of the socket.id.
Here is simple rate-limiter-flexible package example.
const app = require('http').createServer();
const io = require('')(app);
const { RateLimiterMemory } = require('rate-limiter-flexible');
const rateLimiter = new RateLimiterMemory(
points: 5, // 5 points
duration: 1, // per second
io.on('connection', (socket) => {
socket.on('bcast', async (data) => {
try {
await rateLimiter.consume(socket.handshake.address); // consume 1 point per event from IP
socket.emit('news', { 'data': data });
socket.broadcast.emit('news', { 'data': data });
} catch(rejRes) {
// no available points to consume
// emit error or warning message
socket.emit('blocked', { 'retry-ms': rejRes.msBeforeNext });
Read more in official docs

Balancing slow I/O in a fast stream read stream

In node.js I have a read stream that I wish to reformat and write to a database. As the read stream is fast and the write is slow the node.js queue could be overwhelmed as the queue of writes builds up (assume the stream is gb's of data). How do I force the read to wait for the write part of the code so this does not happen without blocking ?
var request = http.get({
host: '',
port: 80,
path: '/children?' + qs.stringify({
geonameId: geonameId,
username: "demo"
}).on('response', function(response) {
var xml = new XmlStream(response, 'utf8');
xml.on('endElement: geoname ', function(input) {
var output = new Object();
output.Name =; =;
output.lng = input.lng;
output._key = input.geonameId;
data.db.document.create(output, data.doc, function(callback){
//this is really slow.
// i do not want to return from here and receive more data until the 'create' above has completed
I just ran into this problem last night, and in my hackathon induced sleep deprived state, here is how I solved it:
I would increment a counter whenever I sent a job out to be processed, and decremented the counter when the operation completed. To keep the outbound traffic from overwhelming the other service, I would pause the stream when there was a certain number of pending outbound requests. The code is very similar to the following.
var instream = fs.createReadStream('./combined.csv');
var outstream = new stream;
var inProcess = 0;
var paused = false;
var rl = readline.createInterface(instream, outstream);
rl.on('line', function(line) {
if(inProcess > 100) {
console.log('pausing input to clear queue');
paused = true;
someService.doSomethingSlow(line, function() {
if(paused && inProcess < 10) {
console.log('resuming stream');
paused = false;
if (err) throw err;
rl.on('end', function() {
Not the most elegant solution, but it worked and allowed me to process the million+ lines without running out of memory or throttling the other service.
My solution simply extends an empty stream.Writable and is fundamentally identical to @Timothy's, but uses events and
doesn't rely on the Streams1 .pause() and .resume() methods (which didn't seem to have any effect on my data pipeline):
var stream = require("stream");
// Number of requests started via makeRequest() that have not completed yet.
let liveRequests = 0;
// Upper bound on concurrent requests before the stream is held back.
const maxLiveRequests = 100;
// True while _write() is withholding its next() callback.
let streamPaused = false;

const requestClient = new stream.Writable();

/**
 * Completion callback handed to makeRequest(): releases one request slot and,
 * if the stream was held back and there is room again, lets it continue.
 */
function requestCompleted() {
  liveRequests -= 1;

  if (streamPaused && liveRequests < maxLiveRequests) {
    streamPaused = false;
    requestClient.emit("resumeStream");
  }
}

/**
 * Writable implementation: starts a request for every incoming item and
 * defers next() while too many requests are in flight. The following item is
 * only read once next() has been called, which is what holds the stream back.
 */
requestClient._write = function (data, enc, next) {
  // Count the request before starting it so a synchronous completion is
  // balanced correctly.
  liveRequests += 1;
  makeRequest(data, requestCompleted);

  if (liveRequests >= maxLiveRequests) {
    streamPaused = true;
    // Hold next() until requestCompleted() signals that there is room again.
    requestClient.once("resumeStream", function resume() {
      next();
    });
  } else {
    next();
  }
};
A counter, liveRequests, keeps track of the number of concurrent requests, and is incremented whenever
makeRequest() is called and decremented when it completes (ie, when requestCompleted()) is called. If a request has
just been made and liveRequests exceeds maxLiveRequests, we pause the stream with streamPaused. If a request
completes, the stream is paused, and liveRequests is now less than maxLiveRequests, we can resume the stream. Since
subsequent data items are read by _write() when its next() callback is called, we can simply defer the latter with
an event-listener on our custom "resumeStream" event, which mimics pausing/resuming.
Now, simply readStream.pipe(requestClient).
Edit: I abstracted this solution, along with automatic batching of input data, in a package.
