node.js process out of memory in http.request loop

I can't figure out why my node.js server runs out of memory. My node.js server makes a remote HTTP request for each HTTP request it receives, so I've tried to replicate the problem with the sample script below, which also runs out of memory.
This only happens if the iteration count in the for loop is very high.
From my point of view, the problem is related to the fact that node.js is queueing the remote HTTP requests. How can I avoid this?
This is the sample script:
(function() {
  var http, i, mypost, post_data;
  http = require('http');
  post_data = 'signature=XXX%7CPSFA%7Cxxxxx_value%7CMyclass%7CMysubclass%7CMxxxxx&schedule=schedule_name_6569&company=XXXX';
  mypost = function(post_data, cb) {
    var post_options, req;
    post_options = {
      host: 'myhost.com',
      port: 8000,
      path: '/set_xxxx',
      method: 'POST',
      headers: {
        'Content-Length': post_data.length
      }
    };
    req = http.request(post_options, function(res) {
      var res_data;
      res.setEncoding('utf-8');
      res_data = '';
      res.on('data', function(chunk) {
        return res_data += chunk;
      });
      return res.on('end', function() {
        return cb();
      });
    });
    req.on('error', function(e) {
      return console.error('TM problem with request: ' + e.message);
    });
    req.write(post_data);
    return req.end(); // end() must be called, not just referenced
  };
  for (i = 1; i <= 1000000; i++) {
    mypost(post_data, function() {});
  }
}).call(this);
$ node -v
v0.4.9
$ node sample.js
FATAL ERROR: CALL_AND_RETRY_2 Allocation failed - process out of memory
Thanks in advance
gulden PT

Constraining the flow of requests into the server
It's possible to prevent overload of the built-in Server and its HTTP/HTTPS variants by setting the maxConnections property on the instance. Setting this property will cause node to stop accept()ing connections and force the operating system to drop requests when the listen() backlog is full and the application is already handling maxConnections requests.
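As a minimal sketch (the port and the limit of 100 here are arbitrary, not recommendations):

var http = require('http');

var server = http.createServer(function (req, res) {
  res.end('ok');
});

// Once 100 connections are open, node stops accept()ing new ones;
// further clients pile up in the kernel's listen() backlog and are
// dropped once that backlog is full.
server.maxConnections = 100;
server.listen(8000);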
Throttling outgoing requests
Sometimes, it's necessary to throttle outgoing requests, as in the example script from the question.
Using node directly or using a generic pool
As the question demonstrates, unchecked use of the node network subsystem directly can result in out-of-memory errors. Something like node-pool makes active pool management attractive, but it doesn't solve the fundamental problem of unconstrained queuing. The reason for this is that node-pool doesn't provide any feedback about the state of the client pool.
UPDATE: As of v1.0.7 node-pool includes a patch inspired by this post to add a boolean return value to acquire(). The code in the following section is no longer necessary and the example with the streams pattern is working code with node-pool.
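The examples that follow all assume a pool object. For reference, a minimal construction with generic-pool's classic (v1/v2-era) API might look like the sketch below; the create/destroy bodies are placeholders, and max: 2 matches the two-client limit used in the examples.

var poolModule = require('generic-pool');

var pool = poolModule.Pool({
  name: 'clients',
  create: function (callback) {
    // Placeholder: build whatever resource the requests need.
    callback(null, { id: Math.random() });
  },
  destroy: function (client) {
    // Placeholder: tear the resource down.
  },
  max: 2 // at most two clients checked out at once
});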
Cracking open the abstraction
As demonstrated by Andrey Sidorov, a solution can be reached by tracking the queue size explicitly and mingling the queuing code with the requesting code:
var useExplicitThrottling = function () {
  var active = 0
  var remaining = 10
  var queueRequests = function () {
    while(active < 2 && --remaining >= 0) {
      active++
      pool.acquire(function (err, client) {
        if (err) {
          console.log("Error acquiring from pool")
          if (--active < 2) queueRequests()
          return
        }
        console.log("Handling request with client " + client)
        setTimeout(function () {
          pool.release(client)
          if(--active < 2) {
            queueRequests()
          }
        }, 1000)
      })
    }
  }
  queueRequests()
  console.log("Finished!")
}
Borrowing the streams pattern
The streams pattern is a solution which is idiomatic in node. Streams have a write operation which returns false when the stream cannot buffer more data. The same pattern can be applied to a pool object with acquire() returning false when the maximum number of clients have been acquired. A drain event is emitted when the number of active clients drops below the maximum. The pool abstraction is closed again and it's possible to omit explicit references to the pool size.
var useStreams = function () {
  var queueRequests = function (remaining) {
    var full = false
    pool.once('drain', function() {
      if (remaining) queueRequests(remaining)
    })
    while(!full && --remaining >= 0) {
      console.log("Sending request...")
      full = !pool.acquire(function (err, client) {
        if (err) {
          console.log("Error acquiring from pool")
          return
        }
        console.log("Handling request with client " + client)
        setTimeout(pool.release, 1000, client)
      })
    }
  }
  queueRequests(10)
  console.log("Finished!")
}
Fibers
An alternative solution can be obtained by providing a blocking abstraction on top of the queue. The fibers module exposes coroutines that are implemented in C++. By using fibers, it's possible to block an execution context without blocking the node event loop. While I find this approach to be quite elegant, it is often overlooked in the node community because of a curious aversion to all things synchronous-looking. Notice that, excluding the callcc utility, the actual loop logic is wonderfully concise.
var Fiber = require('fibers')

/* This is the call-with-current-continuation found in Scheme and other
 * Lisps. It captures the current call context and passes a callback to
 * resume it as an argument to the function. Here, I've modified it to fit
 * JavaScript and node.js paradigms by making it a method on Function
 * objects and using function (err, result) style callbacks.
 */
Function.prototype.callcc = function(context /* args... */) {
  var that = this,
      caller = Fiber.current,
      fiber = Fiber(function () {
        that.apply(context, Array.prototype.slice.call(arguments, 1).concat(
          function (err, result) {
            if (err)
              caller.throwInto(err)
            else
              caller.run(result)
          }
        ))
      })
  process.nextTick(fiber.run.bind(fiber))
  return Fiber.yield()
}
var useFibers = function () {
  var remaining = 10
  while(--remaining >= 0) {
    console.log("Sending request...")
    try {
      var client = pool.acquire.callcc(this)
      console.log("Handling request with client " + client);
      setTimeout(pool.release, 1000, client)
    } catch (x) {
      console.log("Error acquiring from pool")
    }
  }
  console.log("Finished!")
}
Conclusion
There are a number of correct ways to approach the problem. However, for library authors or applications that require a single pool to be shared in many contexts it is best to properly encapsulate the pool. Doing so helps prevent errors and produces cleaner, more modular code. Preventing unconstrained queuing then becomes an evented dance or a coroutine pattern. I hope this answer dispels a lot of FUD and confusion around blocking-style code and asynchronous behavior and encourages you to write code which makes you happy.

Yes, you're trying to queue 1,000,000 requests before even starting them. This version keeps a limited number of requests (100) in flight:
function do_1000000_req(cb)
{
    var num_active = 0;
    var num_finished = 0;
    var num_scheduled = 0;
    function schedule()
    {
        while (num_active < 100 && num_scheduled < 1000000) {
            num_active++;
            num_scheduled++;
            mypost(post_data, function() {
                num_active--;
                num_finished++;
                if (num_finished == 1000000)
                {
                    cb();
                    return;
                } else if (num_scheduled < 1000000)
                    schedule();
            });
        }
    }
    schedule(); // kick off the first batch of 100
}
do_1000000_req(function() {
    console.log('done!');
});

The node-pool module can help you. For more details, see this post (in French): http://blog.touv.fr/2011/08/http-request-loop-in-nodejs.html

Related

How to run asynchronous tasks synchronously?

I'm developing an app with the following node.js stack: Express/Socket.IO + React. In React I have DataTables, in which you can search, and with every keystroke the data gets dynamically updated! :)
I use Socket.IO for data fetching, so on every keystroke the client socket emits some parameters and the server then calls the callback to return data. This works like a charm, but it is not guaranteed that the returned data comes back in the same order the client sent the requests.
To simulate: when I type 'a', the server responds with that same 'a', and so on for every character.
I found the async module for node.js and tried to use its queue to return tasks in the same order it received them. For simplicity I delayed the second incoming task with setTimeout to simulate a slow database query:
Declaration:
const async = require('async');

var queue = async.queue(function(task, callback) {
    if(task.count == 1) {
        setTimeout(function() {
            callback();
        }, 3000);
    } else {
        callback();
    }
}, 10);
Usage:
socket.on('result', function(data, fn) {
    var filter = data.filter;
    if(filter.length === 1) { // TEST SYNCHRONOUSLY
        queue.push({name: filter, count: 1}, function(err) {
            fn(filter);
            // console.log('finished processing slow');
        });
    } else {
        // add some items to the queue
        queue.push({name: filter, count: filter.length}, function(err) {
            fn(data.filter);
            // console.log('finished processing fast');
        });
    }
});
But what I receive in the client console when I search for abc is as follows:
ab -> abc -> a(after 3 sec)
I want it to return it like this: a(after 3sec) -> ab -> abc
My thought is that the queue starts the setTimeout, then moves on, and the setTimeout eventually fires later on the event loop. This results in later search filters being returned earlier than the slow-performing one.
How can I solve this problem?
First a few comments, which might help clear up your understanding of async calls:
Using "timeout" to try and align async calls is a bad idea, that is not the idea about async calls. You will never know how long an async call will take, so you can never set the appropriate timeout.
I believe you are misunderstanding the usage of queue from async library you described. The documentation for the queue can be found here.
Copy pasting the documentation in here, in-case things are changed or down:
Creates a queue object with the specified concurrency. Tasks added to the queue are processed in parallel (up to the concurrency limit). If all workers are in progress, the task is queued until one becomes available. Once a worker completes a task, that task's callback is called.
The above means that the queue simply prioritizes which async task a given worker performs next. The different async tasks can still finish at different times.
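A quick sketch makes this concrete: with concurrency 1 the queue serializes the tasks, so completion order matches submission order, at the cost of every keystroke waiting on the previous one.

const async = require('async');

// With concurrency 1, tasks run strictly one after another.
const serialQueue = async.queue(function (task, callback) {
    setTimeout(function () {
        console.log('finished', task.name);
        callback();
    }, task.delay);
}, 1);

serialQueue.push({ name: 'a', delay: 3000 });
serialQueue.push({ name: 'ab', delay: 10 });
serialQueue.push({ name: 'abc', delay: 10 });
// Prints: a, ab, abc -- in order, but each waits for the previous task.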
Potential solutions
There are a few solutions to your problem, depending on your requirements.
You can only send one async call at a time and wait for the first one to finish before sending the next one
You store the results and only display the results to the user when all calls have finished
You disregard all calls except for the latest async call
In your case I would pick solution 3, as you are searching for something. Why would you care about the results for "a" if the user is already searching for "abc" before the response for "a" arrives?
This can be done by giving each request a sequence number (or timestamp) and only acting on the response carrying the latest one.
SOLUTION:
Server:
exports = module.exports = function(io){
    io.sockets.on('connection', function (socket) {
        socket.on('result', function(data, fn) {
            var filter = data.filter;
            var counter = data.counter;
            if(filter.length === 1 || filter.length === 5) { // TEST SYNCHRONOUSLY
                setTimeout(function() {
                    fn({ filter: filter, counter: counter}); // return to client
                }, 3000);
            } else {
                fn({ filter: filter, counter: counter}); // return to client
            }
        });
    });
}
Client:
export class FilterableDataTable extends Component {
    constructor(props) {
        super(props);
        this.state = {
            endpoint: "http://localhost:3001",
            filters: {},
            counter: 0
        };
        this.onLazyLoad = this.onLazyLoad.bind(this);
    }

    onLazyLoad(event) {
        var offset = event.first;
        if(offset === null) {
            offset = 0;
        }
        var filter = ''; // filter is the search character
        if(event.filters.result2 != undefined) {
            filter = event.filters.result2.value;
        }
        var counter = this.state.counter + 1;
        this.setState({ counter: counter });
        this.socket.emit('result', {
            offset: offset,
            limit: 20,
            filter: filter,
            counter: counter
        }, (returnedData) => { // arrow function keeps `this` bound to the component
            console.log(returnedData);
            // Only use the response belonging to the latest request.
            if(returnedData.counter === this.state.counter) {
                console.log('DATA: ' + JSON.stringify(returnedData));
            }
        });
    }
}
This however still sends unneeded data to the client, which then ignores it. Does anybody have ideas for further optimizing this kind of communication? For example, a method to keep stale data at the server and only send the latest?
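One possible sketch of that optimization, assuming the same socket handler as above: track the highest counter seen per connection and have the server stay silent for stale requests (doSlowQuery is a hypothetical stand-in for the real database call).

exports = module.exports = function (io) {
    io.sockets.on('connection', function (socket) {
        var latestCounter = 0; // highest request counter seen on this socket

        socket.on('result', function (data, fn) {
            latestCounter = Math.max(latestCounter, data.counter);

            doSlowQuery(data.filter, function (results) {
                // A newer request arrived while we were querying: send nothing.
                if (data.counter < latestCounter) return;
                fn({ filter: data.filter, counter: data.counter, results: results });
            });
        });
    });
};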

How to wait in node.js for a variable available on the cloud to have a specific value

I'm sorry if this is a basic question, but I am trying to implement a program in node.js that should wait for the value of a variable, available through a request to a cloud API (photon.variable()), to become 1. This variable should not be requested more than once per second. My first attempt is included in the sample code below. Despite knowing it does not work at all, I think it is useful to show the functionality I would like to implement.
var photondata = 0;
while (photondata < 1)
{
    setTimeout(function () {
        photon.variable("witok", function (err, data) {
            if (!err) {
                console.log("data: ", data.result);
                photondata = data.result;
            }
            else console.log(err);
        });
    }, 1000);
}
Since you couldn't do async stuff in loops before, the traditional approach would be to create a function that adds itself to setTimeout for as long as needed, then calls some other function when it's done. You still need to do this in the browser if not using Babel.
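A minimal sketch of that traditional approach, reusing photon.variable() from the question (waitForWitok and its callback are illustrative names):

function waitForWitok(onDone) {
    photon.variable("witok", function (err, data) {
        if (err) {
            console.log(err);
        } else if (data.result >= 1) {
            return onDone(data.result); // done: hand the value onwards
        }
        // Not there yet (or errored): poll again in one second.
        setTimeout(function () { waitForWitok(onDone); }, 1000);
    });
}

waitForWitok(function (value) {
    console.log("witok reached:", value);
});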
These days, you can stop execution and wait for things to happen when using a generator function (which latest versions of Node now support). There are many libraries that will let you do this and I will advertise ours :)
CL.run(function* () {
    var photondata = 0;
    while (true) {
        yield CL.try(function* () {
            var data = yield photon.variable("witok", CL.cb());
            console.log("data: ", data.result);
            photondata = data.result;
        }, function* (err) {
            console.log(err.message);
        });
        if (photondata >= 1) break;
        yield CL.sleep(1000);
    }
    // do whatever you need here
});

Express.js - while loop before sending response

I'm trying to implement an existing solution in node.js, specifically using the express.js framework. The existing solution works as follows:
server exposes a GET service that clients can connect to
when a client calls the GET service, the client number increments (a global variable) and then the number of clients is checked;
if there are not at least 3 clients connected, the service stays in an endless loop, waiting for other clients to connect
if (or rather, when) the remaining two clients connect, the service responds to everyone that enough clients are connected (a 'true' value).
So what basically happens is: a client connects and the connection stays active (in a loop) until enough clients connect; then and only then is there a response (to all clients at the same time).
Now I'm not expert in these architectures, but from what I think, this is not a correct or good solution. My initial thought was: this must be solved with sockets. However, since the existing solution works like that (it's not written in node.js), I tried to emulate such behaviour:
var number = (function(){
    var count = 0;
    return {
        increase: function() {
            count++;
        },
        get: function(){
            return count;
        }
    };
})();

app.get('/test', function(req, res){
    number.increase();
    while (number.get() < 3) {
        //hold it here, until enough clients connect
    }
    res.json(number.get());
});
Now while I think that this is not a correct solution, I have a couple of questions:
Is there any alternative to solving this issue, besides using sockets?
Why does this "logic" work in C#, but not in express.js? The code above hangs, no other request is processed.
I know node.js is single-threaded, but what if we have a more conventional service that responds immediately, and there are 20 requests all at the same time?
I would probably use an event emitter for this:
var EventEmitter = require('events').EventEmitter;
var emitter = new EventEmitter();

app.get('/', function(req, res) {
  // Increase the number
  number.increase();
  // Get the current value
  var current = number.get();
  // If it's less than 3, wait for the event emitter to trigger.
  if (current < 3) {
    return emitter.once('got3', function() {
      return res.json(number.get());
    });
  }
  // If it's exactly 3, emit the event so we wake up other listeners.
  if (current === 3) {
    emitter.emit('got3');
  }
  // Fall through.
  return res.json(current);
});
I would like to stress that @Plato is correct in stating that browsers may time out when a response takes too long to complete.
EDIT: as an aside, some explanation of the return emitter.once(...) line.
The code above can be rewritten like so:
if (current < 3) {
  emitter.once('got3', function() {
    res.json(number.get());
  });
} else if (current === 3) {
  emitter.emit('got3');
  res.json(number.get());
} else {
  res.json(number.get());
}
But instead of using those if/else statements, I return from the request handler after creating the event listener. Since request handlers are asynchronous, their return value is discarded, so you can return anything (or nothing). As an alternative, I could also have used this:
if (current < 3) {
  emitter.once(...);
  return;
}
if (current === 3) {
  ...etc...
Also, even though you return from the request handler function, the event listener is still referencing the res variable, so the request handler scope is maintained by Node until res.json() in the event listener callback is called.
Your HTTP approach should work.
You are blocking the event loop, so node refuses to do any other work while it is in the while loop.
You're really close; you just need to check every now and then instead of constantly. I do this below with setImmediate(), but setTimeout() would also work:
var number = (function(){
    var count = 0;
    return {
        increase: function() {
            count++;
        },
        get: function(){
            return count;
        }
    };
})();

function waitFor3(callback){
    var n = number.get();
    if(n < 3){
        setImmediate(function(){
            waitFor3(callback)
        })
    } else {
        callback(n)
    }
}

function bump(){
    number.increase();
    console.log('waiting');
    waitFor3(function(){
        console.log('done');
    })
}

setInterval(bump, 2000);

/*
app.get('/test', function(req, res){
    number.increase();
    waitFor3(function(){
        res.json(number.get());
    })
});
*/

Throttle CPU NODE.JS action to allow new calls to be processed

I have an ExpressJS application that accepts a request that results in 1K to 50K fs.link() actions being executed (it might even hit 500K).
The request (a POST) is not held up while this occurs. I immediately fire off a res.send(), which makes the client happy.
But the server then "forks" the job below, which does all the fs.link() calls. These do happen async, but the amount of work (CPU, disk, etc.) means that the ExpressJS service is not very responsive to new requests during this time.
Is there some easy way (other than childProcess) to simulate the forking of a low priority thread that would be doing these file linking?
Job.prototype.runJob = function (next) {
  var self = this;
  var max = this.files.length;
  var count = 0;
  async.each(this.files,
    function (file, step) {
      var src = path.join(self.sourcePath, file.path);
      var base = path.basename(src);
      var dest = path.join(self.root, base);
      fs.link(src, dest, function (err) {
        if (err) {
          // logger.addLog('warn', "fs.link failed for file: %s", err.message, { file: src });
          self.filesMissingList.push(src);
          self.errors = true;
          self.filesMissing++;
        } else {
          self.filesFound++;
        }
        self.batch.update({ tilesCount: ++count, tilesMax: max, done: false });
        step(null);
      });
    },
    function (err) {
      self.batch.update({ tilesCount: count, tilesMax: max, done: true });
      next(null, "FalconView Linking of: " + self.type + " run completed");
    });
}
You could use the webworker-threads module, which is good for spinning CPU-intensive tasks onto other threads. Alternatively, you could abuse cluster, but it's really the wrong tool for the job. (The cluster module is really better for scaling up web services, not for doing intensive tasks.)
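For what it's worth, a minimal webworker-threads sketch (written from memory of its browser-like API, so treat the details as an assumption) looks like this. One caveat: these workers run plain JavaScript without node's built-in modules, so they suit CPU-bound work, not the fs.link() calls themselves.

var Worker = require('webworker-threads').Worker;

// The function body below runs on a separate thread with no node APIs.
var worker = new Worker(function () {
  this.onmessage = function (event) {
    var n = event.data, sum = 0;
    for (var i = 0; i < n; i++) sum += i; // stand-in for CPU-heavy work
    postMessage(sum);
  };
});

worker.onmessage = function (event) {
  console.log('worker result:', event.data);
  worker.terminate();
};

worker.postMessage(1e8);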
You can try to use async.eachLimit instead of async.each. This way you can control how many iterations are in flight at once, leaving the event loop room to service other ExpressJS requests in between.
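As a sketch of that suggestion, here is a trimmed version of the job above using eachLimit; the limit of 16 is an arbitrary starting point to tune, and the per-file error bookkeeping is elided.

var async = require('async');
var fs = require('fs');
var path = require('path');

// At most 16 link() operations are in flight at any moment, which
// leaves the event loop room to service incoming requests in between.
function runJobLimited(files, sourcePath, root, next) {
  async.eachLimit(files, 16, function (file, step) {
    var src = path.join(sourcePath, file.path);
    var dest = path.join(root, path.basename(src));
    fs.link(src, dest, function (err) {
      // Count/log errors per file here, as in the original job.
      step(null);
    });
  }, function () {
    next(null, "Linking run completed");
  });
}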

Bytes sent/received for Node.js HTTP request

Once an HTTP request has been served, I would like to log the number of bytes sent/received.
A simple source for this data is req.connection.bytesRead/.bytesWritten. However, this is problematic for HTTP 1.1 keep-alive connections, as the same socket can be used for multiple requests. I need to log per-request, not per-connection.
The solution must lie on the HTTP side of things, but I see no methods documented for getting the data I need.
What is the proper way to calculate bytes read/written for HTTP requests served by Node.js's http.Server?
Unfortunately, I never found a proper way to do this. I've resorted to some fairly terrible duck punching, but it works for my particular use case. In case anyone else stumbles along with this problem, you can start with this and refine from there.
Module #1: "Extra Events"
All this module does is make the response object emit a finishBeforeSocketDestroy event. Since I needed this event in a few places in my application, I effectively made a separate module just for this duck punch. app.use() it before Module #2.
module.exports = function (req, res, next) {
  var end = res.end;
  res.end = function () {
    res.end = end;
    res.emit('finishBeforeSocketDestroy');
    res.end.apply(this, arguments);
  }
  next();
}
Module #2: "Stats"
This module creates a req.stats object, containing all sorts of useful goodies for tracking bandwidth usage during usage of the connection, and after it is finished.
var pollTime = 1000;

module.exports = function (req, res, next) {
  var pollInterval;

  function pollStats () {
    if (typeof req.stats._lastMeasuredTime === 'object') {
      var secondsSinceLastMeasurement = ((new Date() - req.stats._lastMeasuredTime) / 1000);
      req.stats.averageRate = {
        read: (req.socket.bytesRead - req.stats.bytesRead) / secondsSinceLastMeasurement,
        write: (req.socket.bytesWritten - req.stats.bytesWritten) / secondsSinceLastMeasurement
      };
    }
    req.stats._lastMeasuredTime = new Date();
    req.stats.bytesRead = req.socket.bytesRead;
    req.stats.bytesWritten = req.socket.bytesWritten;
  }

  req.stats = {
    startTime: new Date(),
    endTime: null,
    averageRate: {read: null, write: null},
    bytesRead: req.socket.bytesRead,
    bytesWritten: req.socket.bytesWritten,
    _lastMeasuredTime: new Date()
  };

  pollInterval = setInterval(pollStats, pollTime);

  res.on('finishBeforeSocketDestroy', function () {
    clearInterval(pollInterval);
    pollStats();
    req.stats.endTime = new Date();
  });

  next();
}
Like I said... messy. I'm only posting it because duck punching may be your only option. Also beware that the socket may get re-used for multiple HTTP requests, which could cause you to double-count some bytes if you're not careful.
Just store the traffic value after each response and calculate the difference in the 'finish' or 'end' handler:
// server.onRequest:
...
req._prevBytesWritten = 0;

// response.onFinish/onEnd:
...
responseLen = req.socket.bytesWritten - req._prevBytesWritten;
req._prevBytesWritten = req.socket.bytesWritten;
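A fuller sketch of that idea wired into a bare http.Server; keeping the running totals on the socket (rather than the request) makes keep-alive reuse work out, since each request just takes the difference since the previous one.

var http = require('http');

var server = http.createServer(function (req, res) {
  var socket = req.socket;
  // Running totals survive across keep-alive requests on this socket.
  if (socket._prevBytesWritten === undefined) {
    socket._prevBytesWritten = 0;
    socket._prevBytesRead = 0;
  }

  res.on('finish', function () {
    var written = socket.bytesWritten - socket._prevBytesWritten;
    var read = socket.bytesRead - socket._prevBytesRead;
    socket._prevBytesWritten = socket.bytesWritten;
    socket._prevBytesRead = socket.bytesRead;
    console.log(req.method, req.url, 'read:', read, 'written:', written);
  });

  res.end('ok');
});

server.listen(8000);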
