I am running a node.js script on EC2 to monitor and run a node HTTP server as a child process.
Unfortunately this child server gradually slows down: requests that take 0.2 seconds start dragging out, and after a few days the same requests take over 2 seconds.
As part of debugging this, I implemented a 2 hour restart to kill the child server and start another one. This has no effect! The HTTP server child process is restarted, but it is still slow! Only restarting this parent script makes the child faster.
Why is the HTTP server slowing down, even when killed and restarted?
Environment is 0.4.9 Node.js on EC2 Ubuntu server. Parent script is below.
var http = require('http');
// Watchdog state shared by the functions below. All three names are declared in
// one `var` statement so none of them leaks as an implicit global.
var server, // handle of the spawned HTTP server child; falsy while no child is running
  firstOperated = null, // when the current child was spawned (drives the 2-hour restart)
  lastOperated = null; // when the child last printed a 'SERVER ONLINE' line; null if never
/**
 * Handle one chunk of the child's stdout.
 *
 * A chunk whose first 13 characters are 'SERVER ONLINE' is treated as a
 * heartbeat and only refreshes `lastOperated`; any other chunk is passed to
 * `log`.
 *
 * @param {Buffer|string} str - chunk received from the child's stdout.
 * @returns {*} the new heartbeat Date, or whatever `log` returns.
 */
function operating(str) {
  var isHeartbeat = str.toString().substring(0, 13) == 'SERVER ONLINE';
  if (!isHeartbeat) {
    return log(str);
  }
  return (lastOperated = new Date());
}
/**
 * Print a chunk of child output to the console with trailing newlines removed.
 * Empty chunks are ignored.
 *
 * @param {Buffer|string} str - chunk to print; decoded as UTF-8 when it is a Buffer.
 */
function log(str) {
  var text = str.toString('utf8');
  if (!text.length) {
    return;
  }
  console.log(text.replace(/\n+$/gim, ''));
}
/**
 * Kill the tracked child server if there is one, otherwise spawn a new one.
 *
 * When a child is already tracked it is sent SIGKILL and this call returns
 * without spawning; the child's 'exit' handler then clears `server`, so a
 * later call starts the replacement.
 */
function createServer() {
  if (server) {
    server.kill('SIGKILL');
    console.log('KILLED NON RESPONSIVE SERVER');
    return;
  }

  var childProcess = require('child_process');
  server = childProcess.spawn('node', [__dirname + '/http.js', 80]);
  firstOperated = new Date();

  // stdout carries heartbeats and regular output; stderr is only echoed.
  server.stdout.on('data', operating);
  server.stderr.on('data', log);

  server.on('exit', function onChildExit(code) {
    // Forget the dead child so that the next createServer() call spawns a new one.
    lastOperated = null;
    server = null;
    console.log("SERVER EXITED: " + code);
  });
}
// Spawn the child once at startup.
createServer();

// Watchdog: every 5 seconds decide whether the child has to be cycled.
setInterval(function() {
  // Scheduled restart once the current child is more than 2 hours old.
  var ageMs = new Date() - firstOperated;
  if (ageMs > 1000 * 60 * 60 * 2) {
    return createServer();
  }

  // A heartbeat within the last 5 seconds means the server seems to be operating ok.
  var silenceMs = new Date() - lastOperated;
  if (silenceMs < 5 * 1000) {
    return;
  }

  createServer();
}, 5 * 1000);
If the EC2 instance is a Micro instance and you run at high CPU usage for more than about 15 seconds, you will be throttled (severely). This would explain the symptoms. The solution would be to scale up to a Small instance. (It does not have to be the Node process that consumes the CPU cycles.)
Related
I wrote up a simple load testing script that runs N hits against an HTTP endpoint over M async parallel lanes. Each lane waits for the previous request to finish before starting a new request. The script, for my specific use case, randomly picks a numeric "width" parameter to add to the URL each time. The endpoint returns between 200k and 900k of image data on each request, depending on the width parameter. But my script does not care about this data and simply relies on garbage collection to clean it up.
const fetch = require('node-fetch');
// Range of the random "width" query parameter: MIN_WIDTH inclusive, MAX_WIDTH exclusive.
const MIN_WIDTH = 200;
const MAX_WIDTH = 1600;

// URL template; the `{width}` placeholder is substituted for every request.
const loadTestUrl = 'http://load-testing-server.com/endpoint?width={width}';
/**
 * GET `url`, read the response to the end, and reject on an unsuccessful status.
 *
 * The payload itself is discarded; it is still read in full so the response
 * stream is finished before the next request of the lane starts.
 *
 * @param {string} url - address to request.
 * @returns {Promise<void>} resolves once the whole body has been received.
 * @throws {Error} with the response's status text when `res.ok` is false.
 */
async function fetchAll(url) {
  const res = await fetch(url, {
    method: 'GET'
  });

  // Always drain the body. An unread body keeps the response stream (and the
  // socket behind it) open until it is garbage collected, so under load the
  // open connections pile up.
  await res.arrayBuffer();

  if (!res.ok) {
    throw new Error(res.statusText);
  }
}
/**
 * Run one lane of the load test: `runs` sequential requests, each with a random
 * width in [MIN_WIDTH, MAX_WIDTH), logging the outcome and duration of each.
 * A failed request is logged and does not stop the lane.
 *
 * @param {number} runs - number of requests this lane performs.
 * @param {number} id - lane identifier, used only in the log output.
 * @returns {Promise<void>} resolves after the last request has settled.
 */
async function doSingleRun(runs, id) {
  const lane = `(id = ${id})`;
  const runStart = Date.now();
  console.log(`${lane} - Running ${runs} times...`);

  let attempt = 0;
  while (attempt < runs) {
    const startedAt = Date.now();
    const width = Math.floor(Math.random() * (MAX_WIDTH - MIN_WIDTH)) + MIN_WIDTH;
    const progress = `${attempt + 1}/${runs}`;
    try {
      await fetchAll(loadTestUrl.replace('{width}', `${width}`));
      const elapsed = Date.now() - startedAt;
      console.log(`${lane} - Width ${width} Success. ${progress}. Duration: ${elapsed}`);
    } catch (e) {
      const elapsed = Date.now() - startedAt;
      console.log(`${lane} - Width ${width} Error fetching. ${progress}. Duration: ${elapsed}`, e);
    }
    attempt += 1;
  }

  console.log(`${lane} - Finished run. Duration: ${Date.now() - runStart}`);
}
// Entry point: start all lanes at once and report the total wall-clock time.
(async function () {
  const RUNS = 200;
  const parallelRuns = 10;
  const parallelRunStart = Date.now();

  console.log(`Running ${parallelRuns} parallel runs`);

  // One promise per lane; lane ids are 0..parallelRuns-1.
  const lanes = Array.from({ length: parallelRuns }, (_, laneId) => doSingleRun(RUNS, laneId));
  await Promise.all(lanes);

  console.log(`Finished parallel runs. Duration ${Date.now() - parallelRunStart}`);
})();
When I run this in Node 14.17.3 on my MacBook Pro running MacOS 10.15.7 (Catalina) with even a modest parallel lane number of 3, after about 120 (x 3) hits of the endpoint the following happens in succession:
Console output ceases in the terminal for the script, indicating the script has halted
Other applications such as my browser are unable to make network connections.
Within 1 - 2 mins other applications on my machine begin to slow down and eventually freeze up.
My entire system crashes with a kernel panic and the machine reboots.
panic(cpu 2 caller 0xffffff7f91ba1ad5): userspace watchdog timeout: remoted connection watchdog expired, no updates from remoted monitoring thread in 60 seconds, 30 checkins from thread since monitoring enabled 640 seconds ago after loadservice: com.apple.logd, total successful checkins since load (642 seconds ago): 64, last successful checkin: 10 seconds ago
service: com.apple.WindowServer, total successful checkins since load (610 seconds ago): 60, last successful checkin: 10 seconds ago
I can very easily stop the progression of these symptoms by pressing Ctrl+C in the terminal running my script to force quit it. Everything quickly gets back to normal, and I can repeat the experiment multiple times before allowing it to crash my machine.
I've monitored Activity Monitor during the progression and there is very little (~1%) CPU usage, memory usage reaches up to maybe 60-70mb, though it is pretty evident that the Network activity is peaking during the script's run.
In my search for others with this problem there were only two Stack Overflow articles that came close:
node.js hangs other programs on my mac
Node script causes system freeze when uploading a lot of files
Anyone have any idea why this would happen? It seems very dangerous that a single app/script could so easily bring down a machine without being killed first by the OS.
The problem I am facing is that the project is already programmed with cluster to distribute tasks.
// Cluster bootstrap: the master only forks workers; every worker (else branch)
// builds its own HTTP server around `app` and starts listening.
// `cluster`, `numCPUs`, `http`, `app`, `io`, `port`, `onError` and `onListening`
// are defined outside this snippet.
if (cluster.isMaster) {
// Fork workers.
for (var i = 0; i < numCPUs; i++) {
cluster.fork();
}
// Empty handler: a worker exit is observed but nothing is logged and no
// replacement worker is forked here.
cluster.on('exit', (worker, code, signal) => {
});
} else {
var server = http.createServer(app);
// State created separately in each worker process; of the three, only
// `usernames` is used in the lines shown here.
var usernames = {};
var showusernames = {};
var usersmessages = [];
// Hand the socket object and the username map to the chat controller.
require('../server/controllers/communication/chat.js').chatConfig(io, usernames);
/**
* Listen on provided port, on all network interfaces.
*/
server.listen(port);
server.on('error', onError);
server.on('listening', onListening);
}
I have a basic idea that this code is for distributing tasks among the CPUs and keeping the server alive in case one CPU fails.
The question is:
Everything was working fine until I started using Node to schedule a cron job (which sends an email).
The cron job is now run by all workers simultaneously, so the email is sent once per worker, i.e. as many times as there are CPUs on the server.
Then I worked my way around it by scheduling the job as:
// Register the cron job only in the worker whose cluster id is 1, so that it is
// scheduled by a single process instead of by every worker.
if (cluster.isWorker && cluster.worker.id == 1) {
  cron.schedule('*/1 * * * *', () => {
    //CRON JOB
  });
}
This worked fine on my local system but failed on the staging server, maybe because of the CPUs allocated to this project.
Is there any way to get only the first free worker and assign the task to it?
Now I tried this:
// Attempt to elect "the first worker" through an array of worker ids.
// NOTE(review): the array is created empty right here and only the current
// worker's own id is pushed, so wokerArr[0] is always this worker's id.
var wokerArr = []
wokerArr.push(cluster.worker.id)
// Always true for the reason above, so every worker that runs this code
// schedules the job.
if(cluster.worker.id == wokerArr[0])
cron.schedule('*/1 * * * *', function() {
//CRON JOB
})
I did that using crontab.
I made a separate cron file and scheduled the job with crontab from the command prompt. Thanks for the support.
You can schedule the cron in master process itself. Cron needs to be handled in an idempotent way.
if (cluster.isMaster) {
  // Scheduled in the master only, so the job is registered once no matter how
  // many workers are forked.
  cron.schedule('*/1 * * * *', () => {
    //CRON JOB
  });
  // continue initializing workers here
}
I am a newbie in Nodejs and I try something with cluster on nodejs. But I meet a problem:
- I use example on Nodejs API about Cluster:
var cluster = require('cluster');
var http = require('http');
var numCPUs = require('os').cpus().length;
// Workers serve HTTP on port 8000; the master only forks and watches them.
if (!cluster.isMaster) {
  // Workers can share any TCP connection
  // In this case its a HTTP server
  http.createServer(function(request, response) {
    response.writeHead(200);
    response.end("hello world\n");
  }).listen(8000);
} else {
  // Fork workers: one per entry reported by os.cpus().
  var forked = 0;
  while (forked < numCPUs) {
    cluster.fork();
    forked += 1;
  }

  // Report every worker that exits (no replacement is forked).
  cluster.on('exit', function(worker, code, signal) {
    console.log('worker ' + worker.process.pid + ' died');
  });
}
I run the above code, but when I enter the URL (localhost:8000) in the browser, the browser doesn't receive any response from Node.js (it keeps connecting forever until I kill Node.js). I am, however, seeing the "online" event fire.
How can I get this server to respond to requests?
p/s: I try getting event "exit" to respawn new worker. Sometime when I hit enter on browser, console log worker x die and then respawn new worker. But browser still connecting...
http://i.stack.imgur.com/RHSYY.png
Help me :) and sorry my bad english
I install the newest version of nodejs ( v0.10.24)
I had a similar issue with cluster. Try putting the http server creation part inside a setTimeout call. (for example delay it for 1000 ms) you should observe an improvement.
setTimeout(function(){ http.createServer...... }, 1000);
Besides, you may also try to create your server using jxcore mt-keep to see if it works on the similar case.
$ jx mt-keep servercode.js
Currently, my prod environment for a side project is a git repo, where I pull in some code, manually kill the server with Ctrl-C, and restart it manually.
I realize there are a lot of things wrong with this. For instance, what if a user is still in the middle of doing something important and the process is crunching sensitive data, and I just killed it?!
When I used node v0.4.x there was a nice Cluster module that could restart the server gracefully, when the application is in a quiet state. In v0.6.x the Cluster module is built into node, but it's really, really bare, and doesn't have the graceful restart ability.
Anyone know how I can gracefully restart a nodejs server in v0.6.x?
You can handle POSIX signals in node code.
See in the example code, that will handle SIGINT (Ctrl-C for instance) as a STOP signal for all cluster workers, and SIGUSR2 will just restart all workers
So, issuing kill -SIGUSR2 PID, where PID is node master PID will restart all cluster
module.exports = function(app) {
var cluster = require('cluster');
var numCPUs = require('os').cpus().length;
var workerList = new Array();
var sigkill = false;
if (cluster.isMaster) {
for (var i = 0; i < numCPUs; i++) {
var env = process.env;
var worker = cluster.fork(env);
workerList.push(worker);
}
process.on('SIGUSR2',function(){
console.log("Received SIGUSR2 from system");
console.log("There are " + workerList.length + " workers running");
workerList.forEach(function(worker){
console.log("Sending STOP message to worker PID=" + worker.pid);
worker.send({cmd: "stop"});
});
});
process.on('SIGINT',function(){
sigkill = true;
process.exit();
});
cluster.on('death', function(worker) {
if (sigkill) {
logger.warn("SIGKINT received - not respawning workers");
return;
}
var newWorker = cluster.fork();
console.log('Worker ' + worker.pid + ' died and it will be re-spawned');
removeWorkerFromListByPID(worker.pid);
workerList.push(newWorker);
});
} else {
process.on('message', function(msg) {
if (msg.cmd && msg.cmd == 'stop') {
console.log("Received STOP signal from master");
app.close();
process.exit();
}
});
app.listen(3000);
}
function removeWorkerFromListByPID(pid) {
var counter = -1;
workerList.forEach(function(worker){
++counter;
if (worker.pid === pid) {
workerList.splice(counter, 1);
}
});
}
}
There's a module named Forever.
That can gracefully restart the process. I suppose then you can somehow run several instances with cluster (one on each core) and use Forever to monitor / restart them.
This is just an option I found; I'm open to suggestions!
There's also a module named PM2. It has the ability to stop all processes in a cluster.
I m trying to implement a long polling strategy with node.js
What i want is when a request is made to node.js it will wait maximum 30 seconds for some data to become available. If there is data, it will output it and exit and if there is no data, it will just wait out 30 seconds max, and then exit.
here is the basic code logic i came up with -
var http = require('http');
/**
 * Long-poll handler: answer as soon as data is "available" (simulated with
 * Math.random() > 0.85), otherwise check again one second later, and give up
 * once `counter` exceeds 30.
 *
 * Each call either answers the request or schedules exactly one retry, never
 * both, so polling stops as soon as the response has been sent.
 *
 * @param {Object} req - incoming request, passed through to the next attempt.
 * @param {Object} res - response to write to.
 * @param {number} counter - 1-based number of the current attempt.
 */
var poll_function = function(req, res, counter) {
  var headers = {'Content-Type': 'text/html;charset=utf8'};

  // Out of attempts: answer with the timeout message and stop polling.
  if (counter > 30) {
    res.writeHead(200, headers);
    res.end('Output after 30 seconds!');
    return;
  }

  // Data became available: answer and stop polling.
  var rand = Math.random();
  if (rand > 0.85) {
    res.writeHead(200, headers);
    res.end('Output done because rand: ' + rand + '! in counter: ' + counter);
    return;
  }

  // Nothing yet: try again in one second.
  setTimeout(function() {
    poll_function(req, res, counter + 1);
  }, 1000);
};
// HTTP server on port 8088; every incoming request starts polling at attempt 1.
http.createServer(function handleRequest(req, res) {
  poll_function(req, res, 1);
}).listen(8088);
What I figure is: when a request is made, poll_function is called, which calls itself again after 1 second via a setTimeout within itself. So it should remain asynchronous, meaning it will not block other requests and will provide its output when it is done.
I have used a Math.random() logic here to simulate data availability scenario at various interval.
Now, what i concern is -
1) Will there be any problem with it? - I simply don't wish to deploy it, without being sure it will not strike back!
2) Is it efficient? if not, any suggestion how can i improve it?
Thanks,
Anjan
All Node.js code is non-blocking as long as you don't get stuck in a tight CPU loop (like while(true)) or use a library that has blocking I/O. Putting a setTimeout at the end of a function doesn't make it any more parallel; it just defers some CPU work until a later event.
Here is a simple demo chat server that randomly emits "Hello World" every 0 to 60 seconds to any and all connected clients.
// A simple chat server using long-poll and timeout
var Http = require('http');
// Array of open callbacks listening for a result
var listeners = [];
// Long-poll endpoint on port 8080: each request waits for the next emitted
// event and is answered with "Timeout!" if none arrives within 30 seconds.
Http.createServer(function (req, res) {
  var timeout;

  function onData(data) {
    // Answered by an event: cancel the pending timeout so it cannot fire
    // later on an already finished response.
    clearTimeout(timeout);
    res.end(data);
  }
  listeners.push(onData);

  // Set a timeout of 30 seconds
  timeout = setTimeout(function () {
    // Remove our callback from the listeners array. The index is checked
    // because splice(-1, 1) would remove the last listener instead.
    var index = listeners.indexOf(onData);
    if (index !== -1) {
      listeners.splice(index, 1);
    }
    res.end("Timeout!");
  }, 30000);
}).listen(8080);
console.log("Server listening on 8080");
/**
 * Deliver `data` to every waiting listener and clear the listener list.
 *
 * @param {*} data - payload handed to each listener callback.
 */
function emitEvent(data) {
  // Take the waiting callbacks out of the shared array first, so the list is
  // already empty while they run.
  var pending = listeners.splice(0, listeners.length);
  for (var i = 0, l = pending.length; i < l; i++) {
    pending[i](data);
  }
}
// Simulate random events
/**
 * Emit "Hello World" to the waiting listeners, then re-arm itself to run again
 * after a random delay of 0 to 60 seconds.
 */
function randomEvents() {
  emitEvent("Hello World");
  setTimeout(randomEvents, Math.random() * 60000);
}
// Schedule the first simulated event 0 to 60 seconds from now.
setTimeout(randomEvents, Math.random() * 60000);
This will be quite fast. The only dangerous part is the splice. Splice can be slow if the array gets very large. This could possibly be made more efficient by, instead of closing each connection 30 seconds after it started, closing all the handlers at once every 30 seconds or 30 seconds after the last event. But again, this is unlikely to be the bottleneck, since each of those array items is backed by a real client connection that is probably more expensive.