I have a Grunt task and currently I am utilising AsyncJS to run it. AsyncJS works well, but I feel it could be more powerful if I utilised the NodeJS cluster module to run it. I have checked out Grunt Parallel and Grunt Concurrent, and they are not much different from what I am doing in my Grunt task. Any suggestions on utilising the NodeJS cluster module to speed up task execution?
Currently I am doing it like this:
var queue = async.queue(function(task, cb) {
  // Process task with PhantomJS and then
  cb();
}, require('os').cpus().length);

async.each(htmlPages, function(val, cb) {
  queue.push(val, function() {
    cb();
  });
}, function() {
  console.log('Completed');
  done();
});
How can I make this work with the NodeJS cluster module?
One way to do it is to spawn the number of workers that you want using the cluster module. Then send messages to them when you want to start them working on something.
Below is code that initialises os.cpus().length workers and a queue that sends the work to them. It then pushes everything in htmlPages to that queue, waits for it to finish and then finally kills all the workers.
var os = require('os');
var async = require('async');
var cluster = require('cluster');

if (cluster.isWorker) {
  process.on('message', function(msg) {
    // Do the PhantomJS stuff
    process.send(theResult);
  });
}

if (cluster.isMaster) {
  var workers = os.cpus().map(function () {
    return cluster.fork();
  });

  var queue = async.queue(function (msg, cb) {
    var worker = workers.pop();
    worker.once('message', function (msg) {
      workers.push(worker);
      cb(null, msg);
    });
    worker.send(msg);
  }, workers.length);

  async.each(htmlPages, queue.push.bind(queue), function (err) {
    if (err) { throw err; }
    workers.forEach(function (worker) {
      worker.kill();
    });
    console.log('Completed');
  });
}
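If this runs inside the Grunt task from the question, the queue's completion callback is also the natural place to finish the task. A minimal sketch, assuming done = this.async() and htmlPages come from that task (they are not defined in the snippet above):

// Sketch: finish the Grunt task once every page has been processed.
async.each(htmlPages, queue.push.bind(queue), function (err) {
  if (err) { return done(err); }
  workers.forEach(function (worker) {
    worker.kill();
  });
  console.log('Completed');
  done(); // tell Grunt the async task has finished
});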
Related
I have a process that uses RabbitMQ and NodeJS to do image processing. Because the task is CPU-intensive, I think I have the same issue as the one described here: https://github.com/squaremo/amqp.node/issues/261
I am trying to figure out how to implement the last comment on that issue.
"Yep. NodeJS is single-threaded, and if you use that thread to do something for a long time, nothing else will happen. As #michaelklishin suggests, the known solution for this general problem is using a child process, or the cluster module."
EDIT:
I updated the code below with a sample of how I think I can do this with the amqp-connection-manager module. Right now I use a global variable to hold the actual message so that I can ack it later. I am guessing there is a better way to do this.
//Used to be an example for how to keep the connection thread and the working thread separate
//in order to fix the issue of missing heartbeat intervals due to processing on the same thread
const cluster = require('cluster');
var amqp = require('amqp-connection-manager');
var config = require('./config.json');
var working_queue = "Test_Queue";
//DONT REALLY DO THIS
var rabbit_msg_data;
//******* CLUSTER SETUP *******
// This will spawn off the number of workers for this process.
if (cluster.isMaster) {
  console.log("Master " + process.pid + " is running");

  var worker = cluster.fork();

  cluster.on('exit', (worker, code, signal) => {
    if (signal) {
      console.log("worker was killed by signal: " + signal);
      console.log(worker);
    } else if (code !== 0) {
      console.log("worker exited with error code: " + code);
      console.log(worker);
    } else {
      console.log("Worker " + worker.process.pid + " exited successfully");
      console.log(worker);
      // Not sure if this works this way or if I need to put this worker into variables
    }
  });

  // Testing sending a message back and forth
  // setTimeout(function() {
  //   worker.send("I got a request!");
  // }, 1000);
  //******** RABBIT MQ CONNECTION **********
  // Create a connection manager to rabbitmq
  var connection = amqp.connect(config.rabbit_connections_arr, {json: true, heartbeatIntervalInSeconds: 2});

  connection.on('connect', function() {
    console.log('Connected to rabbitmq');
  });

  connection.on('disconnect', function(params) {
    console.log('Disconnected from rabbitmq:', params.err.stack);
  });

  // Set up a channel listening for messages in the queue.
  var channelWrapper_listening = connection.createChannel({
    setup: function(channel) {
      // `channel` here is a regular amqplib `ConfirmChannel`.
      return Promise.all([
        channel.assertQueue(working_queue, {durable: true}),
        channel.prefetch(1),
        channel.consume(working_queue, function(data) {
          rabbit_msg_data = data;
          worker.send(data.content.toString());
        }, {noAck: false})
      ]);
    }
  });

  worker.on('message', function(msg) {
    // console.log("Worker to Master (ack): ", msg.content.toString());
    console.log("Worker to Master (ack): ", msg);
    // console.log(msg.content.toString());
    channelWrapper_listening.ack(rabbit_msg_data);
  });
} else { // All worker processes (MAIN LOGIC)
  console.log("Worker " + process.pid + " started");

  process.on('message', function(msg) {
    console.log("Master to Worker (working): ", msg);

    // Send the msg back when done working on it.
    setTimeout(function() {
      process.send(msg);
    }, 5000);
  });
}
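One possible way to avoid the global would be to key each in-flight message by its RabbitMQ delivery tag and look it up when the worker replies. The fragments below are only a sketch that slots into the master-side code above; the pending map and the {tag, body} message shape are assumptions for illustration, not part of the original code.

// Sketch: replace the rabbit_msg_data global with a map keyed by delivery tag.
var pending = {};

// Inside the channel setup, instead of storing the message in a global:
channel.consume(working_queue, function(data) {
  pending[data.fields.deliveryTag] = data;  // remember the original message
  worker.send({                             // assumed message shape
    tag: data.fields.deliveryTag,
    body: data.content.toString()
  });
}, {noAck: false});

// When the worker echoes the tag back, ack the matching message:
worker.on('message', function(msg) {
  var original = pending[msg.tag];
  if (original) {
    channelWrapper_listening.ack(original);
    delete pending[msg.tag];
  }
});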
I need a job scheduler that runs a couple of jobs at a specific time interval, so I used the agenda module for this task. What happens is that when the server is restarted the jobs get executed, this repeats 4 times at the regular interval, and after that agenda stops working without throwing any error. I tried cancelling the jobs and stopping agenda whenever the server stops, so that agendaJobs creates a new collection every time the server is restarted, but I still experience the same issue. Since I have multiple node instances running, I need the jobs to run only once per interval, which is why I used agenda. Are there any different schedulers that can do the same, or am I doing something wrong?
var agenda = new Agenda({db: {address: mongoConnectionString}});

var startCron = function(server, callback) {
  agenda.define('job1', function(job, done) {
    job1.execute();
  });

  agenda.define('job2', function(job, done) {
    job2.execute();
  });

  agenda.define('job3', function(job, done) {
    job3.execute();
  });

  agenda.on('ready', function() {
    agenda.every('10 minutes', ['job1', 'job2', 'job3']);
    agenda.start();
    console.log("Cron job setup completed.");
    callback(null, server);
  });
}
// this is my kill process
var killProcess = function() {
  try {
    mongoose.disconnect();
    console.log(arguments);
    console.log("Connection got closed.");
  } catch (e) {
    console.log(e);
  }

  agenda.cancel({}, function(err, numRem) {
    console.log("Cancelling Agenda Jobs", err, numRem);
    agenda.stop(function() {
      console.log("Stopping Agenda");
      setTimeout(function() {
        process.exit(1);
      }, 2000);
    });
  });
};
I am stuck here due to a simple event related issue. Here is the issue:
I have created a cluster using cluster.js and forked server.js from cluster.js.
I have put a timer in cluster.js, and after every 1 min I am triggering an event 'testTimer'. I have used an events file to do it.
I am trying to capture this event 'testTimer' from the child process, using the same events file imported into server.js and doing a .on('testTimer', callback).
However, the events are not captured in any of the processes. I have tried making the event global and assigning the event globally to a symbol, but was unable to get it to work / capture the event.
Here is the code:
cluster.js (child process creator)
...require > events.js...
... create cluster logic...
setInterval(function () {
  evt.emit('testTimer', {tester: 'test'});
  evt.tester();
}, 1000);
server.js (child process)
...require > events.js...
evt.on('testTimer', function (data) {
  console.log('Starting Sync ', data);
});
events.js (common file for events)
var util = require("util");
var EventEmitter = require("events").EventEmitter;

function test () {
  EventEmitter.call(this);
}

test.prototype.tester = function () {
  this.emit('testTimer', {missed: 'this'});
};

util.inherits(test, EventEmitter);

module.exports = test;
EventEmitter instances can't reach beyond the bounds of a process. If you want to communicate between parent and children, use worker.send():
// cluster.js
setInterval(function () {
  for (const id in cluster.workers) {
    cluster.workers[id].send({ type: 'testTimer', data: { tester: 'test' } });
  }
}, 1000);
// server.js
process.on('message', function(message) {
  if (message.type === 'testTimer') {
    console.log('Starting Sync ', message.data);
  }
});
I'm using the node-cron module for scheduling tasks in a Node.js application. I also want to run the application in several processes using the core cluster module.
Running the application in several processes means the scheduled tasks execute in every process (e.g. if a task sends an email, the email would be sent multiple times).
What are the best practices / possible ways of running cron jobs alongside the cluster module? Should I create a separate process which handles only the cron jobs and does not accept any requests? If yes, how can I do that the right way?
If you are using PM2, you can use an environment variable provided by PM2 itself called NODE_APP_INSTANCE, which requires PM2 2.5 or greater.
The NODE_APP_INSTANCE environment variable can be used to tell processes apart; for example, if you want to run a cron job on only one process, you can just do this:
if (process.env.NODE_APP_INSTANCE == 0) {
  // Schedule your cron job here, since this part will be executed for only one process
}
since two processes can never have the same instance number.
More info is in the official PM2 docs here.
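Note that PM2 sets NODE_APP_INSTANCE per process when the app runs in cluster mode. A minimal sketch of such a config, assuming a hypothetical ecosystem.config.js and entry file app.js:

// ecosystem.config.js (hypothetical file and app names)
module.exports = {
  apps: [{
    name: 'my-app',        // assumed app name
    script: './app.js',    // assumed entry point
    instances: 4,          // or 'max' to run one process per CPU
    exec_mode: 'cluster'   // cluster mode, so each process gets its own NODE_APP_INSTANCE
  }]
};

Start it with pm2 start ecosystem.config.js; the process whose NODE_APP_INSTANCE is 0 is then the only one that schedules the cron job.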
After some research I ended up with a "distributed locks using Redis" solution.
There is a Node module for that: node-redis-warlock.
Hope this answer will be useful for someone else.
UPDATE. Minimal sample code:
var Warlock = require('node-redis-warlock'),
    redis = require('redis');

// Establish a redis client
redis = redis.createClient();

// and pass it to warlock
var warlock = new Warlock(redis);

function executeOnce (key, callback) {
  warlock.lock(key, 20000, function(err, unlock) {
    if (err) {
      // Something went wrong and we weren't able to set a lock
      return;
    }

    if (typeof unlock === 'function') {
      setTimeout(function() {
        callback(unlock);
      }, 1000);
    }
  });
}

// Executes callback only once
executeOnce('every-three-hours-lock', function(unlock) {
  // Do here any stuff that should be done only once...
  unlock();
});
UPDATE 2. More detailed example:
const CronJob = require('cron').CronJob;
const Warlock = require('node-redis-warlock');
const redis = require('redis').createClient();
const warlock = new Warlock(redis);
const async = require('async');

function executeOnce (key, callback) {
  warlock.lock(key, 20000, function(err, unlock) {
    if (err) {
      // Something went wrong and we weren't able to set a lock
      return;
    }

    if (typeof unlock === 'function') {
      setTimeout(function() {
        callback(unlock);
      }, 1000);
    }
  });
}

function everyMinuteJobTasks (unlock) {
  async.parallel([
    sendEmailNotifications,
    updateSomething,
    // etc...
  ],
  (err) => {
    if (err) {
      logger.error(err);
    }
    unlock();
  });
}

let everyMinuteJob = new CronJob({
  cronTime: '*/1 * * * *',
  onTick: function () {
    executeOnce('every-minute-lock', everyMinuteJobTasks);
  },
  start: true,
  runOnInit: true
});

/* Actual tasks */
let sendEmailNotifications = function(done) {
  // Do stuff here
  // Call done() when finished or call done(err) if error occurred
};

let updateSomething = function(done) {
  // Do stuff here
  // Call done() when finished or call done(err) if error occurred
};

// etc...
I think you can use the Node cluster module, and there you can write the code you want to execute in the master process only:
const cluster = require('cluster');

if (cluster.isMaster) {
  // Write your code which you want to execute in the master process only
}
This is the Node way to handle it with the cluster module; of course, you can also use a tool like pm2 to handle this.
I actually do not like the Redis approach that is also used in the cron-cluster npm plugin, because I do not want to have a Redis server running on my machine and have to maintain it, too.
I would like to discuss this approach with you:
Pro: we do not need to use redis
Con: cron jobs are always running on the same worker
I use the message passing only for this; if you use it for other things, you will want to include in each message what it is for (the order field below serves that purpose).
if (cluster.isMaster) {
  // Count the machine's CPUs
  var cpuCount = require('os').cpus().length;

  // Create a worker for each CPU
  for (var i = 0; i < cpuCount; i += 1) {
    cluster.fork();
  }

  cluster.on('fork', (worker) => {
    console.log("cluster forking new worker", worker.id);
  });

  // Have a mainWorker that does the cron jobs.
  var mainWorkerId = null;

  cluster.on('listening', (worker, address) => {
    console.log("cluster listening new worker", worker.id);
    if (null === mainWorkerId) {
      console.log("Making worker " + worker.id + " to main worker");
      mainWorkerId = worker.id;
      worker.send({order: "startCron"});
    }
  });

  // Listen for dying workers; if the mainWorker dies, make a new mainWorker
  cluster.on('exit', function (worker, code, signal) {
    console.log('Worker %d died :(', worker.id);
    if (worker.id === mainWorkerId) {
      console.log("Main Worker is dead...");
      mainWorkerId = null;
    }
    console.trace("I am here");
    console.log(worker);
    console.log(code);
    console.log(signal);
    cluster.fork();
  });

  // Code to run if we're in a worker process
} else {
  // other code like setup app and stuff

  var doCron = function() {
    // setup cron jobs...
  };

  // Receive messages from the master process.
  process.on('message', function(msg) {
    console.log('Worker ' + process.pid + ' received message from master.', msg);
    if (msg.order == "startCron") {
      doCron();
    }
  });
}
I also had a problem with the cluster module and finally found a simple way to solve it.
Let the master process execute the cron job.
My project uses Kue to manage jobs. When the cron job runs, I get a list of jobs.
index.js
global.cluster = require('cluster');

if (cluster.isMaster) {
  const cpuCount = require('os').cpus().length;
  for (let i = 0; i < cpuCount; i += 1) {
    cluster.fork();
  }
} else {
  // start your express server here
  require('./server');
}

cluster.on('exit', worker => {
  logger.warn('Worker %d died :(', worker.id);
  cluster.fork();
});
cron.js
const cron = require('cron').CronJob;

const job = new cron('* * * * *', async () => {
  if (cluster.isMaster) {
    console.log('cron trigger');
  }
});

job.start();
Hope this helps.
I have a Grunt task from which I would like to run a Node command. The command does not give any error when I run it, but I was expecting some console output from the task, which I don't seem to be getting at all.
What am I missing in order to run this Node task?
grunt.registerTask('asyncfoo', 'My "asyncfoo" task.', function() {
  // Force task into async mode and grab a handle to the "done" function.
  var done = this.async();

  // Run some sync stuff.
  grunt.log.writeln('Processing task...');
  grunt.util.spawn({ cmd: 'node', args: ['S3ListBuckets.js'] });

  // And some async stuff.
  setTimeout(function() {
    grunt.log.writeln('All done!');
    done();
  }, 1000);
});
If someone else wants to do something similar, here is the code:
module.exports = function(grunt) {
  grunt.registerTask('asyncfoo', 'My "asyncfoo" task.', function() {
    // Force task into async mode and grab a handle to the "done" function.
    var done = this.async();

    // Run some sync stuff.
    grunt.log.writeln('Processing task...');
    grunt.util.spawn({ cmd: 'node', args: ['S3ListBuckets.js'], opts: {stdio: 'inherit'} });
  });
};
List buckets (S3ListBuckets.js):
var fs = require('fs');
var aws = require('aws-sdk');

aws.config.loadFromPath('./grunt-aws.json');

var s3 = new aws.S3();

s3.listBuckets(function (err, data) {
  if (err) {
    console.log("Error:", err);
  } else {
    for (var index in data.Buckets) {
      var bucket = data.Buckets[index];
      console.log("Bucket: ", bucket.Name, ' : ', bucket.CreationDate);
    }
  }
});
The answer https://stackoverflow.com/a/15045126/519995 suggests using the parameter opts: {stdio: 'inherit'} to have the spawned process's output streamed into the parent's output stream.
That same answer also lists other alternatives: listening to the child's data events, or piping the streams as you wish.
Also, using timeouts to wait for async tasks is NOT a good idea. If all you are waiting for is the spawned process, you can use a callback to know when it's done. If you have more complex synchronisation needs, I suggest starting a new StackOverflow question.
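For example, here is a minimal sketch (assuming the same S3ListBuckets.js script from above) that uses the completion callback of grunt.util.spawn instead of a timeout:

module.exports = function(grunt) {
  grunt.registerTask('asyncfoo', 'My "asyncfoo" task.', function() {
    var done = this.async();
    grunt.log.writeln('Processing task...');

    grunt.util.spawn(
      { cmd: 'node', args: ['S3ListBuckets.js'], opts: {stdio: 'inherit'} },
      function(error, result, code) {
        // Called once the child process exits; finish the Grunt task here.
        if (error) {
          grunt.log.error('S3ListBuckets.js exited with code ' + code);
          return done(false);
        }
        grunt.log.writeln('All done!');
        done();
      }
    );
  });
};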