Can someone explain the webworker-thread example? - node.js

var Worker = require('webworker-threads').Worker;
require('http').createServer(function (req,res) {
var fibo = new Worker(function() {
function fibo (n) {
return n > 1 ? fibo(n - 1) + fibo(n - 2) : 1;
}
// which onmessage does this this refer to?
onmessage = function (event) { //reference 1
postMessage(fibo(event.data));
}
});
fibo.onmessage = function (event) { //reference 2
res.end('fib(40) = ' + event.data);
};
fibo.postMessage(40);
}).listen(port);
This is the code found as an example for the webworker class.
I was looking at the API and doen't seem to understand what reference 1 in the above code is referring to. Why does the postMessage(40) hit the inner onmessage function and not the fibo.onmessage function?

The main point to note here is that the onmessage() and postmessage() is used as message bearers between both the main thread and the worker thread. This can be confusing initially. So the flow goes like this
Create a worker thread .
var fibo= new Worker...
This will spawn another JavaScript thread in the node. It can run in parallel in the background and use all the available CPU cores. Otherwise in node due to its single threaded model, taking advantage of multiple CPU cores is not possible (hence worker threads is a good approach for handling CPU-bound tasks in node)
In the worker thread we define
a)how to process the request/work it receives - the onmessage() does this job. It listens for any incoming work request and act on it.
onmessage= function (event) { //reference 1
postMessage(fibo(event.data));
}
b) how to communicate back to the main thread once work is done-
postMessage does this job.
postMessage(fibo(event.data));
In the main thread :-
a. Call the worker thread and give it a task to execute -ie. using postmessage (By now you got the dance)
fibo.postMessage(40);
b. Define listener regarding the action to take once the worker thread finishes it job and responds back. ie. using onmessage.
fibo.onmessage = function (event) { //reference 2
res.end('fib(40) = ' + event.data);
};

try this code:
var port=8080;
var Worker = require('webworker-threads').Worker;
var fibo = new Worker(function() {
function fibo (n) {
return n > 1 ? fibo(n - 1) + fibo(n - 2) : 1;
}
// which onmessage does this this refer to?
onmessage = function (event) { //reference 1
console.log("fibo.onmessage inside");
postMessage(fibo(event.data));
}
});
fibo.onmessage = function (event) { //reference 2
console.log("fibo.onmessage outside")
console.log('fib(40) = ' + event.data);
};
fibo.postMessage(40);
it gives
fibo.onmessage inside
fibo.onmessage outside
fib(40) = 165580141

The key thing is, it has to run on another node, so this implies that at some point, its doing (not even close but still applies)
class worker {
constructor(fnToRun){
child_process.spawn('thread-creator-process', [
'-fn', escape(fnToRun.toString())
]);
}
}
so now the other dude could do something like
var result = new Function('context',
'with(context){'+
' ' + fnAsString +
'}'
result(thisThreadContext);
now context has the onmessage reference it will be created, and it can already have a reference to the postMessage and those things, with a little bit of parsing you have a new thread to work with
at least i think it that way

Related

bull queue block job while got job on different function

I need an approach to block worker to process a job while I called getJob on different function. I've looked around but couldn't find a solution for that.
I have following setup.
In nodeJS with express, I have worker node.
Job created with delayed state.
Job is being accessed in different function
async function jobReader(id) {
const job = await queue.getJob(id);
/* do some stuff */
await job.remove();
}
Worker node that independently processes jobs. Job will be only processed if the delayed time is finishes.
queue.process(async (job) => {
/* do some stuff */
})
queue.getJob(id) doesn't block the worker to process the job. So there's race on worker processing the job and jobReader processing the job. I am writing some result to DB according to job status. So the race condition is not acceptable.
Apparently, getJob is not blocking the worker to process the job. Is there any way to lock or block to worker work on the job, if the job is read by some other function with getJob function.
Any help or documentation will be appreciated.
Thanks
I guess you should change your architecture a little. Worker Node does exactly what it is intended for, it takes jobs and runs them. So instead of blocking the queue in some way, you should only add the job to the queue when the user approved/canceled/failed it (or did not sent a response after 120 seconds).
If I understood you right, this should give you an idea how to have control over jobs between different requests:
// this is YOUR queueu object. I don't now implentation but think
// of it like this..
const queue = new Queue()
// a variable holding the pending jobs which are not timeouted
// or explicitly approved/canceled/failed by user
const waitingJobs = {
}
// This could be your location where the user calls the api for creating a job.
app.post('/job', (req, res) => {
// create the job as the user requested it
const job = createJob(req)
// Add a timeout for 120 seconds into your waitingJobs array.
// So if the user does not respond after that time, the job will
// be added to queue! .
const timeout = setTimeout(() => {
queue.add(job)
// remove the reference after adding, garbage collection..
waitingJobs[job.id] = null
// job is added to queue automatically after 120 seconds
}, 120 * 1000)
// store the timeout in the job object!
job.timeout = timeout
// store the waiting job!
waitingJobs[job.id] = job
// respond to user, send back id so client can do another
// request if wanted.
req.status(200).json({ message: 'Job created!', id: job.id })
})
app.post('/job/:id', (req, res) => {
const id = req.params.id
if (!id) {
req.status(400).json('bad job id provided')
return
}
// get the queued job:
const job = waitingJobs[id]
if (!job) {
req.status(400).json('Job nod found OR job already processed. Job id: ' + id)
return
}
// now the user responded to a specific job, clean the
// timeout first, so it won't be added to queue!
if (job.timeout) {
clearTimeout(job.timeout)
}
// Now the job won't be processed somewhere else!
// you can do whatever you want...
// example:
// get the action
const action = req.query.action
if(!action) {
res.status(400).json('Bad action provided: ' + action)
return
}
if(action === 'APPROVE') {
// job approved! , add it to queue so worker node
// can process it..
queue.add(job)
}
if(action === 'CANCEL') {
// do something else...
}
/// etc..
// ofc clear the job reference after you did something..
waitingJobs[job.id] = null
// since everything worked, inform user the job will now be processed!
res.status(200).json('Job ' + job.id + 'Will now be processed')
})

async.queue concurrent tasks

I am using async.queue to ensure that certain file copies in a service happen at most n concurrently, but watching the files copy sometimes I see a lot more than what the queue allows. Does anyone see something I may have missed in the below implementation?
createQueue(limit: number) {
let self = this;
return async.queue(function(cmdObj, callback) {
console.log("Beginning copy");
let cmd = cmdObj.cmd;
let args = cmdObj.args;
let request = cmdObj.req;
request.state = State.IN_PROGRESS;
self.reportStatus(request.destination);
const proc = spawn(cmd, args); //uses an rsync command upstream
proc.on("close", code => {
if (code !== 0) {
request.state = State.ERRORED;
self.reportStatus(request.destination); // these just report to the caller
statusMap.delete(request.destination);
} else {
fs.rename(request.destination + ".part", request.destination);
request.state = State.COMPLETED;
self.reportStatus(request.destination); // same here
statusMap.delete(request.destination);
}
callback();
});
proc.on("error", err => {
console.error("COPY ERR: " + err);
});
}, limit); // limit here, for example, may be two, but I see four copies concurrently
}
EDIT:
I now believe this is a side effect of the rest of the system...queues being cleared and reinitialized AFTER copies have started...so when new items are added to the reinitialized queues, they kick off immediately, as the system has no idea if something has been handed off to userland and is currently running.
So, this was user error...PEBCAK! Posting the solution more as a cautionary tale:
The queues above were working as designed, but I had an endpoint for the calling server to clear the queues as necessary; the problem was i was using kill() and re-initializing the queues, losing all track of any jobs in progress and their callbacks. As soon as a new item hit the fresh queue, it would think nothing was happening and spawn a new copy process. I resolved by using remove to clear the queues instead of re-initializing.

child_process spawn Race condition possibility in nodejs

I'm starting to learn and use node and I like it but I'm not really sure how certain features work. Maybe you can help me resolve one such issue:
I want to spawn local scripts and programs from my node server upon rest commands. looking at the fs library I saw the example below of how to spawn a child process and add some pipes/event handlers on it.
var spawn = require('child_process').spawn,
ps = spawn('ps', ['ax']),
grep = spawn('grep', ['ssh']);
ps.stdout.on('data', function (data) {
grep.stdin.write(data);
});
ps.stderr.on('data', function (data) {
console.log('ps stderr: ' + data);
});
ps.on('close', function (code) {
if (code !== 0) {
console.log('ps process exited with code ' + code);
}
grep.stdin.end();
});
grep.stdout.on('data', function (data) {
console.log('' + data);
});
grep.stderr.on('data', function (data) {
console.log('grep stderr: ' + data);
});
grep.on('close', function (code) {
if (code !== 0) {
console.log('grep process exited with code ' + code);
}
});
What's weird to me is that I don't understand how I can be guaranteed that the event handler code will be registered before the program starts to run. It's not like there's a 'resume' function that you run to start up the child. Isn't this a race condition? Granted the condition would be minisculy small and would almost never hit because its such a short snipping of code afterward but still, if it is I'd rather not code it this way out of good habits.
So:
1) if it's not a race condition why?
2) if it is a race condition how could I write it the right way?
Thanks for your time!
Given the slight conflict and ambiguity in the accepted answer's comments, the sample and output below tells me two things:
The child process (referring to the node object returned by spawn) emits no events even though the real underlying process is live / executing.
The pipes for the IPC are setup before the child process is executed.
Both are obvious. The conflict is w.r.t. interpretation of the OP's question:-
Actually 'yes', this is the epitome of a data race condition if one needs to consider the real child process's side effects. But 'no', there's no data race as far as IPC pipe plumbing is concerned. The data is written to a buffer and retrieved as a (bigger) blob as and when (as already well described) the context completes allowing the event loop to continue.
The first data event seen below pushes not 1 but 5 chunks written to stdout by the child process whilst we were blocking.. thus nothing is lost.
sample:
let t = () => (new Date()).toTimeString().split(' ')[0]
let p = new Promise(function (resolve, reject) {
console.log(`[${t()}|info] spawning`);
let cp = spawn('bash', ['-c', 'for x in `seq 1 1 10`; do printf "$x\n"; sleep 1; done']);
let resolved = false;
if (cp === undefined)
reject();
cp.on('error', (err) => {
console.log(`error: ${err}`);
reject(err);
});
cp.stdout.on('data', (data) => {
if (!resolved) {
console.log(`[${t()}|info] spawn succeeded`);
resolved = true;
resolve();
}
process.stdout.write(`[${t()}|data] ${data}`);
});
let ts = parseInt(Date.now() / 1000);
while (parseInt(Date.now() / 1000) - ts < 5) {
// waste some cycles in the current context
ts--; ts++;
}
console.log(`[${t()}|info] synchronous time wasted`);
});
Promise.resolve(p);
output:
[18:54:18|info] spawning
[18:54:23|info] synchronous time wasted
[18:54:23|info] spawn succeeded
[18:54:23|data] 1
2
3
4
5
[18:54:23|data] 6
[18:54:24|data] 7
[18:54:25|data] 8
[18:54:26|data] 9
[18:54:27|data] 10
It is not a race condition. Node.js is single threaded and handles events on a first come first serve basis. New events are put at the end of the event loop. Node will execute your code in a synchronous manner, part of which will involve setting up event emitters. When these event emitters emit events, they will be put to the end of the queue, and will not be handled until Node finishes executing whatever piece of code its currently working on, which happens to be the same code that registers the listener. Therefore, the listener will always be registered before the event is handled.

How to make node works concurrently?

Node.js is famous for concurrency, however, I'm confused by how to make it work concurrently. I started two requests from Chrome one by one very quickly, and I Expected the outputs in console should be:
"get a new request"
immediately after my second request, "get a new request" should be printed
after several seconds, "end the new request"
after several seconds, "end the new request"
However, what I saw is:
"get a new request"
after several seconds, "end the new request"
"get a new request"
after several seconds, end the new request
That means the second request is NOT handled until the first one is done. Below is my sample code, anything I missed?
var http = require("http");
var url = require("url");
function start(route) {
http.createServer(function(request, response) {
console.log('get a new request');
// a time consuming loop
for (var i=0; i<10000000000; ++i) {
}
route(url.parse(request.url).pathname);
response.writeHead(200, {"Content-Type": "text/plain"});
response.end();
console.log('end the new request');
}).listen(5858);
}
function saySomething(something) {
console.log(something);
}
exports.start = start;
exports.saySomething = saySomething;
You don't have to do anything.
It's based on non blocking I/O. Put simply, there is an event loop. A certain set of sync code are run, once done, the next iteration is run that picks up the next set of sync code to run. Anytime an async op is run (db fetch, setTimeout, reading a file, etc) the next tick of the event loop is run. This way there is never any code just waiting.
It's not threaded. In your example, the for loop is in one continuous chunk of code, so js will run the entire for loop before it can handle another http request.
Try putting a setTimeout around the for loop so that node can switch to the next event loop and in your case handle a web request.
node can't handle these:
for (var i=0; i<10000000000; ++i) {}
concurrently. But it handles IO concurrently
You might want to look at Clusters:
http://nodejs.org/api/cluster.html#cluster_how_it_works
http://rowanmanning.com/posts/node-cluster-and-express/
T̶h̶i̶s̶ ̶i̶s̶ ̶t̶h̶e̶ ̶e̶x̶p̶e̶c̶t̶e̶d̶ ̶b̶e̶h̶a̶v̶i̶o̶r̶,̶ ̶w̶e̶ ̶c̶a̶l̶l̶ ̶t̶h̶i̶s̶ ̶̶b̶l̶o̶c̶k̶i̶n̶g̶̶.̶ ̶T̶h̶e̶ ̶s̶o̶l̶u̶t̶i̶o̶n̶ ̶f̶o̶r̶ ̶h̶a̶n̶d̶l̶i̶n̶g̶ ̶c̶o̶n̶c̶u̶r̶r̶e̶n̶t̶ ̶r̶e̶q̶u̶e̶s̶t̶ ̶i̶s̶ ̶m̶a̶k̶i̶n̶g̶ ̶t̶h̶e̶ ̶c̶o̶d̶e̶ ̶̶n̶o̶n̶-̶b̶l̶o̶c̶k̶i̶n̶g̶̶.̶ ̶A̶s̶ ̶s̶o̶o̶n̶ ̶a̶s̶ ̶y̶o̶u̶ ̶c̶a̶l̶l̶e̶d̶ ̶̶r̶e̶s̶p̶o̶n̶s̶e̶.̶w̶r̶i̶t̶e̶H̶e̶a̶d̶̶ ̶t̶h̶e̶ ̶c̶o̶d̶e̶ ̶b̶e̶g̶a̶n̶ ̶b̶l̶o̶c̶k̶i̶n̶g̶ ̶w̶a̶i̶t̶i̶n̶g̶ ̶f̶o̶r̶ ̶̶r̶e̶s̶p̶o̶n̶s̶e̶.̶e̶n̶d̶̶.
EDIT 7/8/14:
Had to deal with this problem recently and found out you can use threads for this:
https://www.npmjs.org/package/webworker-threads
Webworker-threads provides an asynchronous API for CPU-bound tasks that's missing in Node.js:
var Worker = require('webworker-threads').Worker;
require('http').createServer(function (req,res) {
var fibo = new Worker(function() {
function fibo (n) {
return n > 1 ? fibo(n - 1) + fibo(n - 2) : 1;
}
this.onmessage = function (event) {
postMessage(fibo(event.data));
}
});
fibo.onmessage = function (event) {
res.end('fib(40) = ' + event.data);
};
fibo.postMessage(40);
}).listen(port);
And it won't block the event loop because for each request, the fibo worker will run in parallel in a separate background thread.

Best way to execute parallel processing in Node.js

I'm trying to write a small node application that will search through and parse a large number of files on the file system.
In order to speed up the search, we are attempting to use some sort of map reduce. The plan would be the following simplified scenario:
Web request comes in with a search query
3 processes are started that each get assigned 1000 (different) files
once a process completes, it would 'return' it's results back to the main thread
once all processes complete, the main thread would continue by returning the combined result as a JSON result
The questions I have with this are:
Is this doable in Node?
What is the recommended way of doing it?
I've been fiddling, but come no further then following example using Process:
initiator:
function Worker() {
return child_process.fork("myProcess.js");
}
for(var i = 0; i < require('os').cpus().length; i++){
var process = new Worker();
process.send(workItems.slice(i * itemsPerProcess, (i+1) * itemsPerProcess));
}
myProcess.js
process.on('message', function(msg) {
var valuesToReturn = [];
// Do file reading here
//How would I return valuesToReturn?
process.exit(0);
}
Few sidenotes:
I'm aware the number of processes should be dependent of the number of CPU's on the server
I'm also aware of speed restrictions in a file system. Consider it a proof of concept before we move this to a database or Lucene instance :-)
Should be doable. As a simple example:
// parent.js
var child_process = require('child_process');
var numchild = require('os').cpus().length;
var done = 0;
for (var i = 0; i < numchild; i++){
var child = child_process.fork('./child');
child.send((i + 1) * 1000);
child.on('message', function(message) {
console.log('[parent] received message from child:', message);
done++;
if (done === numchild) {
console.log('[parent] received all results');
...
}
});
}
// child.js
process.on('message', function(message) {
console.log('[child] received message from server:', message);
setTimeout(function() {
process.send({
child : process.pid,
result : message + 1
});
process.disconnect();
}, (0.5 + Math.random()) * 5000);
});
So the parent process spawns an X number of child processes and passes them a message. It also installs an event handler to listen for any messages sent back from the child (with the result, for instance).
The child process waits for messages from the parent, and starts processing (in this case, it just starts a timer with a random timeout to simulate some work being done). Once it's done, it sends the result back to the parent process and uses process.disconnect() to disconnect itself from the parent (basically stopping the child process).
The parent process keeps track of the number of child processes started, and the number of them that have sent back a result. When those numbers are equal, the parent received all results from the child processes so it can combine all results and return the JSON result.
For a distributed problem like this, I've used zmq and it has worked really well. I'll give you a similar problem that I ran into, and attempted to solve via processes (but failed.) and then turned towards zmq.
Using bcrypt, or an expensive hashing algorith, is wise, but it blocks the node process for around 0.5 seconds. We had to offload this to a different server, and as a quick fix, I used essentially exactly what you did. Run a child process and send messages to it and get it to
respond. The only issue we found is for whatever reason our child process would pin an entire core when it was doing absolutely no work.(I still haven't figured out why this happened, we ran a trace and it appeared that epoll was failing on stdout/stdin streams. It would also only happen on our Linux boxes and would work fine on OSX.)
edit:
The pinning of the core was fixed in https://github.com/joyent/libuv/commit/12210fe and was related to https://github.com/joyent/node/issues/5504, so if you run into the issue and you're using centos + kernel v2.6.32: update node, or update your kernel!
Regardless of the issues I had with child_process.fork(), here's a nifty pattern I always use
client:
var child_process = require('child_process');
function FileParser() {
this.__callbackById = [];
this.__callbackIdIncrement = 0;
this.__process = child_process.fork('./child');
this.__process.on('message', this.handleMessage.bind(this));
}
FileParser.prototype.handleMessage = function handleMessage(message) {
var error = message.error;
var result = message.result;
var callbackId = message.callbackId;
var callback = this.__callbackById[callbackId];
if (! callback) {
return;
}
callback(error, result);
delete this.__callbackById[callbackId];
};
FileParser.prototype.parse = function parse(data, callback) {
this.__callbackIdIncrement = (this.__callbackIdIncrement + 1) % 10000000;
this.__callbackById[this.__callbackIdIncrement] = callback;
this.__process.send({
data: data, // optionally you could pass in the path of the file, and open it in the child process.
callbackId: this.__callbackIdIncrement
});
};
module.exports = FileParser;
child process:
process.on('message', function(message) {
var callbackId = message.callbackId;
var data = message.data;
function respond(error, response) {
process.send({
callbackId: callbackId,
error: error,
result: response
});
}
// parse data..
respond(undefined, "computed data");
});
We also need a pattern to synchronize the different processes, when each process finishes its task, it will respond to us, and we'll increment a count for each process that finishes, and then call the callback of the Semaphore when we've hit the count we want.
function Semaphore(wait, callback) {
this.callback = callback;
this.wait = wait;
this.counted = 0;
}
Semaphore.prototype.signal = function signal() {
this.counted++;
if (this.counted >= this.wait) {
this.callback();
}
}
module.exports = Semaphore;
here's a use case that ties all the above patterns together:
var FileParser = require('./FileParser');
var Semaphore = require('./Semaphore');
var arrFileParsers = [];
for(var i = 0; i < require('os').cpus().length; i++){
var fileParser = new FileParser();
arrFileParsers.push(fileParser);
}
function getFiles() {
return ["file", "file"];
}
var arrResults = [];
function onAllFilesParsed() {
console.log('all results completed', JSON.stringify(arrResults));
}
var lock = new Semaphore(arrFileParsers.length, onAllFilesParsed);
arrFileParsers.forEach(function(fileParser) {
var arrFiles = getFiles(); // you need to decide how to split the files into 1k chunks
fileParser.parse(arrFiles, function (error, result) {
arrResults.push(result);
lock.signal();
});
});
Eventually I used http://zguide.zeromq.org/page:all#The-Load-Balancing-Pattern, where the client was using the nodejs zmq client, and the workers/broker were written in C. This allowed us to scale this across multiple machines, instead of just a local machine with sub processes.

Resources