node.io: when the job is done, do it again

I'm building a scraper with node.io.
The page I want to scrape has new content every minute, so I would like to run my job again and again, once a minute.
(OK, I could do that with a bash script, but I would like to stay in JavaScript.)
This is a basic job:
var nodeio = require('node.io'), options = {timeout: 10};

exports.job = new nodeio.Job(options, {
    input: ['hello', 'foobar', 'weather'],
    run: function (keyword) {
        this.getHtml('http://www.google.com/search?q=' + encodeURIComponent(keyword), function (err, $) {
            var results = $('#resultStats').text.toLowerCase();
            this.emit(keyword + ' has ' + results);
        });
    }
});
How could I do that? I'm a beginner in node.js; I tried wrapping the job in setInterval, without success.

Try this (run it with "node myfile.js" instead of "node.io myfile.js"):
var nodeio = require('node.io'), options = {timeout: 10};

var job = {
    input: ['hello', 'foobar', 'weather'],
    run: function (keyword) {
        this.getHtml('http://www.google.com/search?q=' + encodeURIComponent(keyword), function (err, $) {
            var results = 'test'; //$('#resultStats').text.toLowerCase();
            this.emit(keyword + ' has ' + results);
        });
    }
};

setInterval(function () {
    nodeio.start(new nodeio.Job(options, job), options, function () {});
}, 5000);
The problem you were running into is the following block of code in node.io, which exits the process when you don't provide a callback for the job:
// Default behaviour is to exit once the job is complete
callback = callback || function (err) {
    if (err) {
        utils.status.error(err);
    }
    process.exit();
};
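Note that setInterval fires even if the previous run is still in progress. If you want a one-minute gap between consecutive runs instead, you can restart the job from the completion callback. A minimal sketch, reusing the options and job objects above:

// Re-run the job one minute after the previous run completes.
// Unlike a fixed setInterval, this avoids overlapping runs.
function runForever() {
    nodeio.start(new nodeio.Job(options, job), options, function (err) {
        if (err) console.error(err);
        setTimeout(runForever, 60 * 1000); // wait a minute, then go again
    });
}

runForever();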

Related

gulp task hangs after job done in nodejs

I defined a simple gulp task that can be run in the terminal with gulp load. The purpose is to get the number of users that do not have the location property set.
However, after the number is returned successfully, the process hangs in the terminal and I need to ctrl-c to stop it. Note: this is not an async call, and it uses a mongoose plug-in to access the DB.
gulp.task('load', function () {
    dbCall.getUserNumberWithoutLocation();
});

var getUserNumberWithoutLocation = function () {
    var query = User.find({ 'location': null });
    query.exec(function (err, users) {
        for (var i = 0; i < users.length; i++) {
            console.log(users[i].location);
        }
        console.log(users.length);
    });
};
Although the prompt still does not return after the gulp task completes (which is probably an issue with my local machine), here is my code. The gulp callback is passed into the inner getUserNumberWithoutLocation function, where it is called once User.find has completed; this is how gulp knows the task is asynchronous and when it has finished.
gulp.task('load', ['style'], function (callback) {
    dbCall.getUserNumberWithoutLocation(callback);
});

exports.getUserNumberWithoutLocation = function (callback) {
    var query = User.find({ 'location': null });
    query.exec(function (err, users) {
        if (err) {
            return callback(err); // report the failure to gulp
        }
        console.log(users.length);
        callback();
    });
};
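As an aside, gulp can also detect completion from a returned promise. A sketch, assuming your mongoose version returns a promise from exec() (older versions used the mpromise library, which also works here):

// Sketch: let gulp wait on the promise returned from the task instead of a callback.
gulp.task('load', ['style'], function () {
    return User.find({ 'location': null }).exec().then(function (users) {
        console.log(users.length);
    });
});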

Exit Node Process After Successful fs.appendFile

I'm having trouble creating processes in parallel with Node and exiting when they're done with a simple HTTP GET request. I've noticed that if I fire a process.exit() inside of a callback for appendFile, some files will not be created or appended in a Node cluster setup. Ideally, the way below is how I would like to fire events, since the process exits as soon as the job is done:
var rp = require("request-promise"),
    config = require("./config"),
    cluster = require("cluster"),
    os = require("os"),
    fs = require("fs");

var keywordArray = [
    'keyword1',
    'keyword2',
    ...
];

if (cluster.isMaster) {
    var numCPUs = os.cpus().length;
    var clusterDivision = Math.ceil(keywordArray.length / numCPUs);

    // Reset the json if previously set
    keywordArray.forEach(function (arrayItem) {
        fs.unlink(config.dataDirectory + arrayItem + '.json', function (err) {
            if (err) console.error(err);
            console.log('successfully unlinked ' + arrayItem + '.json from ' + config.dataDirectory);
        });
    });

    // Create a worker for each CPU
    // Separate the array out evenly for each worker
    for (var j = 1; j <= numCPUs; j++) {
        var tempArray = [];
        var removed = keywordArray.splice(0, clusterDivision);
        if (removed.length > 0) {
            // The array contains something so let's do something with the keyword
            console.log('creating a worker');
            cluster.fork().send(removed);
        } else {
            // We don't need a cluster here
        }
    }

    process.on('exit', function () {
        console.log('exited');
    });
} else if (cluster.isWorker) {
    // Code to run if we're in a worker process
    // Send the object we created above from variables so they're available to the workers
    process.on('message', function (seperatedArrayItem) {
        seperatedArrayItem.forEach(function (arrayItem) {
            function radarRequest(err, response, body) {
                var responseBody = JSON.parse(body);
                console.log(arrayItem);
                fs.appendFile(config.dataDirectory + arrayItem + '.json', JSON.stringify(responseBody.results, null, '\t'), function (err) {
                    if (err) console.error(err);
                    console.log('success writing file');
                    process.exit();
                });
            }
            rp({
                url: config.radarSearchURI +
                    '?key=' + config.apiKey +
                    '&location=' + config.latitude + ',' + config.longitude +
                    '&radius=' + config.searchRadius +
                    '&keyword=' + arrayItem, headers: config.headers
            }, radarRequest);
        });
    });
}
The only way I can make sure all files are properly appended is by using a timeout, which is exactly what I don't want to (and shouldn't) do. Is there another way I can ensure an appendFile has happened successfully and then kill the node process? Here's a way that works (assuming the process doesn't take longer than 5 seconds):
process.on('message', function (seperatedArrayItem) {
    seperatedArrayItem.forEach(function (arrayItem) {
        function radarRequest(err, response, body) {
            var responseBody = JSON.parse(body);
            console.log(arrayItem);
            fs.appendFile(config.dataDirectory + arrayItem + '.json', JSON.stringify(responseBody.results, null, '\t'), function (err) {
                if (err) console.error(err);
                console.log('success writing file');
            });
        }
        rp({
            url: config.radarSearchURI +
                '?key=' + config.apiKey +
                '&location=' + config.latitude + ',' + config.longitude +
                '&radius=' + config.searchRadius +
                '&keyword=' + arrayItem, headers: config.headers
        }, radarRequest);
    });

    setTimeout(function () {
        process.exit(0);
    }, 5000);
});
You can use an async flow control module like async to kill the process after all files are written. I'd also recommend cluster.worker.disconnect() so that the node process will simply exit gracefully, but that isn't a requirement.
async.forEach(seperatedArrayItem, function (item, done) {
    // append file and call 'done' when it is written.
}, function () {
    // Will be called when all item 'done' functions have been called.
    cluster.worker.disconnect();
});
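Filling in that skeleton for the worker above, a sketch might look like this (the rp, config, and fs names are carried over from the question; the URL construction is abbreviated, and the error handling is an assumption):

// Sketch: only disconnect once every append has actually finished.
process.on('message', function (seperatedArrayItem) {
    async.forEach(seperatedArrayItem, function (arrayItem, done) {
        rp({
            url: config.radarSearchURI + '?keyword=' + arrayItem,
            headers: config.headers
        }, function (err, response, body) {
            if (err) return done(err); // give up on this item, surface the error
            var responseBody = JSON.parse(body);
            fs.appendFile(
                config.dataDirectory + arrayItem + '.json',
                JSON.stringify(responseBody.results, null, '\t'),
                done // 'done' fires only after the append has completed
            );
        });
    }, function (err) {
        if (err) console.error(err);
        cluster.worker.disconnect(); // all appends finished; exit gracefully
    });
});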
Node's fs.appendFile( ... ) is an asynchronous function, so it expects us to pass a callback so we know when it has finished its main operation, to inform us of any error that occurred, and so on.
This means we need to call Node process.exit( ... ) in the scope of the provided callback. I've written this code to test:
'use strict';

var fs = require('fs');

function jsonValue(obj) {
    return JSON.stringify(obj, null, '\t');
}

fs.appendFile('file.json', jsonValue(['t', 'e', 's', 't']), function (error) {
    if (error) {
        throw error;
    }
    console.log('success writing file'); // no error, so log...
    process.exit(); // and exit right now
    console.log('exited?'); // this will not be printed
});
Well, it worked as defined.
Another way that works is to use the synchronous version of fs.appendFile( ... ) and call process.exit() sequentially:
fs.appendFileSync('file.json', jsonValue(['t', 'e', 's', 't']));
console.log('success writing file'); // no error (I hope so =), so log...
process.exit(); // and exit right now
console.log('exited?'); // this will not be printed
This is clean code and works, but you lose the robustness and convenience gained with the callback...

Test Grunt Tasks

I am using Yeoman to generate some projects and also grunt-tasks.
Now I would also like to test the generated grunt tasks using Mocha, but I can only find information about running Mocha tests from Grunt ;-)
Can anybody help?
Not an elegant solution, but I took the approach of installing my dependencies (npm install), subsequently running the corresponding grunt task (e.g. grunt less), and then writing the test logic after that operation. I've used nested exec calls for this.
// Requires assumed by this excerpt: path, fs, child_process.exec,
// the yeoman test helpers, and an assertion library such as chai.
var path = require('path');
var fs = require('fs');
var exec = require('child_process').exec;
var helpers = require('yeoman-generator').test; // or require('yeoman-test') in newer setups
var expect = require('chai').expect;

describe('less grunt tasks tests', function () {
    var prompts = {
        workFolder: 'temp',
        fiddleDesc: 'mocha test'
    };
    var testGlobal = {};

    beforeEach(function (done) {
        testGlobal.app = helpers.run(path.join(__dirname, '../app'))
            .inTmpDir(function (dir, err) {
                if (err) { done(err); return; }
                testGlobal.dir = dir;
                // console.log(dir);
            })
            .withArguments(['skip-install'])
            .withOptions({ less: true })
            .withPrompts(prompts)
            .on('end', function () {
                done();
            });
    });

    it('should modify app/styles/style.css', function (done) {
        this.timeout(60000 * 10); // 10 minutes - my network is f**ked up

        var opts = {
            cwd: testGlobal.dir,
            env: process.env,
            detached: true
        };
        var gen = testGlobal.app.generator;
        var devdeps = gen.devDependencies.join(' ');
        var rootPath = testGlobal.dir;
        var getPath = function (fpath) {
            var s = path.join(rootPath, fpath);
            // console.log(s);
            return s;
        };

        exec('npm install ' + devdeps, opts, function (err, stdout, stderr) {
            if (err) {
                done(err);
                return;
            }

            var h1 = fs.readFileSync(getPath('app/less/h1.less'), 'utf8');
            var css = fs.readFileSync(getPath('app/styles/style.css'), 'utf8');
            // expect(css).to.not.contain(h1);
            expect(css).to.not.contain('h1');

            exec('grunt less', opts, function (e, out, serr) {
                if (e) {
                    done(e);
                    return;
                }
                // console.log(out);
                var h1 = fs.readFileSync(getPath('app/less/h1.less'), 'utf8');
                var css = fs.readFileSync(getPath('app/styles/style.css'), 'utf8');
                // expect(css).to.contain(h1); // this expect fails since for some reason \r are stripped out
                expect(css).to.contain('h1');
                done();
            });
        });
    });
});
For more reference, you can see more test code in the repo I contribute to.
PS: I'd appreciate your comments on the approach I've taken.

Sequence of operation in Node.js

I took the example code from the jsdom page on npmjs. The process takes a few seconds, and only after it finishes do I want to run a second action, such as console.log, but without inserting that code into the body of the jsdom callback. Maybe this is a job for Node.js streams?
I want to create a chain of functions, where the next one starts as soon as the previous one ends.
Where can I read about sequencing in Node.js?
var jsdom = require("jsdom");

jsdom.env({
    url: "http://news.ycombinator.com/",
    scripts: ["http://code.jquery.com/jquery.js"],
    done: function (errors, window) {
        var $ = window.$;
        console.log("HN Links");
        $("td.title:not(:last) a").each(function () {
            console.log(" -", $(this).text());
        });
    }
});
console.log("The end"); // this prints before the links, not after
You're looking for Async.js.
To be specific, you're looking for its series() functionality (Run an array of functions in series, each one running once the previous function has completed).
Code example (based on its docs):
var async = require('async');
var jsdom = require('jsdom');

async.series([
    function (callback) {
        jsdom.env({
            url: "http://news.ycombinator.com/",
            scripts: ["http://code.jquery.com/jquery.js"],
            done: function (errors, window) {
                var $ = window.$;
                console.log("HN Links");
                $("td.title:not(:last) a").each(function () {
                    console.log(" -", $(this).text());
                });
                callback(null, 'one');
            }
        });
    },
    function (callback) {
        // do some more stuff (second task) ...
        callback(null, 'two');
    }
],
// optional callback; results is now ['one', 'two']
function (err, results) {
    console.log("The end");
});

NodeJS async queue too fast (Slowing down async queue method)

I have an HTTP GET request and I want to parse the response and save it to my database.
If I call crawl(i) independently, I get good results. But I have to call crawl() for every id from 1 to 2000.
I get good results, but some responses seem to get lost and some responses are duplicates. I don't think I understand how to call thousands of asynchronous functions. I am using the async module's queue function, but so far I am still missing some data and still have some duplicates. What am I doing wrong here? Thanks for your help.
My node functions:
function getOptions(i) {
    return {
        host: 'magicseaweed.com',
        path: '/syndicate/rss/index.php?id=' + i + '&unit=uk',
        method: 'GET'
    };
}

function crawl(i) {
    var req = http.request(getOptions(i), function (res) {
        // note: 'data' can fire several times with partial chunks of the body
        res.on('data', function (body) {
            parseLocation(body);
        });
    });
    req.end();
}

function parseLocation(body) {
    parser.parseString(body, function (err, result) {
        if (result && typeof result.rss != 'undefined') {
            var locationTitle = result.rss.channel[0].title;
            var locationString = result.rss.channel[0].item[0].link[0];
            var location = new Location({
                id: locationString.split('/')[2],
                name: locationTitle
            });
            location.save();
        }
    });
}
var N = 2; // number of simultaneous tasks
var q = async.queue(function (task, callback) {
    crawl(task.url);
    callback();
}, N);

q.drain = function () {
    console.log('Crawling done.');
};

for (var i = 0; i < 100; i++) {
    q.push({ url: 'http://magicseaweed.com/syndicate/rss/index.php?id=' + i + '&unit=uk' });
}
[EDIT] Well, after a lot of testing, it seems that the service I am crawling cannot handle so many requests that fast, because when I do each request sequentially, I get all the good responses.
Is there a way to SLOW DOWN the async queue method?
You should have a look at the great async module, which simplifies async tasks like this. You can use queue; a simple example:
var N = 5; // number of simultaneous tasks
var q = async.queue(function (task, callback) {
    somehttprequestfunction(task.url, function () {
        callback();
    });
}, N);

q.drain = function () {
    console.log('all items have been processed');
};

for (var i = 0; i < 2000; i++) {
    q.push({ url: "http://somewebsite.com/" + i + "/feed/" });
}
It keeps a window of N ongoing requests, and a slot only becomes available for a future task when you invoke the callback. The difference is that your code currently opens 2000 connections immediately, so the failure rate is obviously high. Limiting it to a reasonable value (5, 10, 20, depending on the site and connection) will result in a better success rate. If a request fails, you can always try it again, or push the task onto another async queue for another trial. The key point is to invoke callback() in the queue function, so that a slot becomes available when the work is done.
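Building on that retry suggestion, a minimal sketch that pushes failed tasks back onto the queue (the three-attempt cap and the error-first signature of somehttprequestfunction are assumptions):

// Sketch: re-queue failed tasks, giving each URL a limited number of attempts.
var q = async.queue(function (task, callback) {
    somehttprequestfunction(task.url, function (err) {
        if (err && task.tries < 3) {
            task.tries++;
            q.push(task); // put it back at the end of the queue
        }
        callback(); // always free the slot
    });
}, 5);

for (var i = 0; i < 2000; i++) {
    q.push({ url: "http://somewebsite.com/" + i + "/feed/", tries: 0 });
}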
var q = async.queue(function (task, callback) {
    crawl(task.url);
    callback();
}, N);
You're executing the next task immediately after starting the previous one; this way, the queue is just meaningless. You should modify your code like this:
// First, modify your 'crawl' function to take a callback argument,
// and call this callback after the job is done.

// Then:
var q = async.queue(function (task, next /* naming this argument 'next' is more meaningful */) {
    crawl(task.url, function () {
        // after this one is done, start the next one.
        next();
    });
    // or, more simply: crawl(task.url, next);
}, N);
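For completeness, here is one way crawl could be modified to accept that callback. A sketch that keeps the original http.request structure; buffering the chunks before parsing is an assumption, but it also addresses the partial-chunk issue noted in the question's code:

// Sketch: crawl with a completion callback; chunks are concatenated so
// parseLocation always sees the full response body.
function crawl(i, done) {
    var req = http.request(getOptions(i), function (res) {
        var body = '';
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            parseLocation(body);
            done(); // signal the queue that this task is finished
        });
    });
    req.on('error', done); // don't stall the queue on a failed request
    req.end();
}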
Another option if you want: vanilla JS without fancy libraries.
var incrementer = 0;
var resultsArray = [];

var myInterval = setInterval(function () {
    incrementer++;
    if (incrementer == 100) {
        clearInterval(myInterval);
        // when done, parse results array
    }
    // make request here
    // push request result to array here
}, 500);
This invokes the function every half second. It's an easy way to space the requests out one at a time and stop after x requests.
I know I am a little late to the question, but here is a solution I wrote to slow down the number of requests when testing an API endpoint, using node 4 or node 5:
var fs = require('fs');
var supertest = require('supertest');
var request = supertest("http://sometesturl.com/api/test/v1/");
var Helper = require('./check.helper');
var basicAuth = Helper.basicAuth;
var options = Helper.options;

fs.readFile('test.txt', function (err, data) {
    var parsedItems = JSON.parse(data);
    var urlparts = [];

    // create a queue
    // note: range() is assumed to be defined elsewhere, yielding the years 1975..2016
    for (let year of range(1975, 2016)) {
        for (var make in parsedItems[year]) {
            console.log(year, make, '/models/' + year + '/' + make);
            urlparts.push({ urlpart: '/models/' + year + '/' + make, year: year, make: make });
        }
    }

    // start dequeue
    waitDequeue();

    // This function calls itself after the makeRequest promise completes
    function waitDequeue() {
        var item = urlparts.pop();
        if (item) {
            makeRequest(item)
                .then(function () {
                    // wait this long before the next dequeue
                    setTimeout(function () {
                        waitDequeue();
                    }, 3000);
                });
        } else {
            write(parsedItems);
        }
    }

    // make a request, mutate parsedItems, then resolve
    function makeRequest(item) {
        return new Promise((resolve, reject) => {
            request
                .get(item.urlpart)
                .set(options.auth[0], options.auth[1])
                .set(options.type[0], options.type[1])
                .end(function (err, res) {
                    if (err) return reject(err);
                    console.log(res.body);
                    res.body.forEach(function (model) {
                        parsedItems[item.year][item.make][model] = {};
                    });
                    resolve();
                });
        });
    }

    // write the results back to the file
    function write(parsedItems) {
        fs.writeFile('test.txt', JSON.stringify(parsedItems, null, 4), function (err) {
            if (err) console.log(err);
        });
    }
});
A little late, but I have found this works!
Using async, you can slow down the queue by using whilst inside the task handler, e.g.:
var q = async.priorityQueue(function (task, callback) {
    // your processing code here for each task

    // when ready to complete the task, delay it by calling
    var count = 0;
    async.whilst(
        function () { // wait 10 seconds (10 x 1000 ms)
            return count < 10;
        },
        function (cb) {
            count++;
            setTimeout(function () {
                cb(null, count);
            }, 1000);
        },
        function (err, n) {
            // n seconds have passed
            callback(); // callback to the queue handler
        }
    ); // whilst
}, 5);
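If all you need is a fixed pause per task, a plain setTimeout inside the handler achieves the same thing with less machinery. A minimal sketch:

// Sketch: delay each task's completion by 10 seconds before freeing a queue slot.
var q = async.queue(function (task, callback) {
    // ... process the task ...
    setTimeout(callback, 10000);
}, 5);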
