While building a fairly complex scraper, I ran into a problem with the control flow of my code.
What's going on in code below:
1) request a URL
2) scrape NEWURL from the results
3) pass it to readability API as first async function
4) here comes the trouble — I never reach the next async function, which saves readabilityData to the DB
How to solve this problem?
I'm new to JS, so please feel free to point out any issues with my code.
// Fetches the page at URL, extracts an article link from it, then runs two steps with async.series:
// (1) ask the Readability parser API about that link, (2) look the article up in the DB and save it if missing.
request(URL, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html);
// NOTE(review): `data` is not defined anywhere in this snippet and `$` is never used — presumably `data`
// is a cheerio selection created in code that is not shown; confirm.
// NEWURL is assigned without a declaration, so it becomes an implicit global.
NEWURL = data.find('a').attr('href');
// Assigned without a declaration as well (implicit global).
readabilityData = {}
var articleUrl = 'https://readability.com/api/content/v1/parser?url=' + NEWURL + token;
async.series([
// Step 1: this task declares no callback parameter and never signals completion,
// so async.series has no way to know it may proceed to step 2.
function(){
request(articleUrl, function(error, response, html) {
if (!error) {
readabilityData = response.toJSON();
}
});
},
// Step 2: the parameter named `readabilityData` shadows the outer variable of the same name;
// what async.series passes in this position is its completion callback, not the data from step 1.
function(readabilityData){
Article.findOne({
// NOTE(review): `url` is not defined in this snippet (only URL and NEWURL are) — confirm which one is meant.
"link": url // here's the
}, function(err, link){
if(link) {
console.log(link)
} else {
var newArticle = new Article({
// write stuff to DB
});
newArticle.save(function (err, data) {
// save it
});
}
});
}
],
// Final callback: logs success without looking at `err`.
function(err){
console.log('all good — data written')
});
// NOTE(review): the `});` below has no matching opener in this snippet — looks like a paste error.
});
}
});
You need to call the callback parameter that's passed into the functions of the async.series call when each function's work is complete. That's how async.series knows that it can proceed to the next function. And don't redefine readabilityData as a function parameter when you're trying to use it to share data across the functions.
So something like:
// Shared between the two steps below; step 1 fills it in.
var readabilityData = {};

async.series([
  // Step 1: fetch the parsed article from the Readability API.
  function (callback) {
    request(articleUrl, function (error, response, html) {
      if (!error) {
        readabilityData = response.toJSON();
      }
      // Always report back (with the error, if any) so async.series can continue or stop.
      callback(error);
    });
  },
  // Step 2: save the article unless a document with this link already exists.
  function (callback) {
    Article.findOne({
      "link": url
    }, function (err, link) {
      if (err) {
        // A failed lookup must stop the series instead of being treated as "not found".
        return callback(err);
      }
      if (link) {
        console.log(link);
        return callback();
      }
      var newArticle = new Article({
        // write stuff to DB
      });
      newArticle.save(function (err, data) {
        // save it
        callback(err);
      });
    });
  }
],
// Runs once: either after both steps succeeded, or as soon as one step reported an error.
function (err) {
  if (err) {
    console.error('article was not written:', err);
    return;
  }
  console.log('all good — data written');
});
Related
I have a function which uses async.parallel to call two functions, but I don't know how to handle errors when async.parallel is used. If one of the two functions throws an error, how do I handle it?
exports.getProductconfigWithProductList = function(req, res){
var apiVersion = req.param('version', null);
var product_id = req.params.productId;
dbDriver = determineDriver('es');
async.parallel([
function(callback) {
dbDriver.getProductconfig(product_id,function(data){
callback(null, data);
});
},
function(callback) {
var productListPromise = getProductListData();
productListPromise.then(function(data) {
callback(null, data);
});
}
],
function(err, results) {
if(!err){
var data = results[0];
data[1] = results[1];
res.send(data);
res.end();
} else {
console.log('<<<<<<<<<<<<<<'+err+'>>>>>>>>>>>>>>>');
res.send(err);
res.end();
}
}
);
};
Any error that a task passes to its callback is delivered to the final callback of async.parallel. When you have:
// The last argument is the final callback; it receives the error reported by a task, if any.
async.parallel([func1, func2], (err, data) => {
  if (!err) {
    return;
  }
  // you handle the errors here
});
It's explained in more detail in the docs:
https://caolan.github.io/async/docs.html
and in the issue on GitHub here:
https://github.com/caolan/async/issues/334
var url = require('url');
var pug = require('pug');
var PouchDB = require('pouchdb');
// PouchDB handle for the database at the given HTTP URL.
var db = new PouchDB('http://127.0.0.1:5984/data');

// Sample document literal; nothing in the visible snippet reads this variable.
var doc = { _id: 'mittens', name: 'Mittens' };
// Looks up the 'mittens' document and logs either the document or an error message.
// The `return doc` below returns from the db.get callback, not from query();
// query() itself has no return statement, so its callers receive undefined.
function query() {db.get('mittens', function (error, doc) {
if (error) {
console.log('Ops! There is an error.');
} else {
console.log(doc);
return doc;
}
});
}
module.exports = {
// Request handler: writes a 200 text/html header, logs the URL path and dispatches on it.
handleRequests: function(request, response) {
response.writeHead(200, {'Content-Type': 'text/html'});
var path = url.parse(request.url).pathname;
console.log(path);
switch (path) {
case '/':
// query() has no return statement, so renderFile is given undefined instead of the document here.
response.write(pug.renderFile('./views/index.pug', query()));
response.end();
break;
// (The snippet ends here; the remaining cases and closing braces are not shown.)
The query() function is returning an Object with "name". But it isn't rendered by pug.js.
Why does pug.js not render the template with doc.name?
As stated in the comments by @Paul, you can't simply return a value from an asynchronous function call. You should use callbacks or promises:
The callback way:
/**
 * Fetch one document by id and hand it to a Node-style callback.
 *
 * @param item     id passed to db.get
 * @param callback called as callback(error, null) on failure, callback(null, doc) on success
 */
function query(item, callback) {
  db.get(item, function (error, doc) {
    if (!error) {
      callback(null, doc);
      return;
    }
    callback(error, null);
  });
}
Then:
case '/':
query('mittens', function(err, doc) {
if (err) throw err;
response.write(pug.renderFile('./views/index.pug', doc));
response.end();
}
break;
Read more: "How do I return the response from an asynchronous call?" You can implement the promise-based approach instead if you prefer.
/**
 * GET mailUrl and pass the response body to callback.
 * On a request error the error is logged and callback is not invoked.
 *
 * @param mailUrl  URL to request
 * @param callback called as callback(body) on success
 */
this.queryMailApi = function (mailUrl, callback) {
  function onResponse(err, httpResponse, body) {
    if (!err) {
      callback(body);
      return;
    }
    return console.error('post failed:', err);
  }

  request.get({ url: mailUrl }, onResponse);
};
// Callback for queryMailApi: prints whatever data it is given.
this.myCallBack = function (data) {
  console.log(data);
}
This is my function plus the callback that receives the value. I want to get that value back from a function call, like this:
// NOTE: queryMailApi has no return statement, so x is undefined here;
// the response body only ever reaches the callback passed as the second argument.
var x = shared.queryMailApi(mailApiUrl, shared.myCallBack);
I want to use x later in the code. I've read a ton about asynchronous Node.js, which suggests I can't actually do this... but there has to be a way.
I didn't try this, but I think you should be able to do it this way with a promise.
/**
 * GET mailUrl and expose the outcome as a promise.
 *
 * @param mailUrl URL to request
 * @returns the deferred's promise: resolved with the response body,
 *          or rejected with the request error (which is also logged)
 */
this.queryMailApi = function (mailUrl) {
  var pending = protractor.promise.defer();

  function onResponse(err, httpResponse, body) {
    if (!err) {
      pending.resolve(body);
      return;
    }
    pending.reject(err);
    return console.error('post failed:', err);
  }

  request.get({ url: mailUrl }, onResponse);
  return pending.promise;
};
// Usage: the body arrives in the then() handler once the promise resolves.
this.queryMailApi('example#mail.com').then(function logResponse(body) {
  console.log(body);
});
If this doesn't work, you may want to take a look at webdriver.WebDriver.wait. It may be useful.
The code I wrote so far is as below.
I don't need the whole response but just part of it.
var request = require('request');
var async = require('async');
var asyncTasks = [];
var install;

/**
 * Build an async.series task that waits delayMs, requests url, logs the body of a
 * 200 response and then reports back to async.
 *
 * @param url     URL to request
 * @param delayMs delay before the request is sent, in milliseconds
 * @returns a task function taking the async completion callback
 */
function delayedRequestTask(url, delayMs) {
  return function (callback) {
    setTimeout(function () {
      request(url, function (error, response, body) {
        if (!error && response.statusCode === 200) {
          console.log(body); // print the response body
        }
        // Without this call async.series never learns the task finished,
        // so the next task and the final callback would never run.
        callback(error, body);
      });
    }, delayMs);
  };
}

async.series([
  delayedRequestTask('URL', 5000),
  delayedRequestTask('URL', 5000)
],
// results holds the body passed by each task, in task order.
function (error, results) {
  console.log(results);
});
One approach to do the above concurrently would be to use async.parallel - of the form of:
// Each task receives a completion callback and must call it as cb(err, result) when done;
// `callback` runs once with the first error, or with the array of all results.
async.parallel([
  function (cb) { /* ... do the work, then cb(err, result) */ },
  function (cb) { /* ... do the work, then cb(err, result) */ }
], callback);
Another approach is to use a Promises library - BlueBird or Q are good choices.
Q.all has the form:
// Fragment from inside a function: combines both promises and returns the resulting promise.
// The spread() handler takes one parameter per promise, in the same order as the array.
return Q.all([
promise1,
promise2
]).spread(function (resultFromPromise1, resultFromPromise2) {
// do something with the results...
});
You could use one of these approaches to parallelise the two calls. The outputs of each will give you an array containing the results of each call respectively.
Here is a simple illustration of each approach:
Using Async.js
var async = require('async');
/**
 * Simulated async job: after one second, report "complete: <count>" to cb.
 *
 * @param cb    Node-style callback, called as cb(null, message)
 * @param count label appended to the completion message
 */
var task = function (cb, count) {
  function reportDone() {
    cb(null, "complete: " + count);
  }

  setTimeout(reportDone, 1000);
};
// Start one task per label in parallel and print the collected results.
async.parallel(
  ['one', 'two'].map(function (label) {
    return function (cb) {
      task(cb, label);
    };
  }),
  function (err, results) {
    console.log(results);
    //[ 'complete: one', 'complete: two' ]
  }
);
Using Q:
var Q = require('q');
/**
 * Simulated async job as a promise: after one second, resolve with whatever
 * cb(null, count) returns.
 *
 * @param cb    function called as cb(null, count); its return value becomes the result
 * @param count value handed to cb
 * @returns the deferred's promise
 */
function task1(cb, count) {
  var pending = Q.defer();

  setTimeout(function () {
    var outcome = cb(null, count);
    return pending.resolve(outcome);
  }, 1000);

  return pending.promise;
}
// Builds the completion message for a task; the err argument is accepted but not used.
var myCb = function (err, count) {
  var prefix = "complete: ";
  return prefix + count;
};
// Start one promise-returning task per label and print the results once all have resolved.
Q.all(
  ['one', 'two'].map(function (label) {
    return task1(myCb, label);
  })
).then(function (results) {
  console.log(results);
  //[ 'complete: one', 'complete: two' ]
});
Let me know if you are unclear.
Promises are there to help you out in such a case.
I would prefer to use 'Q' library.
I have modified your code to use Q library
var Q = require('q');
var request = require('request');
/**
 * Start req1() and req2() together and react once both have finished.
 *
 * @returns the promise produced by Q.all(...).spread(...), so callers can wait for
 *          completion or chain further handlers (previously nothing was returned and
 *          the promise chain was left floating)
 */
function makeCall() {
  return Q.all([req1(), req2()])
    .spread(function (res1, res2) {
      // Runs once every function passed to Q.all() has finished.
      // Use the responses from the called functions here.
    }, function (err) {
      // Error, if any
    });
}
/**
 * Shared implementation for req1/req2: send a GET request and expose the outcome
 * as a Q promise.
 *
 * @param url URL to request
 * @returns promise resolved with the response body, or rejected with the request error
 */
function requestBody(url) {
  var defer = Q.defer();
  var options = {
    method: 'get', // Method to use
    url: url
  };

  request(options, function (err, res, body) {
    if (err) {
      return defer.reject(err);
    }
    return defer.resolve(body);
  });

  return defer.promise;
}

/** First request; returns a promise for its response body. */
function req1() {
  var url = ''; // Specify URL
  return requestBody(url);
}

/** Second request; returns a promise for its response body. */
function req2() {
  var url = ''; // Specify URL
  return requestBody(url);
}
You can find docs for Q library here : Q docs
I'm trying to wrap my head around the async library, but I'm pretty wobbly in NodeJs and I can't figure out async.parallel. The code below produces error TypeError: undefined is not a function on the line where the parallel tasks are to be executed. Am I correct in that tasks to be run in async.parallel should have a callback() when they are done? (irrelevant parts of the function are redacted)
// Requests `url`, loads the body with cheerio, queues one lookup per `span.title` element,
// runs the queue with async.parallel and finally calls callback(null, results).
// A request error is passed straight to callback(err, null).
function scrapeTorrents(url, callback) {
request(url, function(err, res, body) {
if(err) {
callback(err, null);
return;
}
var $ = cheerio.load(body);
var results = [];
var asyncTasks = [];
$('span.title').each(function(i, element){
// scrape basic info
// NOTE(review): `info` is not defined in the visible code — presumably produced by the redacted scraping step.
var show = {title: info.title, year: info.year};
// getOmdbInfo(...) is invoked right here, and its return value — not a function — is what gets
// pushed, so asyncTasks does not end up holding task functions for async.parallel to call.
asyncTasks.push(
getOmdbInfo(show, function (err, res) {
if (res) {
// omdbInfo is assigned without a declaration in the visible code.
omdbInfo = res;
results.push({
// add basic info and Omdb info
});
}
// This is scrapeTorrents' own callback, called with no arguments once per title.
callback();
})
);
});
// The completion handler takes no error parameter, so a task error would not be forwarded.
async.parallel(asyncTasks, function(){
callback(null, results);
});
});
}
In the section where you define the async tasks, be sure to push a function that takes a completion callback as its parameter and calls it once the task is complete (name that parameter something other than callback, so that it does not shadow the outer callback).
// Queue a task function; async.parallel calls it later and passes in `done`,
// which must be invoked exactly once when the lookup has finished.
asyncTasks.push(function (done) {
  getOmdbInfo(show, function onOmdbInfo(err, res) {
    if (!err) {
      if (res) {
        omdbInfo = res;
        results.push({
          // add basic info and Omdb info
        });
      }
      return done();
    }
    // Report the failure to async.parallel instead of dropping it.
    return done(err);
  });
});