I'm having a control flow problem with an application loading a large array of URLs. I'm using Caolan Async and the NPM request module.
My problem is that the HTTP requests fire as soon as the functions are added to the queue. Ideally I want to build my queue and only start making the HTTP requests when the queue starts processing. Otherwise the callbacks start firing before the queue starts - causing the queue to finish prematurely.
var request = require('request') // https://www.npmjs.com/package/request
, async = require('async'); // https://www.npmjs.com/package/async
var myLoaderQueue = []; // passed to async.parallel
var myUrls = ['http://...', 'http://...', 'http://...'] // 1000+ urls here
for(var i = 0; i < myUrls.length; i++){
myLoaderQueue.push(function(callback){
// Async http request
request(myUrls[i], function(error, response, html) {
// Some processing is happening here before the callback is invoked
callback(error, html);
});
});
}
// The loader queue has been made, now start to process the queue
async.parallel(myLoaderQueue, function(err, results){
// Done
});
Is there a better way of attacking this?
Using for loops combined with asynchronous calls is problematic (in ES5, where var is function-scoped) and may yield unexpected results: by the time the callbacks run, i has already reached myUrls.length, so every request reads the wrong (in fact, an undefined) URL.
Instead, consider using async.map():
async.map(myUrls, function(url, callback) {
request(url, function(error, response, html) {
// Some processing is happening here before the callback is invoked
callback(error, html);
});
}, function(err, results) {
...
});
Given that you have 1000+ URLs to retrieve, async.mapLimit() may also be worth considering, so you don't fire them all off at once.
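For instance, a minimal sketch (the limit of 10 is an arbitrary value picked for illustration):
async.mapLimit(myUrls, 10, function(url, callback) {
// at most 10 requests are in flight at any time
request(url, function(error, response, html) {
callback(error, html);
});
}, function(err, results) {
// results arrive in the same order as myUrls
});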
If you're willing to start using Bluebird and Babel to utilize promises and ES7 async / await you can do the following:
let Promise = require('bluebird');
let request = Promise.promisify(require('request'));
let myUrls = ['http://...', 'http://...', 'http://...'] // 1000+ urls here
async function load() {
try {
// map myUrls array into array of request promises
// wait until all request promises in the array resolve
let results = await Promise.all(myUrls.map(request));
// Note: the await* form from early ES7 drafts, e.g.
// let results = await* myUrls.map(request);
// was dropped from the proposal, so stick with Promise.all
// print array of results or use forEach
// to process / collect them in any other way
console.log(results)
} catch (e) {
console.log(e);
}
}
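Note that the snippet above only defines load(); you still need to invoke it:
load();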
I'm pretty confident you're experiencing the results of a different error. By the time your queued functions are evaluated, the loop has already finished and i has been redefined, which might make it appear as though you missed the first URLs. Try a little closure when you are queuing the functions.
var request = require('request') // https://www.npmjs.com/package/request
, async = require('async'); // https://www.npmjs.com/package/async
var myLoaderQueue = []; // passed to async.parallel
var myUrls = ['http://...', 'http://...', 'http://...'] // 1000+ urls here
for(var i = 0; i < myUrls.length; i++){
(function(URLIndex){
myLoaderQueue.push(function(callback){
// Async http request
request(myUrls[URLIndex], function(error, response, html) {
// Some processing is happening here before the callback is invoked
callback(error, html);
});
});
})(i);
}
// The loader queue has been made, now start to process the queue
async.parallel(myLoaderQueue, function(err, results){
// Done
});
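If ES6 is an option, declaring the loop variable with let gives each iteration its own binding, which makes the wrapping function unnecessary:
for(let i = 0; i < myUrls.length; i++){
myLoaderQueue.push(function(callback){
// let scopes i to this iteration, so the correct URL is captured
request(myUrls[i], function(error, response, html) {
callback(error, html);
});
});
}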
Related
I am learning async.map. I want to download a bunch of URLs, but I want to send a header along with the GET request.
If I didn't have a header, I could have just done:
var request = require('request');
var async = require('async');
var urls = ['http://myurl1.com', 'http://myurl2.com', 'http://myurl3.com'];
async.map(urls, request, function(err, results) {
if (err) throw(err); // handle error
console.log(results.length); // == urls.length
});
Now I do need to send a header {"x-url-key":"myurlkey"} along with every GET request.
How do I modify the code above to do so?
That should be straightforward enough to do: we can create a wrapper function requestWithHeader to pass to async.map, which will specify whichever headers (or other options) you wish.
I'm also specifying json: true here; you may not want to do that in your actual code.
In this example I'm using https://httpbin.org/get as the URL; it sends back all the request parameters, which is useful for testing because we can see which headers we populated.
var request = require('request');
var async = require('async');
var urls = ["https://httpbin.org/get?foo=bar", "https://httpbin.org/get?foo=baz"];
function requestWithHeader(uri, callback) {
request(uri, { headers: {"x-url-key":"myurlkey"}, json:true }, callback)
}
async.map(urls, requestWithHeader, function(err, results) {
if (err) throw(err); // handle error
console.log("Results:", results.map(result => result.body));
});
To wait for async.map to finish, you can create an asynchronous function and await the call (in async v3+, omitting the callback makes async.map return a promise), e.g.
async function testMapWithPromise() {
try {
let results = await async.map(urls, requestWithHeader);
console.log("testMapWithPromise: Results:", results.map(result => result.body));
// Do whatever with results here...
} catch (error) {
console.error("testMapWithPromise: An error occurred:", error);
}
}
testMapWithPromise();
var urlArr = [
//ex url_for_site0 = 'https://www.google.com'
url_for_site0,
url_for_site1,
url_for_site2,
url_for_site3,
...
url_for_site50
];
urlArr.forEach(function(url, index) {
request(url, function(err, res, body) {
if(err) console.log(index+err);
else console.log(index+" success");
});
});
I get different, unordered results and errors every time I execute my app.
Example:
1 error : socket hang up
21 error : socket hang up
17 error : socket hang up
1 error : socket hang up
19 error : socket hang up
...(omission)
5 success
15 success
45 success
50 success
11 success
37 success
Every time I get the results, they are in a different order.
Is this because I made too many requests simultaneously?
When I request one by one, there's no error.
Example:
request(url_for_site0)
and restart program
request(url_for_site1)
and restart program
request(url_for_site2)
...
Node.js events are all handled in a single event loop and have a non-blocking nature.
It happened to me once when I tried to run multiple SQL queries. When I did it using C#, there was no problem at all. However, Node.js gave me behaviour similar to yours.
I am not sure if this is the best solution for the problem, but here is how I fixed my SQL calls: I used the async.waterfall function so that the whole process runs sequentially. Each function is run one by one, with its return value piped to the next function, so you can do further work on the intermediate results. The usage of this library is not very straightforward; the link below should help you understand how async.waterfall works, so you can adapt it to your solution.
https://gist.github.com/dineshsprabu/e6c1cf8f2ca100a8f5ae
Here is roughly what I visualize your solution looking like:
var async = require('async');
async.waterfall(
[
function(callback) {
urlArr(url, index, function (returnVal) {
//Do something with the returnVal
callback(null, returnVal);
});
},
function(returnVal, callback) {
//the returnVal from first function gets passed here synchronously
urlArr(url2, index2, function (returnVal) {
//Do something with the returnVal
callback(null, returnVal);
});
},
function(returnVal, callback) {
//and so on ...
}
],
function (err) {
//console.log(err);
});
//define your function and enable callback
//you will need to include an extra third argument to receive the callback
function urlArr(url, index, callback) {
//your code
return callback(returnValue)
}
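Since the URLs are already in an array, async.mapSeries would express the same one-at-a-time flow without hand-writing a waterfall step per URL. A minimal sketch, assuming urls is your array and the per-URL work happens inside the iterator:
var async = require('async');
var request = require('request');
async.mapSeries(urls, function(url, callback) {
// each request starts only after the previous one has completed
request(url, function(err, res, body) {
callback(err, body);
});
}, function(err, results) {
// results[i] corresponds to urls[i]
});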
This is happening because of the non-blocking nature of JavaScript.
If you want the requests to happen one by one, in order, you can use async functions.
The 'socket hang up' error may be because the URL you hit didn't send any response after accepting the request.
You might also have an issue with the non-blocking nature of forEach: it fires off all the requests without waiting for any of them to finish.
You can combine Promise and async/await to make the loop wait for each request. Here is one way of handling it.
const request = require('request');
let urlArr = [
'https://localhost:9090',
'https://www.google.com',
'https://www.ebay.com',
'https://www.amazon.com',
];
//Creating promise for the requests.
let fetchPromise = function(url) {
return new Promise((resolve, reject) => {
request(url, (err, res, body) => {
if (err)
reject(Error(url + ' cannot be fetched'));
else
resolve(body);
});
}
);
};
//creating a blocking function
let fetchAllData = async function(urls) {
for (const url of urls) { // using a modern for...of loop instead of forEach
try {
const data = await fetchPromise(url); // waiting until the promise is resolved
console.log('Received ' + data.length + ' bytes from ' + url);
} catch(e) {
console.log('Error :' + e); // catching error in case promise is rejected
}
}
};
//calling the function
fetchAllData(urlArr);
/*
// In case you want to wait until all promises are resolved.
// Then use Promise.all, however it will fail if any of the promise is rejected.
// One way to handle it would be to modify function fetchPromise such that it
// always resolves.
Promise
.all(urlArr.map(url => fetchPromise(url)))
.then(data => console.log(data))
.catch(err => console.log(err));
*/
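For the "always resolves" idea in that comment, a minimal sketch (fetchSafe is a name made up here for illustration):
let fetchSafe = function(url) {
// turn a rejection into a resolved error marker so Promise.all never fails
return fetchPromise(url).catch(e => ({url: url, error: e.message}));
};
Promise
.all(urlArr.map(fetchSafe))
.then(data => console.log(data));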
I hope it helps.
How come this async/await doesn't work?
I've spent all day trying different combinations, watching videos and reading about async/await to find why this doesn't work before posting this here.
I'm trying to make a second Node.js app that will run on a different port; my main app will call it to scrape some data and save it to the db as a cache.
What it's supposed to do:
Take a keyword and send it to a method called scrapSearch; this method creates a complete URI link and sends it to the method that actually gets the webpage and returns it up to the first caller.
What is happening:
The console.log below the initial call is triggered before the results are returned.
Console output
Requesting : https://www.google.ca/?q=mykeyword
TypeError: Cannot read property 'substr' of undefined
at /DarkHawk/srv/NodesProjects/_scraper/node_scrapper.js:34:18
at <anonymous>
app.js:
'use strict';
var koa = require('koa');
var fs = require('fs');
var app = new koa();
var Router = require('koa-router');
var router = new Router();
app
.use(router.routes())
.use(router.allowedMethods());
app.listen(3002, 'localhost');
router.get('/scraptest', async function(ctx, next) {
var sfn = require('./scrap-functions.js');
var scrapFunctions = new sfn();
var html = await scrapFunctions.scrapSearch("mykeyword");
console.log(html.substr(0, 20));
//Normally here I'll be calling my other method to extract content
let json_extracted = scrapFunctions.exGg('mykeywords', html);
//Save to db
});
scrap-functions.js:
'use strict';
var request = require('request');
var cheerio = require('cheerio');
function Scraper() {
this.html = ''; // I tried saving html in here but the main script seems to have issues retrieving that
this.kw = {};
this.tr = {};
}
// Search G0000000gle
Scraper.prototype.scrapSearch = async function(keyword) {
let url = "https://www.google.ca/?q="+keyword";
let html = await this.urlRequest(url);
return html;
};
// Get a url'S content
Scraper.prototype.urlRequest = async function(url) {
console.log("Requesting : "+url);
await request(url, await function(error, response, html) {
if(error) console.error(error);
return response;
});
};
module.exports = Scraper;
I tried a lot of things but I finally gave up - I tried putting await/async before each method - that didn't work either.
Why isn't that working?
Edit: the wrong function name was because I created 2 different projects for testing and mixed up the files while copy/pasting.
You are not returning anything from urlRequest. Because it is an async function, it will still create a promise, but it will resolve with undefined. Therefore your html is undefined as seen in the error.
The problematic part is the request function, which is a callback-style function, but you're treating it as a promise. Using await on a value that is not a promise won't do anything useful (technically it creates a promise that resolves directly with the value, but the resulting value remains the same). Both awaits within urlRequest are unnecessary.
request(url, function(error, response, html) {
if(error) console.error(error);
// This return is for the callback function, not the outer function
return response;
});
You cannot return a value from within the callback. As it's asynchronous, your function will already have finished by the time the callback is called. With the callback style you would do the work inside the callback.
But you can turn it into a promise. You have to create a new promise and return it from urlRequest. Inside the promise you do the asynchronous work (request) and either resolve with the value (the response) or reject with the error.
Scraper.prototype.urlRequest = function(url) {
console.log("Requesting : "+url);
return new Promise((resolve, reject) => {
request(url, (err, response) => {
if (err) {
return reject(err);
}
resolve(response);
});
});
};
When an error occurs you want to return from the callback, so the rest (the success path) is not executed. I also removed the async keyword, because the function now creates and returns a promise manually.
If you're using Node 8, you can promisify the request function with the built-in util.promisify.
const util = require('util');
const request = require('request');
const requestPromise = util.promisify(request);
Scraper.prototype.urlRequest = function(url) {
console.log("Requesting : " + url);
return requestPromise(url);
};
Both versions will resolve with the response, and to get the HTML you need to use response.body.
Scraper.prototype.scrapSearch = async function(keyword) {
let url = "https://www.google.ca/?q=" + keyword;
let response = await this.urlRequest(url);
return response.body;
};
You still need to handle errors from the promise, either with .catch() on the promise, or using try/catch when you await it.
It is absolutely essential to understand promises when using async/await, because it's syntactic sugar on top of promises, to make it look more like synchronous code.
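For example, the try/catch variant applied to the route handler from the question would look roughly like this:
router.get('/scraptest', async function(ctx, next) {
var sfn = require('./scrap-functions.js');
var scrapFunctions = new sfn();
try {
let html = await scrapFunctions.scrapSearch("mykeyword");
console.log(html.substr(0, 20));
} catch (err) {
// the promise rejected somewhere down the chain
console.error(err);
}
});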
See also:
Understand promises before you start using async/await
Async functions - making promises friendly
Exploring ES6 - Promises for asynchronous programming
I have an array of URLs and I want to loop through them and fetch the content. After I have looped through them and fetched the content, I want a callback function to be called.
I know I can do this via the async library but I want to do this without using any library.
Sample of what kind of code I want is below
['yahoo.com', 'gmail.com'].each(function(item){
//code to fetch URL content
},someCallbackFunctionToBeExecutedAtTheEndOfLoop);
This is typically the type of thing you do using promises (but you would need a library), with code like:
var P = require('p-promise'); // P: a Q-style promise library; see the recommendation below
var ops = [];
urls.forEach(function(url) {
ops.push(fetchUrl(url));
});
P.all(ops).then(callback);
function fetchUrl(url) {
var defer = P.defer();
//do stuff
// call defer.resolve(result);
return defer.promise;
}
If you don't want to use promises, you can use a counter of operations, like:
var ops = urls.length;
urls.forEach(function(url) {
// do stuff; the decrement below must run inside the async completion callback,
// otherwise the counter hits 0 before any request has actually finished
ops--;
if (ops === 0) {
callback();
}
});
If you choose promises, I advise using the p-promise module, which is far more optimized than Q.
If you want to do it without any sort of library like async, then you have to write your own counter to keep track of when all the async responses have been completed:
var request = require('request');
function loadAll(list, fn) {
var cnt = list.length;
var responses = [];
list.forEach(function(url, index) {
request(url, function(error, response, body) {
if (error) {
fn(error);
} else {
responses[index] = response;
--cnt;
if (cnt === 0) {
fn(0, responses);
}
}
});
})
}
loadAll(['http://www.yahoo.com', 'http://www.gmail.com'], function(err, results) {
if (!err) {
// process results array here
}
});
If you're going to be doing many async operations in node.js, then getting a promise library like Bluebird will save you a lot of time. For example, I think you could do the above in something like this (untested):
var Promise = require("bluebird");
var requestP = Promise.promisify(require("request"));
Promise.map(['http://www.yahoo.com', 'http://www.gmail.com'], requestP).then(function(results) {
// process the array of results here
});
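Bluebird's Promise.map also accepts a concurrency option, which is worth setting for larger lists (10 below is just an illustrative value, and urls stands in for your array):
Promise.map(urls, requestP, {concurrency: 10}).then(function(results) {
// at most 10 requests are in flight at any moment
});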
I have code like
common.findOne('list', {'listId': parseInt(request.params.istId)}, function(err, result){
if(err) {
console.log(err);
}
else {
var tArr = new Array();
if(result.tasks) {
var tasks = result.tasks;
for(var i in tasks) {
console.log(tasks[i]);
common.findOne('tasks', {'taskId':parseInt(tasks[i])}, function(err,res){
tArr[i] = res;
console.log(res);
});
}
console.log(tArr);
}
return response.send(result);
}
});
It is not executed sequentially in Node.js, so I get an empty array at the end of execution. The problem is that it first executes console.log(tArr); and then executes
common.findOne('tasks',{'taskId':parseInt(tasks[i])},function(err,res){
tArr[i] = res;
console.log(res);
});
Is there any mistake in my code, or is there another way of doing this?
Thanks!
As you are probably aware, things run asynchronously in Node.js, so when you need things to run in a certain order you need to use a control library or basically implement the ordering yourself.
I highly suggest you take a look at async, as it will easily allow you to do something like this:
var async = require('async');
// ..
if(result.tasks) {
async.forEach(result.tasks, processEachTask, afterAllTasks);
function processEachTask(task, callback) {
console.log(task);
common.findOne('tasks', {'taskId':parseInt(task)}, function(err,res) {
tArr.push(res); // NOTE: Assuming order does not matter here
console.log(res);
callback(err);
});
}
function afterAllTasks(err) {
console.log(tArr);
}
}
The main thing to see here is that processEachTask gets called with each task, in parallel, so the order is not guaranteed. To mark that a task has been processed, you call callback in the anonymous function passed to findOne. This allows you to do more async work in processEachTask but still signify when it is done. When every task is done, afterAllTasks is called.
Take a look at async to see all the helper functions that it provides; it is very useful!
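The code above assumes the order of tArr does not matter; if it does, async.map collects each result at the same index as its input, so you could write the same thing as:
async.map(result.tasks, function(task, callback) {
common.findOne('tasks', {'taskId': parseInt(task)}, function(err, res) {
callback(err, res);
});
}, function(err, tArr) {
// tArr[i] corresponds to result.tasks[i]
console.log(tArr);
});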
I've recently created a simple abstraction named "wait.for" to call async functions in sync mode (based on Fibers): https://github.com/luciotato/waitfor
Using wait.for and async your code will be:
var wait = require('waitfor');
...
//execute in a fiber
function handleRequest(request,response){
try{
...
var result = wait.for(common.findOne,'list',{'listId': parseInt(request.params.istId)});
var tArr = new Array();
if(result.tasks) {
var tasks = result.tasks;
for(var i in tasks){
console.log(tasks[i]);
var res=wait.for(common.findOne,'tasks',{'taskId':parseInt(tasks[i])});
tArr[i] = res;
console.log(res);
}
console.log(tArr);
return response.send(result);
};
....
}
catch(err){
// handle errors
return response.end(err.message);
}
};
// express framework
app.get('/posts', function(req, res) {
// handle request in a Fiber, keep node spinning
wait.launchFiber(handleRequest,req,res);
});