I'm crawling a lot of links in parallel with the request module in combination with the async module.
I'm noticing a lot of ETIMEDOUT and ESOCKETTIMEDOUT errors, although the links are reachable and respond fairly quickly in Chrome.
I've limited maxSockets to 2 and the timeout to 10000 in the request options.
I'm using async.filterLimit() with a limit of 2 to cut the parallelism down to 2 requests at a time.
So I have 2 sockets, 2 requests, and a timeout of 10 seconds to wait for the response headers from the server, yet I still get these errors.
Here's the request configuration I use:
{
    ...
    pool: {
        maxSockets: 2
    },
    timeout: 10000,
    time: true
    ...
}
Here's the snippet of code I use to fetch the links:
var self = this;
async.filterLimit(resources, 2, function(resource, callback) {
    request({
        uri: resource.uri
    }, function (error, response, body) {
        if (!error && response.statusCode === 200) {
            ...
        } else {
            self.emit('error', resource, error);
        }
        callback(...);
    });
}, function(result) {
    callback(null, result);
});
I listened to the error event, and whenever the error code is ETIMEDOUT the connect property is sometimes true and sometimes false, so sometimes it's a connection timeout and sometimes it's not (according to the request docs).
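For reference, this is roughly how the two cases can be told apart in the error callback (a minimal sketch based on what the request docs say about error.connect; the log messages are just placeholders):
if (error && error.code === 'ETIMEDOUT') {
    if (error.connect === true) {
        // the TCP connection itself timed out before it was established
        console.log('connection-phase timeout');
    } else {
        // the connection was made, but the server did not start responding in time
        console.log('read/headers timeout');
    }
}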
UPDATE:
I decided to boost maxSockets up to Infinity so that no connection is hung up due to a lack of available sockets:
pool: {
    maxSockets: Infinity
}
In order to control the bandwidth, I implemented a requestLoop method that handles the request with maxAttempts and retryDelay parameters:
async.filterLimit(resources, 10, function(resource, callback) {
    self.requestLoop({
        uri: resource.uri
    }, 100, 5000, function (error, response, body) {
        var fetched = false;
        if (!error) {
            ...
        } else {
            ...
        }
        callback(...);
    });
}, function(result) {
    callback(null, result);
});
Implementation of requestLoop:
requestLoop = function(options, attemptsLeft, retryDelay, callback, lastError) {
    var self = this;
    if (attemptsLeft <= 0) {
        callback((lastError != null ? lastError : new Error('...')));
    } else {
        request(options, function (error, response, body) {
            var recoverableErrors = ['ESOCKETTIMEDOUT', 'ETIMEDOUT', 'ECONNRESET', 'ECONNREFUSED'];
            var e;
            if ((error && _.contains(recoverableErrors, error.code)) ||
                (response && (500 <= response.statusCode && response.statusCode < 600))) {
                // recoverable failure: remember it and retry after retryDelay
                e = new Error('...');
                e.code = error ? error.code : response.statusCode;
                setTimeout((function () {
                    self.requestLoop(options, --attemptsLeft, retryDelay, callback, e);
                }), retryDelay);
            } else if (!error && (200 <= response.statusCode && response.statusCode < 300)) {
                callback(null, response, body);
            } else if (error) {
                e = new Error('...');
                e.code = error.code;
                callback(e);
            } else {
                e = new Error('...');
                e.code = response.statusCode;
                callback(e);
            }
        });
    }
};
So to sum it up:
- Boosted maxSockets to Infinity to try to overcome socket-related timeout errors
- Implemented a requestLoop method to retry failed requests, with maxAttempts and retryDelay controlling those retries
- There's also a maximum number of concurrent requests, set by the number passed to async.filterLimit
I want to note that I've also played with every setting here in order to get error-free crawling, but so far those attempts have failed as well.
Still looking for help with solving this problem.
UPDATE2:
I've decided to drop async.filterLimit and implement my own limiting mechanism.
I just have 3 variables to help me achieve this:
pendingRequests - an array which will hold all requests (explained below)
activeRequests - the number of active requests
maxConcurrentRequests - the maximum number of allowed concurrent requests
Into the pendingRequests array I push a complex object containing a reference to the requestLoop function as well as an arguments array containing the arguments to be passed to the loop function:
self.pendingRequests.push({
    "arguments": [{
        uri: resource.uri.toString()
    }, self.maxAttempts, function (error, response, body) {
        if (!error) {
            if (self.policyChecker.isMimeTypeAllowed((response.headers['content-type'] || '').split(';')[0]) &&
                self.policyChecker.isFileSizeAllowed(body)) {
                self.totalBytesFetched += body.length;
                resource.content = self.decodeBuffer(body, response.headers["content-type"] || '', resource);
                callback(null, resource);
            } else {
                self.fetchedUris.splice(self.fetchedUris.indexOf(resource.uri.toString()), 1);
                callback(new Error('Fetch failed because the mime-type is not allowed or the file size is bigger than permitted'));
            }
        } else {
            self.fetchedUris.splice(self.fetchedUris.indexOf(resource.uri.toString()), 1);
            callback(error);
        }
        self.activeRequests--;
        self.runRequest();
    }],
    "function": self.requestLoop
});
self.runRequest();
You'll notice the call to runRequest() at the end.
This function's job is to manage the requests and fire them when it can, while keeping activeRequests under the maxConcurrentRequests limit:
var self = this;
process.nextTick(function() {
    var next;
    if (!self.pendingRequests.length || self.activeRequests >= self.maxConcurrentRequests) {
        return;
    }
    self.activeRequests++;
    next = self.pendingRequests.shift();
    next["function"].apply(self, next["arguments"]);
    self.runRequest();
});
This should solve any timeout errors. Through my testing, though, I've still noticed some timeouts on specific websites I tested this on. I can't be 100% sure about this, but I think it's due to the website's backing HTTP server limiting each user's requests by doing IP checking and, as a result, returning some HTTP 400 messages to prevent a possible 'attack' on the server.
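If that is indeed the cause, one hypothetical mitigation (not something my crawler currently does) would be to enforce a minimum delay per host, so a single site never sees a burst of requests from the same IP. A rough sketch, where MIN_DELAY_PER_HOST is an assumed politeness delay:
var url = require('url');
var lastScheduled = {};            // host -> timestamp of the last scheduled request
var MIN_DELAY_PER_HOST = 1000;     // assumed politeness delay in milliseconds

function delayForHost(uri) {
    var host = url.parse(uri).hostname;
    var now = Date.now();
    var wait = Math.max(0, (lastScheduled[host] || 0) + MIN_DELAY_PER_HOST - now);
    lastScheduled[host] = now + wait;
    return wait;
}

// inside runRequest(), the request could then be fired after the computed delay:
// setTimeout(function () { next["function"].apply(self, next["arguments"]); },
//     delayForHost(next["arguments"][0].uri));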
Edit: duplicate of https://stackoverflow.com/a/37946324/744276
By default, Node has 4 workers to resolve DNS queries. If your DNS queries take a longish time, requests will block on the DNS phase, and the symptom is exactly ESOCKETTIMEDOUT or ETIMEDOUT.
Try increasing your uv thread pool size:
export UV_THREADPOOL_SIZE=128
node ...
or in index.js (or wherever your entry point is):
#!/usr/bin/env node
process.env.UV_THREADPOOL_SIZE = 128;
function main() {
...
}
Edit 1: I also wrote a blog post about it.
Edit 2: if queries are non-unique, you may want to use a cache, like nscd.
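If an OS-level cache like nscd isn't an option, a rough in-process alternative is to memoize dns.lookup yourself. This is only a sketch, and it assumes your HTTP client lets you pass a custom lookup function down to the socket (Node's http.request accepts a lookup option):
const dns = require('dns');
const dnsCache = new Map();

function cachedLookup(hostname, options, callback) {
    if (typeof options === 'function') { // lookup(hostname, callback) form
        callback = options;
        options = {};
    }
    const hit = dnsCache.get(hostname);
    if (hit) {
        return process.nextTick(callback, null, hit.address, hit.family);
    }
    dns.lookup(hostname, options, function (err, address, family) {
        if (!err) dnsCache.set(hostname, { address: address, family: family });
        callback(err, address, family);
    });
}

// e.g. http.request({ host: '...', lookup: cachedLookup }, ...)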
I found that if there are too many async requests, an ESOCKETTIMEDOUT exception happens on Linux. The workaround I've found is setting these options on request():
agent: false, pool: {maxSockets: 100}
Notice that after that, the reported timeout can be misleading, so you might need to increase the timeout value.
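Put together, the call would look roughly like this (url and the raised timeout value are placeholders, not recommendations):
request({
    url: url,                   // placeholder
    agent: false,
    pool: { maxSockets: 100 },
    timeout: 30000              // raised, since the reported timeout can be misleading
}, function (error, response, body) {
    // ...
});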
Related
I'm using the well-known request module (https://github.com/request/request) to request some page; then I do some calculations and build the response code for the user.
The thing is that sometimes I receive the correct code (200) but the data (some kind of security measure?) keeps sending FOREVER - the request won't stop. How can I kill it after some period of "inactivity"?
var requestOptions =
{
    url : url,
    timeout : 10000, //// here we set the request timeout - 10 seconds
    headers : {'we can also set some headers' : 'sometimes the pages require headers - there is no proper response without them'}
};
var ourRequest = request(requestOptions, function(err, resp, body)
{
    if (err || resp.statusCode >= 400) // OR just - (err || resp.statusCode !== 200)
    {
        if (err)
        {
            console.log(err.connect);
        }
        if (resp)
        {
            console.log('ERROR but at least there is a response (code: ' + resp.statusCode + ')\n');
        }
        else
        {
            if ((err.code === 'ETIMEDOUT') && (err.connect === true))
            {
                //// when there's a timeout and connect is true, we're meeting the conditions described for the timeout option where the OS governs
                console.log('REQUEST TIMEOUT - SYSTEM FAULT!');
            }
            else if ((err.code === 'ESOCKETTIMEDOUT') || (err.code === 'ETIMEDOUT'))
            {
                console.log('REQUEST TIMEOUT - NODE EXPRESS/REQUEST FAULT!');
            }
            console.log('ERROR, you cant get the body of the page...');
        }
    }
    if (resp && resp.statusCode >= 300 && resp.statusCode < 400)
    {
        console.log('page REDIRECT');
    }
    if (!err && resp.statusCode == 200)
    {
        console.log('ok, we got the body, lets do something with it');
        console.log('some async code example to send part of the data to other servers, etc.');
    }
});
It's an internal server I get the data from - it's sometimes online and sometimes completely offline, and I discovered that some pages have similar "functionality". For example the validator: take some page as the URL, prepend 'https://validator.w3.org/nu/?doc=' to it, and make a few requests - you'll see that sometimes it hangs and the timeout doesn't fire (probably because a correct status code is received).
How can I kill the request after 10 seconds, even if it receives a proper status code? Thanks for any thoughts on this subject.
try this:
resp.request.abort();
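If the goal is a hard deadline rather than aborting from inside the callback, a minimal sketch (the 10-second value and the killTimer name are just illustrative) is to keep the object returned by request() and abort it from a timer:
var ourRequest = request(requestOptions, function(err, resp, body)
{
    clearTimeout(killTimer); // the request finished (or failed) in time
    // ... handle err / resp / body as above ...
});

// abort the transfer if it is still running after 10 seconds,
// even if a 200 status code has already arrived
var killTimer = setTimeout(function()
{
    ourRequest.abort();
}, 10000);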
I am working on a crawler. I have a list of URLs that need to be requested. Several hundred requests go out at the same time if I don't limit them, and I am afraid that would blow up my bandwidth or produce too much network traffic to the target website. What should I do?
Here is what I am doing:
urlList.forEach((url, index) => {
    console.log('Fetching ' + url);
    request(url, function(error, response, body) {
        // do sth for body
    });
});
I want one request to be made only after the previous one has completed.
You can use something like the bluebird Promise library, e.g. this snippet:
const Promise = require("bluebird");
const axios = require("axios");
//Axios wrapper for error handling
const axios_wrapper = (options) => {
return axios(...options)
.then((r) => {
return Promise.resolve({
data: r.data,
error: null,
});
})
.catch((e) => {
return Promise.resolve({
data: null,
error: e.response ? e.response.data : e,
});
});
};
Promise.map(
urls,
(k) => {
return axios_wrapper({
method: "GET",
url: k,
});
},
{ concurrency: 1 } // Here 1 represents how many requests you want to run in parallel
)
.then((r) => {
console.log(r);
//Here r will be an array of objects like {data: [{}], error: null}, where if the request was successfull it will have data value present otherwise error value will be non-null
})
.catch((e) => {
console.error(e);
});
The things you need to watch for are:
Whether the target site has rate limiting, which may get you blocked from access if you request too much too fast.
How many simultaneous requests the target site can handle without degrading its performance?
How much bandwidth your server has on its end of things?
How many simultaneous requests your own server can have in flight and process without causing excess memory usage or a pegged CPU.
In general, the scheme for managing all this is to create a way to tune how many requests you launch. There are many different ways to control this by number of simultaneous requests, number of requests per second, amount of data used, etc...
The simplest way to start would be to just control how many simultaneous requests you make. That can be done like this:
function runRequests(arrayOfData, maxInFlight, fn) {
return new Promise((resolve, reject) => {
let index = 0;
let inFlight = 0;
function next() {
while (inFlight < maxInFlight && index < arrayOfData.length) {
++inFlight;
fn(arrayOfData[index++]).then(result => {
--inFlight;
next();
}).catch(err => {
--inFlight;
console.log(err);
// purposely eat the error and let the rest of the processing continue
// if you want to stop further processing, you can call reject() here
next();
});
}
if (inFlight === 0) {
// all done
resolve();
}
}
next();
});
}
And then you would use it like this:
const rp = require('request-promise');
// run the whole urlList, no more than 10 at a time
runRequests(urlList, 10, function(url) {
return rp(url).then(function(data) {
// process fetched data here for one url
}).catch(function(err) {
console.log(url, err);
});
}).then(function() {
// all requests done here
});
This can be made as sophisticated as you want by adding a time element to it (no more than N requests per second) or even a bandwidth element to it.
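As an illustration of the time element (a sketch only; the 1-second window and the rateLimit name are mine, not part of the helper above), the fn passed to runRequests() could be wrapped so that no more than maxPerSecond requests are started in any second:
// wrap fn so at most maxPerSecond calls are started per rolling second
function rateLimit(maxPerSecond, fn) {
    let recentStarts = [];
    return function (item) {
        return new Promise((resolve) => {
            function tryStart() {
                const now = Date.now();
                recentStarts = recentStarts.filter(t => now - t < 1000);
                if (recentStarts.length < maxPerSecond) {
                    recentStarts.push(now);
                    resolve(fn(item)); // adopt fn's promise
                } else {
                    // wait until the oldest start falls out of the 1-second window
                    setTimeout(tryStart, 1000 - (now - recentStarts[0]));
                }
            }
            tryStart();
        });
    };
}

// usage: at most 10 in flight and at most 5 new requests per second
// runRequests(urlList, 10, rateLimit(5, url => rp(url)));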
I want one request to be made only after the previous one has completed.
That's a very slow way to do things. If you really want that, then you can just pass 1 for the maxInFlight parameter to the above function, but typically things will work a lot faster and not cause problems if you allow somewhere between 5 and 50 simultaneous requests. Only testing will tell you where the sweet spot is for your particular target sites, your server infrastructure, and the amount of processing you need to do on the results.
You can use the setTimeout function to space out the requests in the loop. For that you must know the maximum time needed to process a request.
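A minimal sketch of that idea (MAX_REQUEST_MS is an assumed upper bound per request, not something this answer specifies):
const request = require('request');
const MAX_REQUEST_MS = 5000; // assumed worst-case time for a single request

urlList.forEach((url, index) => {
    // stagger the requests so roughly one is in flight at a time
    setTimeout(() => {
        request(url, function(error, response, body) {
            // do sth for body
        });
    }, index * MAX_REQUEST_MS);
});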
I need to check the availability of about 300,000 URLs on a local server via HTTP. The files are not in a local file system but in a key-value store, and the goal is to sanity check whether every system needing access to those files is able to do so via HTTP.
To do so, I would use HTTP HEAD requests that return HTTP 200 for every file found and 404 for every file not found.
The problem is, if I do too many requests at once, I get rate limited by nginx or a local proxy, hence no info whether a file is really accessible.
My method to look for the availability of files looks as follows:
...
const request = require('request'); // Using the request lib.
...
const checkEntity = entity => {
    logger.debug("HTTP HEAD ", entity);
    return request({ method: "HEAD", uri: entity.url })
        .then(result => {
            logger.debug("Successfully retrieved file: " + entity.url);
            entity.valid = result != undefined;
        })
        .catch(err => {
            logger.debug("Failed to retrieve file.", err);
            entity.valid = false;
        });
}
If I call this function a few times, things work as expected. When trying to run it within recursive promises, I quickly exceed the maximum call stack size. Setting up one promise per call causes too much memory usage.
How could this be solved?
This problem can be solved in these steps:
Define a queue and store all your entities (all URLs that need to be checked).
Define how many HTTP requests you want to send in parallel. This number should be neither too small nor too large: if it's too small, the program is not efficient; if it's too large, the same rate-limiting problem will occur. Let's call it N; you can pick a reasonable number according to your server's status.
Send N HTTP requests in parallel at the beginning.
When one request finishes, fetch a new entity from the queue and send a new request. To get notified when a request is done, you can add a callback parameter to your checkEntity function.
In this way, the number of in-flight HTTP requests will never exceed N.
Here is a pseudo code example based on your code snippet:
let allEntities = [...]; // 300000 URLs
let finishedEntities = [];

const request = require('request'); // Using the request lib.
...

const checkEntity = function(entity, callback) {
    logger.debug("HTTP HEAD ", entity);
    return request({ method: "HEAD", uri: entity.url })
        .then(result => {
            logger.debug("Successfully retrieved file: " + entity.url);
            entity.valid = result != undefined;
            callback(entity);
        })
        .catch(err => {
            logger.debug("Failed to retrieve file.", err);
            entity.valid = false;
            callback(entity);
        });
}

function checkEntityCallback(entity) {
    finishedEntities.push(entity);
    let newEntity = allEntities.shift();
    if (newEntity) {
        checkEntity(newEntity, checkEntityCallback);
    }
}

// kick off N = 10 requests in parallel
for (let i = 0; i < 10; i++) {
    checkEntity(allEntities.shift(), checkEntityCallback);
}
To make things easier to understand, you can change the usage of request and remove all Promise stuff:
const checkEntity = function(entity, callback) {
    logger.debug("HTTP HEAD ", entity);
    request({ method: "HEAD", uri: entity.url }, function(error, response, body) {
        if (error) {
            logger.debug("Failed to retrieve file.", error);
            entity.valid = false;
            callback(entity);
            return;
        }
        logger.debug("Successfully retrieved file: " + entity.url);
        entity.valid = body != undefined;
        callback(entity);
    });
}
I am working on a Node.js web application to check the status codes of running websites. I have been able to get the status code and the response, and to catch errors, but I need to handle the error through the event handler.
In this code I am not able to fetch the status code into a variable, as I am not able to get the value outside of the http.request callback and store it there. There are other questions about accessing such a variable using callbacks, but they didn't help when I tried the same approach.
var http = require('http');
var statcode;
This is the code in the Handlebars helper function that I am calling to test a particular site:
sitefunction: function ()
{
    var options = {
        method: 'HEAD',
        port: 84,
        host: 'example-site',
        path: '/XYZ/'
    };
    var req = http.request(options, function(res) {
        // n1 = 7;
    });
    statcode = req.statusCode;
    req.on('error', function(err) {
        if (err.code == 'ENOTFOUND')
        {
            return 'Server Unreachable'
        }
        else
        {
            //console.log(err);
        }
    });
    if (statcode == 200)
        return 'Website Running fine' + statcode + '';
    else if (statcode == 304)
        return 'Website Running fine';
    else if (statcode == 0)
        return 'Status code 0'
    else
        return "Please check the http code " + statcode + " if any error";
    //return n1;
}
For the above code I am getting an error that the property req.statusCode cannot be read, whereas with the function below, which has no error handler, I get the desired response code, but the program terminates with an error when a connection error occurs.
Here is the code that works without the error handler:
In this code I am able to fetch the status code and store it in the variable statcode, but I need to handle the error when the host is not reachable, otherwise the program terminates.
txyzfunction: function ()
{
    var options = {
        method: 'HEAD',
        port: 84,
        host: 'example-site2',
        path: '/xyz/'
    };
    http.request(options, function(res) {
        // n1 = 7;
    });
    statcode = res.statusCode
    if (statcode == 200)
        return 'Website Running fine';
    else if (statcode == 304)
        return 'Website Running fine';
    else if (statcode == 0)
        return 'Status code 0'
    else
        return "Please check the http code " + statcode + " if any error";
    //return n1;
}
Is there a clean way to read the status code and handle the error event as well? With the second function I can't handle the error event, and with the first function I am not able to read the status code properly.
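For context, the usual pattern (a minimal sketch, not taken from this thread: the checkSite name and the callback shape are mine) is to read the status code inside the response callback and report the result through a callback, since the result of http.request cannot be read synchronously:
var http = require('http');

function checkSite(options, done) {
    var req = http.request(options, function (res) {
        res.resume();                 // drain the response so the socket is released
        done(null, res.statusCode);   // the status code only exists inside this callback
    });
    req.on('error', function (err) {
        done(err);                    // e.g. ENOTFOUND when the host is unreachable
    });
    req.end();                        // the request is not sent until end() is called
}

// usage:
// checkSite({ method: 'HEAD', port: 84, host: 'example-site', path: '/XYZ/' },
//     function (err, code) { /* build the message here */ });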
I have a weird problem. If I call this code to make an http request in the main execution line:
var request = require('request');
request('http://www.google.com', function (error, response, body) {
    if (!error && response.statusCode == 200) {
        console.log(body) // Print the google web page.
    }
})
The Google page HTML is printed, as expected.
However, I am doing a batch download script/crawler, so I am parsing a very large JSON file and then performing a request for each of the URLs I produce from that file.
To do the parsing, I am using the JSONStream parser. Here is the code:
parser.on('data', function (obj) {
    console.log("Found uri");
    console.log(obj);
});
The code runs correctly, as the URIs are printed to my console.
However, if I make the request inside the parsing block, the request callback is never executed... Here is the code:
parser.on('data', function (obj) {
    console.log("Found uri");
    console.log(obj);

    var identifierArray = obj['dc.identifier'];
    if (identifierArray != null && identifierArray instanceof Array)
    {
        for (var i = 0; i < identifierArray.length; i++)
        {
            var dryadIdentifier = identifierArray[i];
            if (dryadIdentifier.indexOf("dryad") != -1)
            {
                var fullUrl = "http://datadryad.org/resource/" + dryadIdentifier + "/mets.xml"
                //var fileDestination = __dirname +"/"+downloadSubDir+"/"+dryadIdentifier.replace("/","_")+".xml"
                var fileDestination = __dirname + "/" + downloadSubDir + "/" + fileCounter + ".xml";
                fileCounter++;

                console.log("Sending request to " + fullUrl + " ...");

                //REQUEST SENT HERE; SAME CODE AS ABOVE.
                var request = require('request');
                request('http://www.google.com', function (error, response, body) {
                    if (!error && response.statusCode == 200) {
                        console.log(body) // Print the google web page.
                    }
                })

                sleep.usleep(500000); //dont hammer the server
            }
        }
    }
});
The log shows
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4vk6d/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.c3k8m/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.5410v/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.492r0/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4dm30/mets.xml ...
But no HTML is printed (it should print the Google homepage many times, since I am not yet using the URLs I parse from the JSON, to rule out problems with the intended server).
Sorry for the long letter, but I am at a loss about this behaviour (still learning Node.js... :-O)
It seems the issue was related to the sleep call (it blocks the event loop, so the request callbacks never get a chance to run), so I implemented a basic connection queue with the semaphore library. I now specify a maximum of 10 simultaneous connections; here is my code:
var makeRequestAndSaveToFile = function(url, absolutePath)
{
    sem.take(function(){
        console.log("Sending request to " + url + " ... and saving to file " + absolutePath);
        request(url, function(error, response, body) {
            if (!error && response.statusCode == 200) {
                fs.writeFile(absolutePath, body, function(err) {
                    sem.leave();
                    if (err) {
                        console.log(err);
                    } else {
                        console.log("The file was saved!");
                    }
                });
            } else {
                sem.leave(); // release the slot on failure too, otherwise the queue stalls
            }
        });
    });
}
I call this function for each link I want to download.
Note that this will not handle big downloads, as there is no piping, and the links will be downloaded in an unordered fashion, like Slavo said in his comments.
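For completeness, a hedged sketch of the piping variant mentioned above (it streams the body straight to disk instead of buffering it; sem is assumed to be the same semaphore instance, and error handling is kept minimal):
var fs = require('fs');
var request = require('request');

var streamRequestToFile = function(url, absolutePath)
{
    sem.take(function(){
        request(url)
            .on('error', function(err) {
                console.log(err);
                sem.leave();
            })
            .pipe(fs.createWriteStream(absolutePath))
            .on('finish', function() {
                console.log("The file was saved!");
                sem.leave();
            });
    });
}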