I'm using the well-known request module (https://github.com/request/request): I request some page, do some calculations, and build the response for the user.
The thing is, sometimes I receive the correct status code (200) but the data keeps coming FOREVER (some kind of security measure?) - the request never stops. How can I kill it after some period of "inactivity"?
var requestOptions =
{
url : url,
timeout : 10000, //// here we set the request timeout - 10 seconds
headers : {'we can also set some headers' : 'sometimes the pages require headers - there is no proper response without them'}
};
var ourRequest = request(requestOptions, function(err, resp, body)
{
if (err || resp.statusCode >= 400) // OR just - (err || resp.statusCode !== 200)
{
if (err) console.log(err.connect); // guard: err is null when we got here via a 4xx/5xx status
if (resp)
{
console.log('ERROR, but at least there is a response (code: ' + resp.statusCode + ')\n');
}
else
{
if ((err.code === 'ETIMEDOUT') && (err.connect === true))
{
//// ETIMEDOUT with connect === true means the connection itself timed out - the case the timeout option docs attribute to the OS
console.log('REQUEST TIMEOUT - SYSTEM FAULT!');
}
else if ((err.code === 'ESOCKETTIMEDOUT') || (err.code === 'ETIMEDOUT'))
{
console.log('REQUEST TIMEOUT - NODE EXPRESS/REQUEST FAULT!');
}
console.log('ERROR you can\'t get the body of the page...');
}
}
if (resp && resp.statusCode >= 300 && resp.statusCode < 400) // guard resp - it's undefined on connection errors
{
console.log('page REDIRECT');
}
if (!err && resp.statusCode == 200)
{
console.log('ok, we get the body, lets do something with it');
console.log('some async code example to send part of the data to other servers, etc.');
}
});
It's an internal server I get the data from - it's sometimes online and sometimes completely offline, and I discovered that some pages have similar "functionality", for example the validator. Take some page as the url and prepend 'https://validator.w3.org/nu/?doc=' to it, then make a few requests: you'll see that it sometimes hangs, and the timeout doesn't fire on those requests (probably because they receive a correct status code).
How can I kill the request after 10 seconds, even if it receives a proper status code? Thanks for any thoughts on this subject.
try this:
resp.request.abort();
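For context, a minimal sketch of how that abort might be wired up (the watchdog timer is an assumption, not part of the original answer): keep the object returned by request() and abort it if the transfer is still running after 10 seconds.
var ourRequest = request(requestOptions, function(err, resp, body)
{
    clearTimeout(watchdog); // the callback fired, so stop the watchdog
    // ... handle err/resp/body as above; an aborted request shows up here as an error ...
});
//// hypothetical watchdog: kill the transfer even if a 200 status already arrived
var watchdog = setTimeout(function()
{
    ourRequest.abort();
}, 10000);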
I use node.js to make requests to Google.
I used Puppeteer, but it was detected as a bot, so I switched to tor-request.
I can change my IP on each connection, but Google still blocks me with the same error message.
I want to change the user agent of the Tor expert bundle. How can I do that?
Our systems have detected unusual traffic from your computer network.
Please try your request again later. Why did this happen?
IP address: 109.70.100.** Time: 2019-08-03T08:47:17Z URL:
https://www.google.com/search?q=re&gbv=1&sei=6UlFXY6mKsSiab3EjbAF
var tr = require('tor-request');
tr.TorControlPort.password = "***";
io.sockets.on('connection', async(socket) => {
socket.on('key', function(value) {
function torIp() {
tr.request('https://www.google.com/search?q=re&gbv=1&sei=6UlFXY6mKsSiab3EjbAF', function(error, response, body) {
if (!error && response.statusCode == 200) {
console.log("Your public IP is: " + body);
}
socket.emit('google_web', body)
});
}
torIp();
tr.renewTorSession(function(error, msg) {
console.log(error);
console.log(msg);
if (msg) {
torIp();
}
});
})
})
To change the user agent, you can use the method described in a tor-request issue:
tr.request({
url: 'https://www.google.com/search?q=re&gbv=1&sei=6UlFXY6mKsSiab3EjbAF',
headers: {
'user-agent': 'giraffe'
}
}, function(error, response, body) {
// ...
});
Just replace giraffe with whichever user agent you wish.
(But, as the comment noted, Google may well have blocked certain Tor exit nodes completely, in which case changing the user-agent won't help)
I am working on a Node.js web application that checks the status codes of running websites. I have been able to get the status code and the response, but I need to handle errors through the error event handler.
In this code I am not able to fetch the status code into a variable, because I can't get the value outside of the http.request callback and store it there. There are other questions related to accessing a variable through callbacks, but they didn't help when I tried the same.
var http = require('http');
var statcode;
This is the code in the Handlebars helper function that I call to test a particular site:
sitefunction: function ()
{
var options = {
method: 'HEAD',
port: 84,
host: 'example-site',
path:'/XYZ/'
};
var req = http.request(options, function(res) {
// n1 = 7;
});
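// note: the next line runs synchronously, before any response has arrived - and statusCode lives on the response object, not on req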
statcode = req.statusCode;
req.on('error', function(err) {
if(err.code == 'ENOTFOUND')
{
return 'Server Unreachable'
}
else
{
//console.log(err);
}
});
if (statcode == 200)
return 'Website Running fine'+statcode+'';
else if(statcode == 304)
return 'Website Running fine';
else if (statcode == 0)
return 'Status code 0'
else
return "Please check the http code "+statcode+" if any error";
//return n1;
}
For the above code I get an error, unable to read the property req.statusCode, whereas with the function below, without the error handler, I get the desired response code, but the program terminates with an error when a connection error occurs.
Here is the code that works without an error handler:
In this code I am able to fetch the status code and store it in the variable statcode, but I need to handle the error when the host is not reachable, or else the program terminates.
txyzfunction: function ()
{
var options = {
method: 'HEAD',
port: 84,
host: 'example-site2',
path:'/xyz/'
};
http.request(options, function(res) {
// n1 = 7;
statcode = res.statusCode
}).end()
if ( statcode == 200)
return 'Website Running fine';
else if( statcode == 304)
return 'Website Running fine';
else if ( statcode == 0)
return 'Status code 0'
else
return "Please check the http code "+ statcode+" if any error";
//return n1;
}
Is there a clean way to read the status code and also handle the error event? With the second function I can't handle the error event, and with the first I can't read the status code properly.
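A minimal sketch of the usual pattern, assuming the same options object: read the status code inside the response callback and hand the result to a callback instead of returning it (the names checkSite and done are illustrative, not from the original code):
var http = require('http');
function checkSite(options, done) {
    var req = http.request(options, function(res) {
        // the status code only exists here, once the response has arrived
        if (res.statusCode == 200 || res.statusCode == 304)
            done(null, 'Website Running fine');
        else
            done(null, 'Please check the http code ' + res.statusCode + ' if any error');
    });
    req.on('error', function(err) {
        // fires when the host is unreachable, instead of crashing the program
        done(null, err.code == 'ENOTFOUND' ? 'Server Unreachable' : 'Error: ' + err.code);
    });
    req.end(); // the request isn't actually sent until end() is called
}
checkSite({ method: 'HEAD', port: 84, host: 'example-site', path: '/XYZ/' }, function(err, message) {
    console.log(message);
});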
I'm crawling a lot of links with the request module in parallel, in combination with the async module.
I'm noticing a lot of ETIMEDOUT and ESOCKETTIMEDOUT errors, although the links are reachable and respond fairly quickly in Chrome.
I've limited maxSockets to 2 and the timeout to 10000 in the request options.
I'm using async.filterLimit() with a limit of 2 to cut the parallelism down to 2 requests at a time.
So I have 2 sockets, 2 requests, and a timeout of 10 seconds to wait for a headers response from the server, yet I still get these errors.
Here's the request configuration I use:
{
...
pool: {
maxSockets: 2
},
timeout: 10000,
time: true
...
}
Here's the snippet of code I use to fetch links:
var self = this;
async.filterLimit(resources, 2, function(resource, callback) {
request({
uri: resource.uri
}, function (error, response, body) {
if (!error && response.statusCode === 200) {
...
} else {
self.emit('error', resource, error);
}
callback(...);
})
}, function(result) {
callback(null, result);
});
I listened to the error event, and I see that whenever the error code is ETIMEDOUT, the connect property is sometimes true and sometimes false - so sometimes it's a connection timeout and sometimes it's not (according to the request docs).
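For reference, a minimal sketch of that distinction inside an error handler (this mirrors the behavior described in the request docs; the listener wiring is illustrative):
self.on('error', function(resource, error) {
    if (error.code === 'ETIMEDOUT' && error.connect === true) {
        // the TCP connection itself timed out before being established
    } else if (error.code === 'ESOCKETTIMEDOUT' || error.code === 'ETIMEDOUT') {
        // connected, but the server was too slow sending the response
    }
});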
UPDATE:
I decided to boost maxSockets up to Infinity so that no connection hangs due to a lack of available sockets:
pool: {
maxSockets: Infinity
}
In order to control the bandwidth, I implemented a requestLoop method that handles each request with maxAttempts and retryDelay parameters:
async.filterLimit(resources, 10, function(resource, callback) {
self.requestLoop({
uri: resource.uri
}, 100, 5000, function (error, response, body) {
var fetched = false;
if (!error) {
...
} else {
....
}
callback(...);
});
}, function(result) {
callback(null, result);
});
Implementation of requestLoop:
requestLoop = function(options, attemptsLeft, retryDelay, callback, lastError) {
var self = this;
if (attemptsLeft <= 0) {
callback((lastError != null ? lastError : new Error('...')));
} else {
request(options, function (error, response, body) {
var recoverableErrors = ['ESOCKETTIMEDOUT', 'ETIMEDOUT', 'ECONNRESET', 'ECONNREFUSED'];
var e;
if ((error && _.contains(recoverableErrors, error.code)) || (response && (500 <= response.statusCode && response.statusCode < 600))) {
e = new Error('...');
e.code = error ? error.code : response.statusCode;
setTimeout((function () {
self.requestLoop(options, --attemptsLeft, retryDelay, callback, e);
}), retryDelay);
} else if (!error && (200 <= response.statusCode && response.statusCode < 300)) {
callback(null, response, body);
} else if (error) {
e = new Error('...');
e.code = error.code;
callback(e);
} else {
e = new Error('...');
e.code = response.statusCode;
callback(e);
}
});
}
};
So to sum it up:
- Boosted maxSockets to Infinity to try to overcome the socket-connection timeout errors
- Implemented a requestLoop method to control failed requests, with maxAttempts and retryDelay parameters for such requests
- There's also a maximum number of concurrent requests, set by the number passed to async.filterLimit
I want to note that I've also played with all the settings here in order to get error-free crawling, but so far those attempts have failed as well.
Still looking for help about solving this problem.
UPDATE2:
I've decided to drop async.filterLimit and build my own limiting mechanism.
I just have 3 variables to help me achieve this:
pendingRequests - an array which will hold all pending requests (explained below)
activeRequests - the number of active requests
maxConcurrentRequests - the maximum number of allowed concurrent requests
Into the pendingRequests array I push a complex object containing a reference to the requestLoop function, as well as an arguments array holding the arguments to be passed to the loop function:
self.pendingRequests.push({
"arguments": [{
uri: resource.uri.toString()
}, self.maxAttempts, function (error, response, body) {
if (!error) {
if (self.policyChecker.isMimeTypeAllowed((response.headers['content-type'] || '').split(';')[0]) &&
self.policyChecker.isFileSizeAllowed(body)) {
self.totalBytesFetched += body.length;
resource.content = self.decodeBuffer(body, response.headers["content-type"] || '', resource);
callback(null, resource);
} else {
self.fetchedUris.splice(self.fetchedUris.indexOf(resource.uri.toString()), 1);
callback(new Error('Fetch failed because a mime-type is not allowed or the file size is bigger than permitted'));
}
} else {
self.fetchedUris.splice(self.fetchedUris.indexOf(resource.uri.toString()), 1);
callback(error);
}
self.activeRequests--;
self.runRequest();
}],
"function": self.requestLoop
});
self.runRequest();
You'll notice the call to runRequest() at the end.
This function's job is to manage the requests: fire them when possible, while keeping activeRequests under the maxConcurrentRequests limit:
runRequest = function() {
var self = this;
process.nextTick(function() {
var next;
if (!self.pendingRequests.length || self.activeRequests >= self.maxConcurrentRequests) {
return;
}
self.activeRequests++;
next = self.pendingRequests.shift();
next["function"].apply(self, next["arguments"]);
self.runRequest();
});
};
This should solve the timeout errors. Through my testing, though, I've still noticed some timeouts on specific websites I tested this on. I can't be 100% sure about it, but I think it's because the http server backing the website limits each client's requests by checking IPs, and as a result returns HTTP 400 messages to prevent a possible 'attack' on the server.
Edit: duplicate of https://stackoverflow.com/a/37946324/744276
By default, Node has 4 workers to resolve DNS queries. If your DNS queries take a long-ish time, requests will block on the DNS phase, and the symptom is exactly ESOCKETTIMEDOUT or ETIMEDOUT.
Try increasing your uv thread pool size:
export UV_THREADPOOL_SIZE=128
node ...
or in index.js (or wherever your entry point is):
#!/usr/bin/env node
process.env.UV_THREADPOOL_SIZE = 128;
function main() {
...
}
Edit 1: I also wrote a blog post about it.
Edit 2: if queries are non-unique, you may want to use a cache, like nscd.
I found that if there are too many async requests, ESOCKETTIMEDOUT exceptions happen on Linux. The workaround I've found is this:
setting these options on request():
agent: false, pool: {maxSockets: 100}
Notice that after that, the timeout can lie, so you might need to increase it.
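Put together, a minimal sketch of a call with those options (the URL and the raised timeout value are placeholders, not from the original answer):
var request = require('request');
request({
    url: 'http://example.com', // placeholder
    agent: false,              // skip the shared keep-alive agent
    pool: { maxSockets: 100 },
    timeout: 30000             // raised, since the timeout can fire early with this setup
}, function(error, response, body) {
    // ...
});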
Overview
I'm developing an MVC application with NodeJS. When the application loads for the first time, the database object (using a pool) is created.
var pool = mysql.createPool({connectionLimit: 150, host: __host,
user: __user, password: __password,
database: __database})
module.exports = pool
When a request is received, a Controller object is created, which creates a Model to perform actions. The model gets a connection from the pool, performs the action, and releases the connection back to the pool.
//router snippet
router.get('/post_data', function(req, res){
router.setRequestAndResponse(req, res)
var post_data = new Post_Data()
post_data.processDataFromGet(router)
})
//controller code snippet
Post_Data_Controller.prototype.processDataFromGet = function(router){
var controller_obj = this
var data_array = {}
var req = router.req, res = router.res
//retrieving data from request and passing to the data_array
controller_obj.model.create(data_array, function(result){
var xml = xmlbuilder.create("response")
if (result.code == "error"){
xml.e("code", "error")
xml.e("message", result.error_message)
}else if (result.code == "success"){
xml.e("code", "success")
}
controller_obj.sendResponse(router.res, xml, "xml")
})
}
Post_Data_Controller.prototype.sendResponse = function(res, response, type){
if (type == "json"){
res.set({"Content-Type": "application/json", "Content-Length": JSON.stringify(response).length})
res.send(response)
}else{ /* Default type is XML */
res.set({"Content-Type": "application/xml", "Content-Length": response.end({pretty: true}).length})
res.send(response.end({pretty: true}))
}
}
//Model snippet
Post_Data.prototype.create = function(data_array, callback){
/* data validation */
var fail = false, error_data = {}
if (fail) {callback({code: "fail", cause: error_data}); return;}
//the next 2 lines do not throw an error when uncommented
//callback({code: "fail", cause: "how's it going"});
//return;
__db_pool.getConnection(function(err, db_conn){
// the next two lines throw an error for two or more requests coming in at the same time
callback({code: "fail", cause: "how's it going"});
return;
if (err) { callback({code: "error", error_message: err}); return;}
callback({code: "fail", cause: "how's it going"});
return;
db_conn.query("sql command", [data_array],
function(err, result){
if (err){ callback({code: "error", error_message: err}); return;}
if (result && result.length > 0){ //affiliate and listing exist
data_array.listing_id = result[0].listings_id
var data = [data_to_insert]
db_conn.query("sql command here", data,
function(err, result){
db_conn.release()
if (err){ callback({code: "error", error_message: err}); return;}
if (result && result.affectedRows > 0) {
callback({code: "success", data: {data_to_be_returned}})
}else {callback({code: "error", error_message:"Error inserting data"}); return}
})
}else{
callback({code: "fail", cause: "error to send back"})}
})
})
}
Problem
These requests are web service requests. If I send one GET request, no error happens; however, when I send two or more concurrent requests, I receive this error:
/project_path/node_modules/mysql/lib/protocol/Parser.js:82
throw err;
^
Error: Can't set headers after they are sent.
at ServerResponse.OutgoingMessage.setHeader (http.js:689:11)
at ServerResponse.res.set.res.header (/project_path/node_modules/express/lib/response.js:549:10)
I traced the culprit to the specific line in the Model code pasted above. It seems that once the model obtains a connection from the pool for the second request, it somehow interferes with the first request. Both requests still insert the proper data into the database; however, the second and subsequent requests can no longer send a response without throwing an error.
I have performed the requests with GET, POST, and PUT content-types; only GET throws the error. All the other content-types don't throw any error, even with over one thousand concurrent requests.
Here's the web service code for the GET requests; it's the same for the other content-types except for the content-type changes and the data being put in the body.
for(var i = 0; i < 5; i++){
    sendAsGet()
}
function sendAsGet(){
try{
var data = "?data_to_be_sent"
var uri =url.parse("http://localhost:4000/post_data")
var options = {hostname: uri.hostname, port: uri.port, method: "GET",
path: uri.path + data, agent: false}
request = (uri.protocol == "https")? https : http
var req = request.request(options, function(res){
var result = ""
console.log("STATUS: " + res.statusCode)
console.log("HEADERS: " + JSON.stringify(res.headers))
res.setEncoding("utf8")
res.setTimeout(50, null)
res.on("data", function(chunk){
result += chunk
})
res.on("end", function(){
console.log(result)
})
})
req.end()
}catch(err){
console.log(err.message)
}
}
I would like to know 2 things:
Why is getting the database connection causing this problem?
Why does it happen only on GET requests and not on POST and PUT?
Google and previous SO questions haven't been able to help so far.
Thanks.
The reason you are seeing the error is that you're placing request/response instances on the router itself. Don't do that. The router object is a "static" object; it's not per-request. So currently this is what's happening (in order):
Request #1 comes in and sets req/res on router and starts the asynchronous model.create().
Meanwhile, request #2 comes in and overwrites req/res on router and starts its own asynchronous model.create().
Request #1's model.create() callback is called, sending the response to request #2's socket instead.
Request #2's model.create() callback is called, where it attempts to send a response on the same res that was responded to a moment ago. Trying to write headers to a response that has already been sent results in the error you are seeing.
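A minimal sketch of the fix this implies - pass the per-request objects through explicitly instead of storing them on the router (the changed signatures are assumptions about the surrounding code):
//router snippet - req/res stay local to this handler
router.get('/post_data', function(req, res){
    var post_data = new Post_Data()
    post_data.processDataFromGet(req, res)
})
//controller snippet - receive req/res as arguments instead of reading router.req/router.res
Post_Data_Controller.prototype.processDataFromGet = function(req, res){
    // ... same body as before, using req and res directly ...
}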
I have a weird problem. If I call this code to make an HTTP request in the main execution line:
var request = require('request');
request('http://www.google.com', function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Print the google web page.
}
})
The Google page HTML is printed, as expected.
However, I am writing a batch download script/crawler, so I am parsing a very large JSON file and then performing a request for each of the URLs I produce from that file.
To do the parsing, I am using the JSONStream parser. Here is the code:
parser.on('data', function (obj) {
console.log("Found uri");
console.log(obj);
});
The code runs correctly, as the URIs are printed in my console.
However, if I make the request inside the parsing block, the request callback is never executed... Here is the code:
parser.on('data', function (obj) {
console.log("Found uri");
console.log(obj);
var identifierArray = obj['dc.identifier'];
if(identifierArray != null && identifierArray instanceof Array)
{
for(var i = 0; i < identifierArray.length; i++)
{
var dryadIdentifier = identifierArray[i];
if(dryadIdentifier.indexOf("dryad") != -1)
{
var fullUrl = "http://datadryad.org/resource/"+dryadIdentifier+"/mets.xml"
//var fileDestination = __dirname +"/"+downloadSubDir+"/"+dryadIdentifier.replace("/","_")+".xml"
var fileDestination = __dirname +"/"+downloadSubDir+"/"+fileCounter+".xml";
fileCounter++;
console.log("Sending request to "+ fullUrl + " ...");
//REQUEST SENT HERE; SAME CODE AS ABOVE.
var request = require('request');
request('http://www.google.com', function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Print the google web page.
}
})
sleep.usleep(500000); //dont hammer the server
}
}
}
});
The log shows
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.s737f/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.1fd83/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4vk6d/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.c3k8m/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.5410v/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.492r0/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.m6g1b/1/mets.xml ...
Sending request to http://datadryad.org/resource/doi:10.5061/dryad.4dm30/mets.xml ...
But no HTML is printed (it should print the Google homepage many times, since I am not yet using the URLs I parse from the JSON, to rule out problems with the intended server).
Sorry for the long letter, but I am at a loss over this behaviour (still learning nodejs... :-O)
It seems that the issue was related to the "sleep" call: sleep.usleep blocks the whole event loop, so the request callbacks never get a chance to run. I implemented a basic connection queue with the semaphore library instead. I now specify a maximum of 10 simultaneous connections; here is my code:
var request = require('request');
var fs = require('fs');
var sem = require('semaphore')(10); // allow at most 10 concurrent requests
var makeRequestAndSaveToFile = function(url, absolutePath)
{
sem.take(function(){
console.log("Sending request to "+ url + " ... and saving to file "+absolutePath);
request(url, function(error,response, body) {
if (!error && response.statusCode == 200) {
fs.writeFile(absolutePath, body, function(err) {
sem.leave();
if(err) {
console.log(err);
} else {
console.log("The file was saved!");
}
});
} else {
sem.leave(); // release the slot on failure too, otherwise the queue stalls
}
});
});
}
I call this function for each link I want to download.
Note that this will not handle big downloads, as there is no piping, and the links will be downloaded in an unordered fashion, as Slavo said in his comments.
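For bigger files, a minimal streaming variant of the same function might look like this (a sketch assuming the same request/fs/semaphore setup; error handling kept minimal):
var streamToFile = function(url, absolutePath)
{
    sem.take(function(){
        request(url)
            .on('error', function(err) {
                sem.leave();
                console.log(err);
            })
            .pipe(fs.createWriteStream(absolutePath))
            .on('finish', function() {
                sem.leave();
                console.log("The file was saved!");
            });
    });
}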