Get HTML page by URL - node.js

Here is my code:
utilitesRouter.route('/url')
    .post(function(request, response) {
        console.log(request.body.uri);
        var urlOpts = { host: request.body.uri, path: '/', port: '80', method: 'GET' };
        var re = /(<\s*title[^>]*>(.+?)<\s*\/\s*title)>/gi;
        http.get(urlOpts, function (response) {
            response.on('data', function (chunk) {
                var str = chunk.toString();
                console.log(str);
                var match = re.exec(str);
                if (match && match[2]) {
                    console.log(match[2]);
                }
            });
        });
        response.json({ url: request.body.uri });
    });
If I use a POST request with this JSON {"uri":"google.ru" } I get:
302 Moved
google.ru
<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>301 Moved</TITLE></HEAD><BODY>
<H1>301 Moved</H1>
The document has moved
here.
</BODY></HTML>
If I use a POST request with the JSON {"uri":"http://google.ru" } I get this error message:
events.js:85
throw er; // Unhandled 'error' event
^
Error: getaddrinfo ENOTFOUND http://google.ru
at errnoException (dns.js:44:10)
at GetAddrInfoReqWrap.onlookup [as oncomplete] (dns.js:94:26)
I can open http://google.ru in my browser.
How can I get the HTML using node.js?

You may want to use request to do that. It's pretty easy:
var request = require("request");

router.get('/proxy', function(req, res, next){
    request.get(req.body.uri, function(error, response, body){
        if (error)
            return next(error);
        res.send(body);
    });
});
request also supports streaming and other cool features.
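For instance, a minimal streaming sketch for the same hypothetical /proxy route (the status code, headers and body of the remote response are piped straight back to the client instead of being buffered in memory):

var request = require("request");

router.get('/proxy', function(req, res, next){
    request.get(req.body.uri)
        .on('error', next)   // forward DNS/connection errors to the Express error handler
        .pipe(res);          // stream the remote response through to the client
});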

You get the error because the host attribute in your urlOpts has to be a domain name, like google.ru or www.google.ru. Since you are putting a whole URL into it, it cannot be resolved to an IP address via DNS, which is why the error surfaces at GetAddrInfoReqWrap.onlookup [as oncomplete] (dns.js:94:26).
If you want to use http.get() the way you do, you always have to extract the domain part out of the passed uri, i.e. get google.ru out of http://google.ru and use that as host, as sketched below.
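A minimal sketch of that extraction, assuming the same route as in the question (url.parse() is used here for illustration; the title-matching part of your code is unchanged):

var http = require('http');
var url = require('url');

// request.body.uri may arrive with or without a protocol, so normalise it
// before url.parse() so the hostname can be found.
var raw = request.body.uri;                        // e.g. "http://google.ru" or "google.ru"
var parsed = url.parse(/^https?:\/\//.test(raw) ? raw : 'http://' + raw);

var urlOpts = {
    host: parsed.hostname,                         // "google.ru" - domain only, no protocol
    path: parsed.path || '/',
    port: parsed.port || 80,
    method: 'GET'
};

http.get(urlOpts, function (res) {
    res.on('data', function (chunk) {
        console.log(chunk.toString());
    });
});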

Related

getting Error: getaddrinfo ENOTFOUND while performing rest api call in node.js using http.request

I have created an API in node.js which consumes a set of APIs hosted at http://dev.abc.co.in:20081.
Not every time, but randomly, it throws the error:
Error: getaddrinfo ENOTFOUND dev.abc.co.in
    at GetAddrInfoReqWrap.onlookup [as oncomplete] (dns.js:60:26) {
  errno: 'ENOTFOUND',
  code: 'ENOTFOUND',
  syscall: 'getaddrinfo',
  hostname: 'dev.abc.co.in'
}
To call those APIs I used the request node module; because I started getting this error I switched to the fetch-node npm module and finally replaced the code with the internal node http module, but I am getting the same error.
Here is the code I have written using http.request:
try {
    const options = {
        hostname: "dev.abc.co.in",
        port: 20081,
        path: "/api/entity/workorder",
        method: Config.method
    };
    if (Config.headers) {
        options.headers = Config.headers;
    }
    const req = http.request(options, (res) => {
        let data = '';
        res.on('data', (chunk) => {
            data += chunk;
        });
        res.on('end', () => {
            callback(res, data);
        });
        req.socket.destroy();
    }).on("error", (err) => {
        console.log("===Error: ", err);
        callback(null, err);
    });
    if (Config.method != "GET" && Config.body) {
        Config.headers["Content-Length"] = Config.body.length;
        req.write(Config.body);
    }
    req.end();
} catch (e) {
    console.log("Exception=====", e);
}
As shown in the error message, the issue is DNS related, so I tried to resolve the DNS with
node -pe 'require("dns").lookup("dev-vsg.dovertech.co.in", function(){ console.dir(arguments) })'
but it is still not resolved.
1) Omit 'http://' from the beginning of your domain, as well as any trailing slashes or path after the actual domain.
2) Try to resolve your hostname:
const dns = require('dns');

dns.resolve("testdomain.com", 'ANY', (err, records) => {
    if (err) {
        console.log("Error: ", err);
    } else {
        console.log(records);
    }
});
If DNS records are returned, then you know it is a node.js problem and we can investigate further. If not, it is a domain configuration issue.
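Note that getaddrinfo goes through the operating system resolver, which is what dns.lookup() uses, while dns.resolve() queries name servers directly. A small sketch to reproduce the failing path more closely (the hostname is simply the one from the question):

const dns = require('dns');

// dns.lookup() uses getaddrinfo, the same resolution path http.request takes,
// so it is the closer reproduction of the ENOTFOUND error above.
dns.lookup('dev.abc.co.in', (err, address, family) => {
    if (err) {
        console.log('lookup failed:', err.code);   // e.g. ENOTFOUND
    } else {
        console.log('lookup resolved to', address, '(IPv' + family + ')');
    }
});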

Error: getaddrinfo ENOTFOUND parishackers.org parishackers.org:80

The error Error: getaddrinfo ENOTFOUND parishackers.org parishackers.org:80 happens.
I wrote this code:
var Crawler = require("node-webcrawler");
var url = require('url');

var c = new Crawler({
    maxConnections: 10,
    // This will be called for each crawled page
    callback: function (error, result, $) {
        // $ is Cheerio by default
        // a lean implementation of core jQuery designed specifically for the server
        if (error) {
            console.log(error);
        } else {
            console.log($("title").text());
        }
    }
});

// Queue just one URL, with default callback
c.queue('http://www.amazon.com');

// Queue a list of URLs
c.queue(['http://www.google.com/', 'http://www.yahoo.com']);

// Queue URLs with custom callbacks & parameters
c.queue([{
    uri: 'http://parishackers.org/',
    jQuery: false,
    // The global callback won't be called
    callback: function (error, result) {
        if (error) {
            console.log(error);
        } else {
            console.log('Grabbed', result.body.length, 'bytes');
        }
    }
}]);

// Queue some HTML code directly without grabbing (mostly for tests)
c.queue([{
    html: '<p>This is a <strong>test</strong></p>'
}]);
but when I run the code,
Google
Yahoo
Amazon.com: Online Shopping for Electronics, Apparel, Computers, Books, DVDs & more
{ Error: getaddrinfo ENOTFOUND parishackers.org parishackers.org:80
at errnoException (dns.js:50:10)
at GetAddrInfoReqWrap.onlookup [as oncomplete] (dns.js:92:26)
code: 'ENOTFOUND',
errno: 'ENOTFOUND',
syscall: 'getaddrinfo',
hostname: 'parishackers.org',
host: 'parishackers.org',
port: 80 }
the error happens. I think the program scrapes part of the data, but I do not know why it cannot complete the run. I installed the library with npm install node-webcrawler. I read on another site that this error happens because of a wrong link, right? How should I fix this? What is wrong in my code?
The error (getaddrinfo ENOTFOUND parishackers.org parishackers.org:80) is thrown because the domain http://parishackers.org cannot be resolved. Use valid URLs and your node-webcrawler will work like a charm. The snippet is modified below for your reference:
var Crawler = require("node-webcrawler");
var url = require('url');

var c = new Crawler({
    maxConnections: 10,
    // This will be called for each crawled page
    callback: function (error, result, $) {
        // $ is Cheerio by default
        // a lean implementation of core jQuery designed specifically for the server
        if (error) {
            console.log(error);
        } else {
            console.log($("title").text());
        }
    }
});

// Queue just one URL, with default callback
c.queue('http://www.amazon.com');

// Queue a list of URLs
c.queue(['http://www.google.com/', 'http://www.yahoo.com']);

// Queue URLs with custom callbacks & parameters
c.queue([{
    uri: 'http://www.amazon.com',
    jQuery: false,
    // The global callback won't be called
    callback: function (error, result) {
        if (error) {
            console.log(error);
        } else {
            console.log('Grabbed', result.body.length, 'bytes');
        }
    }
}]);

// Queue some HTML code directly without grabbing (mostly for tests)
c.queue([{
    html: '<p>This is a <strong>test</strong></p>'
}]);

Twitter stream api

I am trying to learn and understand nodejs. While trying to connect to the Twitter stream API and track tweets, I get the following error:
undefined:1
<html>\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=ut
^
SyntaxError: Unexpected token <
at Object.parse (native)
at IncomingMessage.<anonymous> (/home/ytsejam/public_html/nodejs/11/twitter.js:15:20)
at IncomingMessage.emit (events.js:95:17)
at IncomingMessage.<anonymous> (_stream_readable.js:764:14)
at IncomingMessage.emit (events.js:92:17)
at emitReadable_ (_stream_readable.js:426:10)
at emitReadable (_stream_readable.js:422:5)
at readableAddChunk (_stream_readable.js:165:9)
at IncomingMessage.Readable.push (_stream_readable.js:127:10)
at HTTPParser.parserOnBody [as onBody] (http.js:142:22)
Here is the code I use to connect:
var https = require("https");

var options = {
    host: 'stream.twitter.com',
    path: '/1.1/statuses/filter.json?track=bieber',
    method: 'GET',
    headers: {
        "Authorization": "Basic " + new Buffer("username:password").toString("base64")
    }
};

var request = https.request('https://stream.twitter.com/1.1/statuses/filter.json?track=query', function(response){
    var body = '';
    response.on("data", function(chunk){
        var chunk = chunk.toString();
        try {
            var tweet = JSON.parse(chunk);
        } catch (err) {
            console.log("JSON parse error:" + err);
        }
        console.log(tweet.text);
    });
    response.on("end", function(){
        console.log("Disconnected");
    });
});
request.end();
I did some research and tried to debug. My best guess is that var tweet = JSON.parse(chunk); may be causing the problem. The second possibility is that I am missing OAuth parameters.
Can you help me? Thanks.
Edit:
I solved this using the answer here: Node.js and Twitter API 1.1
JSON.parse() is throwing a SyntaxError because the data it is trying to parse is HTML and not JSON.
In general, it's a good idea to wrap JSON.parse() in a try/catch block so you can handle those sorts of things gracefully.
(It is possible that there is a problem in your oauth stuff and it is failing to authenticate. So instead of getting JSON, you are getting an HTML page telling you that authentication has failed. But that is just a guess.)
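If you do get valid stream data, note that a single chunk is also not guaranteed to contain exactly one JSON object; the streaming endpoint delivers newline-delimited JSON plus blank keep-alive lines. A defensive sketch of the data handler, under that assumption:

var buffer = '';

response.on('data', function (chunk) {
    buffer += chunk.toString();
    var lines = buffer.split('\r\n');
    buffer = lines.pop();                 // keep any incomplete trailing line for the next chunk
    lines.forEach(function (line) {
        if (!line.trim()) return;         // skip keep-alive newlines
        try {
            var tweet = JSON.parse(line);
            if (tweet.text) console.log(tweet.text);
        } catch (err) {
            // Not JSON - most likely an HTML or plain-text error page (e.g. a failed authentication)
            console.log('Non-JSON data received: ' + line.slice(0, 80));
        }
    });
});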

How to catch getaddrinfo ENOTFOUND

I have a list of links that I need to check before processing some data. Checking headers with http.get returns an error:
events.js:72
throw er; // Unhandled 'error' event
^
Error: getaddrinfo ENOTFOUND
at errnoException (dns.js:37:11)
I cannot handle this error, and it exits the process. I tried res.on("error") and try..catch around http.get, but nothing works.
Below is the code snippet, and here is a live example at runnable.com:
// This is OK
getHeaders('http://google.com/404pag-that-does-not-exit');

// Here is the error.
// Uncatchable error!
getHeaders('http://doesnotexistooooo.com');

function getHeaders(link){
    var _http = require("http");
    var myUrl = require("url");
    var qs = (myUrl.parse(link).search == null) ? "" : myUrl.parse(link).search;
    var path = myUrl.parse(link).pathname;
    var options = {
        hostname: myUrl.parse(link).hostname,
        path: path + qs,
        method: 'HEAD'
    };
    _http.get(options, function(res) {
        res.on('error', function(e){
            console.log("Error: " + myUrl.parse(link).hostname + "\n" + e.message);
            console.log(e.stack);
        });
        console.log('STATUS: ' + res.statusCode);
        console.log('HEADERS: ' + JSON.stringify(res.headers));
    });
}
You just need to handle the error event, as stated in the error message. According to the documentation:
If any error is encountered during the request (be that with DNS resolution, TCP level errors, or actual HTTP parse errors) an 'error' event is emitted on the returned request object.
Here is a usage example:
var getRequest = _http.get(options, function(res) {
    // …
});

getRequest.on('error', function (err) {
    console.log(err);
});
which yields:
$ node test.js
{ [Error: getaddrinfo ENOTFOUND] code: 'ENOTFOUND', errno: 'ENOTFOUND', syscall: 'getaddrinfo' }
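Applied to the getHeaders() function from the question, the same pattern would look roughly like this (a sketch, not a drop-in replacement):

function getHeaders(link) {
    var _http = require('http');
    var parsed = require('url').parse(link);

    var req = _http.request({ hostname: parsed.hostname, path: parsed.path, method: 'HEAD' }, function (res) {
        console.log('STATUS: ' + res.statusCode);
        console.log('HEADERS: ' + JSON.stringify(res.headers));
    });

    // Errors (including getaddrinfo ENOTFOUND) are emitted on the request object, not the response.
    req.on('error', function (err) {
        console.log('Error: ' + parsed.hostname + '\n' + err.message);
    });

    req.end();
}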
At the very top level, you can do
process.on('uncaughtException', function(err) {
    console.log('### BIG ONE (%s)', err);
});
If you are using the request npm module:
request
    .get('http://example.com/doodle.png')
    .on('response', function(response) {
        console.log(response.statusCode);                // 200
        console.log(response.headers['content-type']);   // 'image/png'
    })
    .on('error', function(err) {                         // <------- add this
        console.log(err);
    });

Getting ETIMEDOUT error when I try to do a simple GET request?

Hi, I am trying to call a simple web API which returns a string as the response, and I want to use node for this. Since I am new to node, I tried referring to many blog posts and got a code snippet which I used, but I am getting the same error for all URLs, whether it's google.com or anything else.
My node code is as follows:
var http = require('http');

// The url we want is: 'www.random.org/integers/?num=1&min=1&max=10&col=1&base=10&format=plain&rnd=new'
var options = {
    host: 'www.random.org',
    path: '/integers/?num=1&min=1&max=10&col=1&base=10&format=plain&rnd=new'
};

callback = function(response) {
    var str = '';

    // another chunk of data has been received, so append it to `str`
    response.on('data', function (chunk) {
        str += chunk;
    });

    // the whole response has been received, so we just print it out here
    response.on('end', function () {
        console.log(str);
    });
};

http.request(options, callback).end();
Error:
F:\nodejs>node ..\NodeLearning\TestServer1\test.js
events.js:72
throw er; // Unhandled 'error' event
^
Error: connect ETIMEDOUT
at errnoException (net.js:901:11)
at Object.afterConnect [as oncomplete] (net.js:892:19)
Can anyone tell me what has gone wrong here?
Can you try one more time by setting a proxy, as mentioned below?
var options = {
    host: 'www.random.org',
    path: '/integers/?num=1&min=1&max=10&col=1&base=10&format=plain&rnd=new',
    proxy: 'add your proxy setting'
};
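Note that core http.request() does not read a proxy field itself; that option is understood by wrappers such as the request module. If you are behind a plain HTTP proxy and want to stay with the core http module, a common sketch (the proxy host and port below are placeholders, not real settings) is to connect to the proxy and put the full target URL in path:

var http = require('http');

// Hypothetical proxy address - replace with your network's actual proxy settings.
var options = {
    host: 'proxy.example.com',      // the proxy, not the target site
    port: 8080,
    path: 'http://www.random.org/integers/?num=1&min=1&max=10&col=1&base=10&format=plain&rnd=new',
    headers: {
        Host: 'www.random.org'      // tell the proxy which site we actually want
    }
};

http.get(options, function (response) {
    var str = '';
    response.on('data', function (chunk) { str += chunk; });
    response.on('end', function () { console.log(str); });
}).on('error', function (err) {
    console.log(err);
});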
