How to crawl data from yelp.com without block our IP

How to crawl data from yelp.com without block our IP - node.js

I am using node packages of cheerio and nightmare for crawling from " Yelp.com ". I am retrieved data from Yelp.com.
But yelp has blocked my IP.
Please any one can provide solution or suggestions.Thanks in advance
Here is my code
var Nightmare = require('nightmare');
var fs = require('fs');
var http = require('http');
var cheerio = require('cheerio');
var request = require('request');
function yelpmenuitemsscrap(url)
{
// console.log(url);
var menuitems = new Nightmare();
menuitems.goto(url);
menuitems.wait();
menuitems.evaluate(function () {
var objs = [];
$('div.menu-sections div.media-block.menu-
item').each(function(index){
objs.push( $(this).find('div.media-story h4').text().trim());
});
return objs;
},function (html) {});
menuitems.run(function(err, nightmare) {
if (err)
{
return console.log(err);
}
else
{
console.log(nightmare);
};
});
}

IP block is done because all the requests are being generated from the same server IP address which appears to be a hack attack on the server. In such requirements the data crawl should be done from the clients who are accessing the applications from different IP addresses. This will generate the traffic from different IP addresses and Yelp will not block it.
The other option is to use multiple HTTP Proxy servers which generate requests from all the different servers randomly so as not to block any specific IP address.
Another option is to use something like http://www.screen-scraper.com

If you don't mind using and api, you can try https://gimmeproxy.com which has nice wrapper gimmeproxy-request.
It automatically gets proxies from GimmeProxy and re-routes requests through another proxy when one fails.
Example how to make request with this wrapper:
const setup = require('gimmeproxy-request').setup;
const request = require('gimmeproxy-request').request;
setup({
api_key: 'your api key',
query: 'get=true&cookies=true&country=US&supportsHttps=true&maxCheckPeriod=1800&minSpeed=10', // additional gimmeproxy query parameters
retries: 5, // max retries before fail
test: (body, response) => body.indexOf('captcha') === -1 && response.statusCode === 200 // test function
});
request('https://example.com', {
timeout: 10000 // additional request parameters, see https://github.com/request/request
},
function(err, res, body) {
console.log('err', err)
console.log('res', res)
console.log('body', body)
process.exit()
});

Related

Http request from Node (Express) to Yii2 api endpoint

I have to do request from node to Yii2 api. It doesn't throw any errors, but doesn't return anything either. When I do request to Yii2 api method directly in browser, value is returned. Here is my request in route in node:
router.get('', function (req, res) {
var parameter = 20;
request({
url: 'http://**.**.**.***:8000/web/index.php?r=api/get-value',
parameter: parameter,
method: 'GET'
}, function(error, response, body) {
if(error || response.statusCode != 200)
throw error;
res.send(body);
});
});
module.exports = router;
And here is method/endpoint in Yii2 controllers/apiController.php:
public function actionGetValue($inverterId) {
return $inverterId * 2;
}
Any suggestions what could be wrong/missing?

You can use the following
var http = require('http');
var client = http.createClient(8000, 'localhost');
var request = client.request('GET', '/web/index.php?r=api/get-value');
request.write("stuff");
request.end();
request.on("response", function (response) {
// handle the response
});
Resource Link:
Http request with node?
Sending http request in node.js
or Another full example:
Get requests
Now we’ll set up a super simple test to make sure it’s working. If it’s not still running, run your simple Node server so that it’s listening on http://localhost:8000. In a separate file in the same directory as your http-request.js where your new module lives, add a file called test-http.js with the following contents:
// test-http.js
'use strict';
const
request = require('./http-request'),
config = {
method: 'GET',
hostname: 'localhost',
path: '/',
port: 8000
};
request(config).then(res => {
console.log('success');
console.log(res);
}, err => {
console.log('error');
console.log(err);
});
This will import our module, run a request according to the configured options, and console log either the response, or an error if one is thrown. You can run that file by navigating to its directory in the command line, and typing the following:
$ node test-http.js
You should see the following response:
success
{ data: 'Cake, and grief counseling, will be available at the conclusion of the test.' }
Resource Link:
https://webcake.co/sending-http-requests-from-a-node-application/

Okay, shame on me, I did not check, what's going on in public function beforeAction($action) in apiController.php - since request to endpoint getValue() is done from the "outside", it falls under a condition, that does not allow further actions and returns false - that's why response wasn't changing no matter what was done/set in getValue().

what should I use instead of readableStream.push('')

I am trying to implement the ._read function of a readable stream, a problem happens when ._read is called and there isn't data, the documentation says that I can push('') until more data comes, and I should only return false when the stream will never have more data.
https://nodejs.org/api/stream.html#stream_readable_read_size_1
But it also says that if I need to do that then something is terribly wrong with my design.
https://nodejs.org/api/stream.html#stream_stream_push
But I can't find an alternative to that.
code:
var http = require('http');
var https = require('https');
var Readable = require('stream').Readable;
var router = require('express').Router();
var buffer = [];
router.post('/', function(clientRequest, clientResponse) {
var delayedMSStream = new Readable;
delayedMSStream._read = function() {
var a=buffer.shift();
if(typeof a === 'undefined'){
this.push('');
return true;
}
else {
this.push(a);
if(a===null) {
return false;
}
return true;
}
};
//I need to get a url from example.com
https.request({hostname:'example.com'}, function(exampleResponse){
data='';
exampleResponse.on('data',function(chunk){data+=chunk});
exampleResponse.on('end',function(){
var MSRequestOptions = {hostname: data, method: 'POST'};
var MSRequest = https.request(MSRequestOptions, function(MSResponse){
MSResponse.on('end', function () {
console.log("MSResponse.on(end)");//>>>
});//end MSResponse.on(end)
}); //end MSRequest
delayedMSStream.pipe(MSRequest);
});
});
clientRequest.on('data', function (chunk) {
buffer.push(chunk);
});
clientRequest.on('end', function () {//when done streaming audio
buffer.push(null);
});
});//end router.post('/')
explanation:
client sends a POST request streaming audio to my server, my server requests a url from example.com, when example.com responds with the url, my server streams the audio to it.
What's a smarter way to do it?

So if I undertstand the code correctly, you:
receive a request,
make your own request to a remote endpoint and fetch a URL
make a new request to that URL and pipe that to original response.
There are ways to do this other then yours, and even your way would look cleaner to me if you just improve the naming a bit. Also, splitting the huge request into a few functions with smaller responsibility scopes might help.
I would make the endpoint this way:
let http = require('http');
let https = require('https');
let Readable = require('stream').Readable;
let router = require('express').Router();
let buffer = [];
/**
* Gets some data from a remote host. Calls back when done.
* We cannot pipe this directly into your stream chain as we need the complete data to get the end result.
*/
function getHostname(cb) {
https.request({
hostname: 'example.com'
}, function(response) {
let data = '';
response.on('error', err => cb(err)); // shortened for brewity
response.on('data', function(chunk) {
data = data + chunk;
});
response.on('end', function() {
// we're done here.
cb(null, data.toString());
});
});
}
router.post('/', function(request, response) {
// first let's get that url.
getHostname(function(err, hostname) {
if (err) { return response.status(500).end(); }
// now make that other request which we can stream.
https.request({
hostname: hostname,
method: 'POST'
}, function(dataStream) {
dataStream.pipe(response);
});
});
});
Now, as said in the comments, with streams2, you don't have to manage your streams. With node versions pre 0.10 you have had to listen to 'read', 'data' etc events, with newer node versions, it's handled. Furthermore, you don't even need it here, streams are smart enough to handle backpressure on their own.

Node.js request web page

I need to connect to a web page and return the status code of the page, which I've been able to achieve using http.request however the pages I need to request can take a long time, sometimes several minutes, so I'm always getting a socket hang up error.
I'm using the following code so far:
var reqPage = function(urlString, cb) {
// Resolve the URL
var path = url.parse(urlString);
var req = http.request({
host: path.hostname,
path: path.pathname,
port: 80,
method: 'GET'
});
req.on('end', function() {
cb.call(this, res);
});
req.on('error', function(e) {
winston.error(e.message);
});
};
What do I need to do to ensure that my application still attempts to connect to the page even if it's going to take a few minutes?

Use the request module and set the timeout option to an appropriate value (in milliseconds)
var request = require('request')
var url = 'http://www.google.com' // input your url here
// use a timeout value of 10 seconds
var timeoutInMilliseconds = 10*1000
var opts = {
url: url,
timeout: timeoutInMilliseconds
}
request(opts, function (err, res, body) {
if (err) {
console.dir(err)
return
}
var statusCode = res.statusCode
console.log('status code: ' + statusCode)
})

Add this if you don't want to use a higher level http client like request or superagent , then add this...
req.on("connection", function(socket){
socket.setTimeout((1000*60*5)); //5 mins
});

ExpressJS - contact external API

Here is the thing :
I have a client which sends data to a server. This server has to contact an external A.P.I. and send back its response to the client. I just can't figure out how and where I can contact the external A.P.I once the server has got the client data.
I route client data like this :
app.post('/getAutoComplete', routes.read);
routes.read retrieves the data within req.body. With my nodejs version (without express framework), I then request the api this way :
var http = require('http'), options = {
host : "192.168.1.38",
port : 8080,
path : "/myURL",
method : 'POST'
};
var webservice_data = "";
var webservice_request = http.request(options, function(webservice_response)
{
webservice_response.on('error', function(e){ console.log(e.message); });
webservice_response.on('data', function(chunk){ webservice_data += chunk;});
webservice_response.on('end', function(){res.send(webservice_data);});
});
webservice_request.write(req.body);
webservice_request.end();
The problem is that i'd like to use native expressJS method like app.post but I don't know how because :
Express (app) object is not available here (declared in app.js but not in the route file)
I don't know how to send POST data with app.post
Any suggestion ?

app.post('/getAutoComplete', routes.read);
// assuming routes.read lookes something like this
routes.read = function read(req, res) {
var http = require('http'), options = {
host : "192.168.1.38",
port : 8080,
path : "/myURL",
method : 'POST'
};
var webservice_data = "";
var webservice_request = http.request(options, function(webservice_response)
{
webservice_response.on('error', function(e){ console.log(e.message); });
webservice_response.on('data', function(chunk){ webservice_data += chunk;});
webservice_response.on('end', function(){res.send(webservice_data);});
});
webservice_request.write(req.body);
webservice_request.end();
};
Also check out https://github.com/mikeal/request It's the de-facto module for doing web requests in node.

routes.read is a function. You can call it with extra parameters, so for example
app.post('/getAutoComplete', function(req,res) {
var q = req.query.q; // or whatever data you need
routes.read(q, function(err, response) {
if (err) throw err;
return res.json(response);
});
});
Now make the routes.read function use the first parameter as the query and when it's gathered the response from the remote API, call the second parameter with any error as the first parameter and the response as the second one.
Update This answer has already been picked as an answer, but it'd be more helpful if I showed an example of routes.read, too:
routes.read = function(q, cb) {
// pretend we calculate the result
var result = q * 10;
if (result > 100) {
// call the callback with error set
return cb("q value too high");
}
// all is well, use setTimeout to demonstrate
// an asynchronous return
setTimeout(function() { cb(null, result) }, 2000);
};

Node.js http request memory and cpu hog

I have a node.js script that continuously requests a page, sort of like a cron job.
However, after a few minutes Node starts to use a lot of CPU (up to 70%) and memory (up to 200mb).
What is wrong with my script?
function cron(path)
{
var http = require('http');
var site = http.createClient(443, 'www.website.com', true);
var request = site.request('GET', path, {'host': 'www.website.com'});
request.end();
request.on('response', function (response) {
setTimeout(function(){cron(path)},15000);
});
}
cron('/path/to/page');

request.on('response', function (response) {
setTimeout(function(){cron(path)},15000);
});
For every response you create a new cron job. Log your responses. If your getting more then 1 from your request then your exponantially creating more cron jobs.
Your creating a function() {} with a reference to path. So the entire scope state is kept. you want to free memory by adding this:
var site = null;
var request = null;
Your calling require("http") inside a function rather then outside in module scope. You only need to get http once so place at the top of your file in module scope.
var http = require('http');
var site = http.createClient(443, 'www.website.com', true);
function cron(path)
{
var request = site.request('GET', path, {'host': 'www.website.com'});
request.end();
var once = true;
request.on('response', doIt);
function doIt(response) {
if (!once) {
once = null;
doIt = function() {};
setTimeout(function(){cron(path)},15000);
}
});
site = null;
request = null;
}
cron('/path/to/page');

In addition to the tips from #Raynos, here's another. I find that recursive calls like this in long running processes make me a bit nervous so I'd err on the side of using setInterval instead. I'd maybe split the cron and the http behaviour apart in case you want to try and re-use that logic, although that'll depend on your context:
e.g. in node 0.4.7:
var https = require('https');
function poll(path)
{
https.get({
host: 'www.website.com',
port: 443,
path: path
}, function(res) {
console.log("Got response: " + res.statusCode);
}).on('error', function(e) {
console.log("Got error: " + e.message);
});
}
function cron(path)
{
return setInterval(function(){
poll(path);
},15000);
}
var intervalId = cron('/path/to/page'); // keep in case you need to use clearInterval

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

How to crawl data from yelp.com without block our IP - node.js

Related

Http request from Node (Express) to Yii2 api endpoint

what should I use instead of readableStream.push('')

Node.js request web page

ExpressJS - contact external API

Node.js http request memory and cpu hog

Categories

Resources