I have been trying to figure the following for the last couple of days and just can't seem to figure out the answer. I am new to node and JS (only experience is online tutorials).
I am trying to create a class (function) to scrape the source code from websites. I want to read in a url from the command line and return the html content. However, I seem to be getting different results when running the code different ways (which I think I should be getting the same results).
I have been reading about events in node and so I have used them a little in the code. One listener event prompts the me for the url and then after setting the url it (the listener function) emits a message, which is picked up by another listener which goes out and fetches the html content.
The problem I am having is that when I create an instance of the object, it seems like the request portion of the code does not execute. However, if I call the method from the instance I get the print out of the html content of the page.
Any help is appreciated. Thanks.
function test() {
var events = require('events').EventEmitter;
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
console.log("Input the URL: ");
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', function (text) {
that.url = util.inspect(text);
that.url = that.url.substr(1, that.url.length - 4);
that.eventEmitter.emit('Get url html');
process.exit();
});
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
scrapper.httpGet(); // This gives the desired results from that.httpGet
I solved using the Prompt library https://www.npmjs.com/package/prompt
function test() {
var events = require('events').EventEmitter;
var prompt = require('prompt');
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
prompt.start();
process.stdin.setEncoding('utf8');
prompt.get(['url'], function( err, result ) {
that.url = result.url;
that.eventEmitter.emit('Get url html');
} );
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body); // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
// scrapper.httpGet(); // This gives the desired results from that.httpGet
I ran the script from the commandline, input http://www.google.com and it retrieved the results without the additional call to scrapper.httpGet();
Related
I've written a script in node using two different functions getPosts() and getContent() supplying callback within them in order to print the result calling a standalone function getResult(). The selectors defined within my script is flawless.
However, when I execute my script, It prints nothing. It doesn't throw any error either. I tried to mimic the logic provied by Neil in this post.
How can I make it a go?
I've written so far:
var request = require('request');
var cheerio = require('cheerio');
const url = 'https://stackoverflow.com/questions/tagged/web-scraping';
function getPosts(callback){
request(url, function (error,response, html) {
if (!error && response.statusCode == 200){
var $ = cheerio.load(html);
$('.summary .question-hyperlink').each(function(){
var items = $(this).text();
var links = $(this).attr("href");
callback(items,links);
});
}
});
}
function getContent(item,link,callback){
request(link, function (error,response, html) {
if (!error && response.statusCode == 200){
var $ = cheerio.load(html);
var proLink = $('.user-details > a').eq(0).attr("href");
callback({item,link,proLink});
}
});
}
function getResult() {
getPosts(function(item,link) {
getContent(item,link,function(output){
console.log(output);
});
});
}
getResult();
The link value that you receive from getPosts is a relative link which means that the request fails. You can extract the hostname inside its own variable and create the full URL from the hostname + the relative link.
const host = 'https://stackoverflow.com';
const url = '/questions/tagged/web-scraping';
// ...
function getContent(item,link,callback){
// Here we use the absolute URL
request(host + link, function (error,response, html) {
if (!error && response.statusCode == 200){
var $ = cheerio.load(html);
var proLink = $('.user-details > a').eq(0).attr("href");
callback({item,link,proLink});
}
});
}
I'm trying to write some parser on node. I can't understand something here.
I need to stop requesting next pages if previous result was wrong.
My current code is something like this:
var request = require('request');
var cheerio = require('cheerio');
var links = ['http://link1','http://link2','http://link3'];
for(l in links) {
var link = links[l];
request(link, function(err, page) {
if(err) throw err;
$ = cheerio.load(page.body);
if($('a').length < 2) {
// here i need to stop requesting next url(s) from links array somehow!
// (if this is the case of link1 then link2 and link3 will not request)
} else {
// do something...
}
$.html();
});
}
Node.js is an asynchronous language, so you can't stop it in native javascript like break;, so you probably try using the async module like this:
var request = require('request');
var cheerio = require('cheerio');
var links = ['http://link1', 'http://link2', 'http://link3'];
var async = require('async');
async.eachSeries(links, function(link, callback) {
request(link, function(err, page) {
if (err) {
callback(err);
return;
}
$ = cheerio.load(page.body);
if ($('a').length < 2) {
// here i need to stop requesting next url(s) from links array somehow!
// (if this is the case of link1 then link2 and link3 will not request)
callback(err);
return;
} else {
// do something...
}
$.html();
callback();
});
}, function(err) {
if (err) {
//do something id some of the links throws err
return;
}
//do womething if every request was success
});
The above code will run a function for each element in the array(links), if one of the links will call callback with error, it will immidiately stop the iteration and will call the callback function with the error. Otherwise if all of the elements will call callback without any params, the callback function will be called without any params.
Please look at async docs to see other options that might help you in other ways.
I have var movieRecommendation which is being populated from data coming from Mongo DB. Issue is Mongoose Movie.findOne() call is asycn call which is not allowing me to get my final populated movieRecommendation which I need to send back as response.
exports.getRecommendation=function(req,res){
var movieRecommendation = [];
var id=req.params.id;
console.log('----- Get User Recommendation - ' + id);
var url = 'http://52.8.48.113:8080/recommender-server/recommender/v1/recommendations/'+id+'.do';
//make http get request
request({
url: url,
json: true
}, function (error, response, recommendations) {
// res.json(recommendations);
if (!error && response.statusCode === 200) {
recommendations.forEach(function(entry) {
**Movie.findOne({'id':parseInt(entry.itemId)},function(err, movieData){**
entry.movie = movieData;
movieRecommendation.push(entry);
//console.log('rec', movieRecommendation);
console.log(movieRecommendation.length);
});
});
}
console.log("====Final========"+movieRecommendation.length);
//Output = 0
});
res.json(movieRecommendation); // Here movieRecommendation is coming as black Array
};
Please let me know how I can get finally populated movieRecommendation var at end to make it available for response.
For this type of issues we can use Async library. To populate the data finally once all the operations done, we can use async.each collection from Async Library.
For example:
NOTE:Install Async by this command
npm install async to use async library
var async = require("async");
var recomenmendations = [{"data2" : "value2"} , {"data1" : "value2"}, {"data3" : "value3"}, {"data4" : "value4"} ]
var movieRecommendation = [];
async.each(recomenmendations,
function(recomenmendationItem, callback){
console.log("Here you can query the required data using current recomenmendations ITEM");
console.log(recomenmendationItem);
callback();
// Movie.find({'id':parseInt(recomenmendationItem.itemId)},function(err, movieData){
// recomenmendationItem.movie = movieData;
// movieRecommendation.push(entry);
// callback();
// });
},
function(err){
console.log("here you can send your resopnse");
console.log("This section will be executed once all the recomenmendations are processed");
//res.json(movieRecommendation)
}
);
You can query the mongoDB as shown with comment section. You should use callback() once all the operations performed for an iteration.
As I mentioned in one of my comments, use the callback passed to the iterator function and call it inside the Movie.findOne() callback. That way, async.each will know when each step has finished:
async.each(recomendations, function (recomendationItem, callback) {
Movie.findOne({'id':parseInt(entry.itemId)},function(err, movieData){
if (err) return callback(err); // if you have an error on you search, just pass it to the iterator callback
recommendationItem.movie = movieData;
movieRecommendation.push(recommendationItem);
callback();
});
}, function (error) {
if (error) return res.json ({ error: error }); // you should also check if an error ocurred
res.json(movieRecomendation);
});
Just to point out: you can also use async.eachSeries, that will just call the next step of your iteration when the previous one has returned (if that matters to you, but I think it's not your case though) and it has the same signature.
#Vivek Panday replace the following code inside your exports.getRecommendation function to get your expected output. We don't need to use the count variable if we use the callback function. And an important thing is we have to use callback(); once all the process done. I think you have not used callback function properly in The example you have worked out. Use the following code If there is any issue please let me know.
var async = require('async');
var request = require("request");
var movieRecommendation = [];
var id=req.params.id;
console.log('----- Get User Recommendation - ' + id);
var url = 'http://52.8.48.113:8080/recommender-server/recommender/v1/recommendations/'+id+'.do';
//make http get request
request({
url: url,
json: true
}, function (error, response, recommendations) {
if (!error && response.statusCode === 200) {
console.log('recommendation lenght '+ recommendations.length);
async.each(recommendations,
function(recommendationItem, callback){
Movie.findOne({'id':parseInt(recommendationItem.itemId)},function(err, movieData){
recommendationItem.movie = movieData;
movieRecommendation.push(recommendationItem);
//you have to use callback(); once all your process is done
callback();
});
},
function(err){
//you should use this function, this will be execute once all the process done
console.log(movieRecommendation);
console.log("finally callback");
res.json(movieRecommendation);
}
);
}
});
I have tried as per given suggestion above ..
var async = require("async");
var recomenmendations = [{"data2" : "value2"} , {"data1" : "value2"}, {"data3" : "value3"}, {"data4" : "value4"} ]
var movieRecommendation = [];
async.each(recomenmendations,
function(recomenmendationItem, callback){
console.log("Here you can query the required data using current recomenmendations ITEM");
console.log(recomenmendationItem);
// Movie.find({'id':parseInt(recomenmendationItem.itemId)},function(err, movieData){
recomenmendationItem.movie = movieData;
movieRecommendation.push(entry);
console.log("any data"); // line y
});
callback();
},
function(err){
console.log("here you can send your resopnse"); // line x
console.log("This section will be executed once all the
recomenmendations are processed");
//res.json(movieRecommendation)
}
);
But still face same issue line x is printing before line y ,which is making again same issue.
However I have tried something given below and achieved expected result .
exports.getRecommendation=function(req,res){
var movieRecommendation = [];
var id=req.params.id;
console.log('----- Get User Recommendation - ' + id);
var url = 'http://52.8.48.113:8080/recommender-server/recommender/v1/recommendations/'+id+'.do';
//make http get request
request({
url: url,
json: true
}, function (error, response, recommendations) {
// res.json(recommendations);
if (!error && response.statusCode === 200) {
console.log('recommendation lenght '+ recommendations.length);
// recommendations.forEach(function(entry) {
var count=0;
async.each(recommendations,function(recommendationItem){
// console.log(recommendationItem);
Movie.findOne({'id':parseInt(recommendationItem.itemId)},function(err, movieData){
recommendationItem.movie = movieData;
movieRecommendation.push(recommendationItem);
count ++;
console.log('final res length : ' + movieRecommendation.length);
console.log('final res length count : ' + count +' and item recomm lenght ' + recommendations.length );
if(count === recommendations.length){
console.log(' =====Final=====> here you can send your response =========' + movieRecommendation.length);
res.json(movieRecommendation);
}
});
// callback();
});
}
});
};
Still I am open for any feedback and suggestions.
I have been playing with nodejs and zombiejs to fetch some personal data from a site. Unfortunately I am stuck at a point where zombiejs only gets me the data from first link and then hangsup.
The steps I follow are-
Go to to the base url
Get the number of pages
Use async library to fetch them in series by opening a new browser window everytime. Note I only create a browser window instead of a totally new browser instance as it expensive to create one.
This is my code
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
});
},
function(e) {
console.log(e);
});
});
Results
>node main_zombie.js
..... HTML DUMP
http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=1
>
Any suggestions would be appreciated
Found the mistake
As per https://github.com/caolan/async#each
One needs to call the callback function with empty arguments or null if there is no error.
So the correct code would be
var Browser = require("zombie");
var async = require('async');
var so_base="http://stackoverflow.com";
var so_url="http://stackoverflow.com/questions/tagged/java?sort=newest&pagesize=15&page=";
var browser = new Browser();
browser.visit(so_base, function () {
var arr=[];
for(var i=1;i<=10;i++) {
arr.push(i);
}
async.eachSeries(
arr,
function(k, callback) {
browser.open();
browser.visit(so_url+k,function() {
console.log(browser.location.href);
console.log(browser.html());
// Add callback and check if we reached the last page
if (k == 10) {
browser.close();
}
callback();
});
},
function(e) {
console.log(e);
});
});
I have an http server with a handleRequest callback that runs another script in vm.runInNewContext for each request. The script that runs inside vm.runInNewContext makes some asynchronous http post requests and writes the server response only after getting the responses from the posts.
As a result, the code of handleRequest callback ends before the server response is written.
Is it safe? or is there a way to avoid this situation?
Here is some code:
var server = http.createServer(handleRequest);
server.listen(8080);
var handleRequest = function (request, response) {
// get request data...
var context = {
ServerRequest : request,
ServerResponse : response
};
var stringScript = // a string with the script that posts data
var script = vm.createScript(stringScript);
script.runInNewContext({ context: context });
}
the script string does this:
var request = require('request');
var options = {....}
var req = request.get(options);
req.on('response', function (res) {
var chunks = [];
res.on('data', function(chunk) {
chunks.push(chunk);
});
res.on('end', function() {
var buffer = Buffer.concat(chunks);
var encoding = res.headers['content-encoding'];
if (encoding == 'gzip') {
zlib.gunzip(buffer, function(err, decoded) {
// set response headers and write the response
context.ServerResponse.end(decoded.toString());
});
} else if (encoding == 'deflate') {
zlib.inflate(buffer, function(err, decoded) {
// set response headers and write the response
context.ServerResponse.end(decoded.toString());
})
} else {
// set response headers and write the response
context.ServerResponse.end(buffer.toString());
}
});
});
Simple solution: Return a promise (e.g. use the Q-library) from the VM-script.
script.runInNewContext will return whatever you return from the VM-script. That way you have a "callback" for when the VM code finishes.
// Script for VM
// I simplified it. Just resolve or reject the promise whenever you are done with your work
'use strict';
var defer = q.defer();
doABarrelRoll(function() {
defer.resolve('RESULT');
});
defer.promise; // This line will return the promise.
When returning a value from a VM-script, you do not need any return construction. Just write the thing you want and let the magic happen.
// Script for current context
'use strict';
var server = http.createServer(handleRequest);
server.listen(8080);
var handleRequest = function (request, response) {
// get request data...
var context = {
ServerRequest : request,
ServerResponse : response
};
var stringScript = // a string with the script that posts data
var script = vm.createScript(stringScript);
var prom = script.runInNewContext({
context: context,
q: require('q'),
});
prom.done(function ($result) {
console.log('VM finished with result: ' + $result);
});
}