I've written a script in Node using two functions, getPosts() and getContent(), each taking a callback, so that a standalone function getResult() can print the result. The selectors defined within my script are flawless.
However, when I execute my script, it prints nothing. It doesn't throw any error either. I tried to mimic the logic provided by Neil in this post.
How can I make it work?
This is what I've written so far:
var request = require('request');
var cheerio = require('cheerio');

const url = 'https://stackoverflow.com/questions/tagged/web-scraping';

function getPosts(callback){
    request(url, function (error, response, html) {
        if (!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            $('.summary .question-hyperlink').each(function(){
                var items = $(this).text();
                var links = $(this).attr("href");
                callback(items, links);
            });
        }
    });
}

function getContent(item, link, callback){
    request(link, function (error, response, html) {
        if (!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            var proLink = $('.user-details > a').eq(0).attr("href");
            callback({item, link, proLink});
        }
    });
}

function getResult() {
    getPosts(function(item, link) {
        getContent(item, link, function(output){
            console.log(output);
        });
    });
}

getResult();
The link value that you receive from getPosts is a relative link, which means the request fails. You can extract the hostname into its own variable and build the full URL from the hostname plus the relative link.
const host = 'https://stackoverflow.com';
const url = '/questions/tagged/web-scraping';

// ...

function getContent(item, link, callback){
    // Here we use the absolute URL
    request(host + link, function (error, response, html) {
        if (!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            var proLink = $('.user-details > a').eq(0).attr("href");
            callback({item, link, proLink});
        }
    });
}
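If you would rather not concatenate strings yourself, an alternative sketch is to resolve the relative href with Node's built-in url module (same request/cheerio setup as above):

var urlLib = require('url');

function getContent(item, link, callback){
    // Resolve the relative href against the site root, e.g.
    // urlLib.resolve('https://stackoverflow.com', '/questions/...') -> 'https://stackoverflow.com/questions/...'
    var absolute = urlLib.resolve('https://stackoverflow.com', link);
    request(absolute, function (error, response, html) {
        if (!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            var proLink = $('.user-details > a').eq(0).attr("href");
            callback({item, link, proLink});
        }
    });
}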
How do I call a callback function after all the functions have finished?
All the functions must start at the same time, and the callback should run only once every function has finished running.
function step_one(callback){
    parse1site();
    parse2site();
    parse3site();
    parse4site();
    parse5site();
    parse6site();
    parse7site();
    parse8site();
    parse9site();
    parse10site();
    parse11site();
    parse12site();
    parse13site();
    parse14site();
    parse15site();
    parse16site();
    parse17site();
    parse18site();
    parse19site();
    parse20site();
}
Example function
function parse1site(){
    var URL = "https://site1.com";
    needle.get(URL, function(error, response){
        if (!error && response.statusCode == 200){
            data["site1"] = response.body;
            console.log("OK");
        } else {
            console.log("error");
        }
    });
}
I would change your usage of needle to the promise API and then use Promise.all
var p1 = needle('get', 'https://server.com/posts/12');
var p2 = needle('get', 'https://server.com/posts/13');
// ...

Promise.all([p1, p2]).then((data) => {
    // here you will get the responses of all of your requests in the array 'data'
});
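Applied to the code in the question, that could look roughly like the sketch below. It assumes needle returns a promise when called without a callback, and that the site URLs really do follow the site1.com … site20.com pattern from the example:

var needle = require('needle');

var data = {};

function step_one(callback) {
    // Build the twenty URLs (assumed pattern from the example: https://site1.com ... https://site20.com)
    var urls = [];
    for (var i = 1; i <= 20; i++) {
        urls.push('https://site' + i + '.com');
    }

    // Start all requests at once; each needle() call returns a promise
    var requests = urls.map(function (url, index) {
        return needle('get', url).then(function (response) {
            data['site' + (index + 1)] = response.body;
        });
    });

    // Run the callback only after every request has finished (or fail on the first error)
    Promise.all(requests)
        .then(function () { callback(null, data); })
        .catch(function (err) { callback(err); });
}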
In my main code, I do the following:
var module = require('./module')
module.FooA(module.FooB);
module.js contains the following code:
var request = require('request'); // using the npm "request" package

exports.FooB = function(data){ /* operations on data here */ };

exports.FooA = function(callback){
    var url = some_link;
    request(url, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            callback(body);
        }
    });
};
The issue is that, apparently, callback(body) doesn't run even when the conditions are met. var result = request(url) followed by exports.FooB(result) does the job, but as far as I can see it obviously doesn't act like a callback and would cause trouble.
What is the proper way of defining a callback function in such a case? Do I need one at all, or is it actually synchronous and I just failed to notice?
Use error as the first parameter of your callback functions; this is the default convention in Node.js core and is good practice for your project's functions as well.
And, as #ShanSan commented, use console.log, console.error or console.trace for debugging.
Example:
var request = require('request'); // using the npm "request" package

exports.FooB = function(error, data){ /* operations on data here */ };

exports.FooA = function(callback){
    var url = some_link;
    request(url, function (error, response, body) {
        if (error || response.statusCode != 200) {
            // pass the error to the callback; because of the early return, no else block is needed below
            console.error('Error in request', error);
            return callback(error, null);
        }
        // this code only runs if there were no errors
        // if you need more info, use console.log(request); or console.log(body);
        // always use error as the first param in callback functions
        callback(null, body);
    });
};
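With that convention in place, the caller in the main file can either pass FooB straight through (it now accepts (error, data)) or wrap it to handle the error explicitly. A small usage sketch (the variable is named mod here only to avoid shadowing Node's own module object):

// main.js (sketch)
var mod = require('./module');

// Option 1: pass FooB directly; it receives (error, data)
mod.FooA(mod.FooB);

// Option 2: wrap it to handle the error explicitly
mod.FooA(function (error, data) {
    if (error) {
        console.error('Request failed:', error);
        return;
    }
    mod.FooB(null, data);
});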
How can we access the request parameters from the response callback when using the Request module?
For example, the "dog" value below (and others like it) could be passed in by looping over a list:
var request = require('request');
var u = require('util');

var url = "http://example.com/animals/%s";

request.get({uri: u.format(url, "dog")}, function (error, response, body) {
    if (!error && response.statusCode == 200) {
        // how could we access the value 'dog' here?
        // something like this: console.log(uri.params.animal);
    }
});
You can use JavaScript's closures for this.
var request = require('request');
var u = require('util');

var url = "http://example.com/animals/%s";
var param = "dog";

request.get({uri: u.format(url, param)}, function (error, response, body) {
    if (!error && response.statusCode == 200) {
        // 'param' is still accessible here through the closure
        console.log(param);
    }
});
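The same idea works when looping over a list: each iteration captures its own value, so the callback can still read it when the response arrives. A minimal sketch (the animal list is made up for illustration):

var request = require('request');
var u = require('util');

var url = "http://example.com/animals/%s";

["dog", "cat", "horse"].forEach(function (animal) {
    // 'animal' is captured by this callback's closure, one value per iteration
    request.get({uri: u.format(url, animal)}, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            console.log(animal, body.length);
        }
    });
});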
I have been trying to figure out the following for the last couple of days and just can't seem to find the answer. I am new to Node and JS (my only experience is online tutorials).
I am trying to create a class (function) to scrape the source code from websites. I want to read a URL from the command line and return the HTML content. However, I seem to be getting different results when running the code in different ways (which I think should give the same results).
I have been reading about events in Node, so I have used them a little in the code. One listener prompts me for the URL, and after setting the URL it (the listener function) emits a message, which is picked up by another listener that goes out and fetches the HTML content.
The problem I am having is that when I create an instance of the object, it seems like the request portion of the code does not execute. However, if I call the method on the instance, I get the printout of the HTML content of the page.
Any help is appreciated. Thanks.
function test() {
    var events = require('events').EventEmitter;
    var request = require('request');
    var util = require('util');

    var that = this;
    that.eventEmitter = new events();
    that.url = 'http://www.imdb.com/';

    that.eventEmitter.on('setURL', that.setUrl = function(){
        console.log("Input the URL: ");
        process.stdin.resume();
        process.stdin.setEncoding('utf8');
        process.stdin.on('data', function (text) {
            that.url = util.inspect(text);
            that.url = that.url.substr(1, that.url.length - 4);
            that.eventEmitter.emit('Get url html');
            process.exit();
        });
    });

    that.eventEmitter.on('Get url html', that.httpGet = function() {
        console.log("Fetching... " + that.url);
        request(that.url, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                console.log(body); // Show the HTML of the page.
            } else {
                console.log("Error Encountered");
            }
        });
    });

    that.eventEmitter.emit('setURL');
}

var scrapper = new test(); // This asks me for the url and then only executes the first line of that.httpGet.
scrapper.httpGet();        // This gives the desired results from that.httpGet
I solved it using the Prompt library: https://www.npmjs.com/package/prompt
function test() {
    var events = require('events').EventEmitter;
    var prompt = require('prompt');
    var request = require('request');
    var util = require('util');

    var that = this;
    that.eventEmitter = new events();
    that.url = 'http://www.imdb.com/';

    that.eventEmitter.on('setURL', that.setUrl = function(){
        prompt.start();
        process.stdin.setEncoding('utf8');
        prompt.get(['url'], function(err, result) {
            that.url = result.url;
            that.eventEmitter.emit('Get url html');
        });
    });

    that.eventEmitter.on('Get url html', that.httpGet = function() {
        console.log("Fetching... " + that.url);
        request(that.url, function (error, response, body) {
            if (!error && response.statusCode == 200) {
                console.log(body); // Show the HTML of the page.
            } else {
                console.log("Error Encountered");
            }
        });
    });

    that.eventEmitter.emit('setURL');
}

var scrapper = new test(); // This asks for the url and then fetches the html content.
// scrapper.httpGet();     // No longer needed.
I ran the script from the command line, entered http://www.google.com, and it retrieved the results without the additional call to scrapper.httpGet().
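For what it's worth, the original version most likely stops where it does because process.exit() sits right after the emit: emit() is synchronous, the 'Get url html' handler only starts the asynchronous request, and the process exits before the response ever comes back. A minimal sketch of a fix that keeps the original stdin approach would be to exit only after the request has completed:

// inside the 'setURL' handler: emit, but do not exit here
process.stdin.on('data', function (text) {
    that.url = util.inspect(text);
    that.url = that.url.substr(1, that.url.length - 4);
    that.eventEmitter.emit('Get url html');
    // no process.exit() here
});

// inside the 'Get url html' handler: exit once the response has been handled
request(that.url, function (error, response, body) {
    if (!error && response.statusCode == 200) {
        console.log(body);
    } else {
        console.log("Error Encountered");
    }
    process.exit();
});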
I have the code below and it works fine to get:
<troveUrl>http://trove.nla.gov.au/work/23043869</troveUrl>
But I would like to get the value of the 'id' attribute in the following element from the same page and cannot get it!
<work id="23043869" url="/work/23043869">
Here is the code that I currently have:
var request = require('request'),
    cheerio = require('cheerio');

request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1', function (error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);
        $('troveurl').each(function(i, element){
            var id = $(this);
            console.log(id.text());
        });
    }
});
Any assistance appreciated.
You should pass xmlMode: true in the options object, then you can parse it as XML.
You can then grab the tag and data with $('tag').attr('attribute') and $('tag').text() to get the data between the tags as you've done.
var request = require('request'),
    cheerio = require('cheerio');

request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1', function(error, response, html) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html, {
            xmlMode: true
        });
        console.log($('work').attr('id'));
    }
});
The real issue lies in the syntax you used to get the value of 'id'. The following code will not log the id:
var id = $(this);
console.log(id.text());
The correct syntax is $('your element').attr('id'), as mentioned in Ben Fortune's answer above. However, passing xmlMode: true in the options object is not a necessity.
Passing xmlMode: false in the options will also work if you are using the correct syntax.
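Putting both answers together, here is a short sketch that pulls out the troveUrl text and the work id from the same response, using the same endpoint as above with xmlMode enabled. Note that in xmlMode tag names are matched case-sensitively, so the selector is troveUrl rather than troveurl:

var request = require('request'),
    cheerio = require('cheerio');

var apiUrl = 'http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1';

request(apiUrl, function (error, response, xml) {
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(xml, { xmlMode: true });
        $('work').each(function () {
            console.log($(this).attr('id'));              // e.g. 23043869
            console.log($(this).find('troveUrl').text()); // e.g. http://trove.nla.gov.au/work/23043869
        });
    }
});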