Unable to fetch results using callback - node.js

I've written a script in node using two different functions getPosts() and getContent() supplying callback within them in order to print the result calling a standalone function getResult(). The selectors defined within my script is flawless.
However, when I execute my script, It prints nothing. It doesn't throw any error either. I tried to mimic the logic provied by Neil in this post.
How can I make it a go?
I've written so far:
var request = require('request');
var cheerio = require('cheerio');
const url = 'https://stackoverflow.com/questions/tagged/web-scraping';
function getPosts(callback){
request(url, function (error,response, html) {
if (!error && response.statusCode == 200){
var $ = cheerio.load(html);
$('.summary .question-hyperlink').each(function(){
var items = $(this).text();
var links = $(this).attr("href");
callback(items,links);
});
}
});
}
function getContent(item,link,callback){
request(link, function (error,response, html) {
if (!error && response.statusCode == 200){
var $ = cheerio.load(html);
var proLink = $('.user-details > a').eq(0).attr("href");
callback({item,link,proLink});
}
});
}
function getResult() {
getPosts(function(item,link) {
getContent(item,link,function(output){
console.log(output);
});
});
}
getResult();

The link value that you receive from getPosts is a relative link which means that the request fails. You can extract the hostname inside its own variable and create the full URL from the hostname + the relative link.
const host = 'https://stackoverflow.com';
const url = '/questions/tagged/web-scraping';
// ...
function getContent(item,link,callback){
// Here we use the absolute URL
request(host + link, function (error,response, html) {
if (!error && response.statusCode == 200){
var $ = cheerio.load(html);
var proLink = $('.user-details > a').eq(0).attr("href");
callback({item,link,proLink});
}
});
}

Related

node.js how to wait for 20 functions

how call a callback function, after all the functions are finished?
All functions must start at the same time, and when all functions finish running, run the callback
function step_one(callback){
parse1site();
parse2site();
parse3site();
parse4site();
parse5site();
parse6site();
parse7site();
parse8site();
parse9site();
parse10site();
parse11site();
parse12site();
parse13site();
parse14site();
parse15site();
parse16site();
parse17site();
parse18site();
parse19site();
parse20site();
}
Example function
function parse1site(){
var URL = "https://site1.com";
needle.get(URL, function(error, response){
if (!error && response.statusCode == 200){
data["site1"] = response.body;
console.log("OK");
} else{
console.log("error");
}
});
}
I would change your usage of needle to the promise API and then use Promise.all
var p1 = needle('get', 'https://server.com/posts/12');
var p2 = needle('get', 'https://server.com/posts/13');
//...
Promise.all([p1,p2]).then((data)=>{
// here you will get the response of all of your requests in array data
});

Calling a module's local function as callback in "request"

In my main code, I do the following:
var module = require('./module')
module.FooA(module.FooB);
module.js contains the next code:
var request = require('request'); //using of npm "request"
exports.FooB = function(data){ /*operations on data here*/ };
exports.FooA = function(callback){
var url = some_link;
request(url, function (error, response, body) {
if (!error && response.statusCode == 200) {
callback(body);
};
});
};
The issue is that apparently, callback(body) doesn't run even if the conditions meet. var result = request(url) followed by exports.FooB(result) does the job, but as far as I can see, obviously does not act like a callback, and would produce troubles.
What is the proper way of defining a callback function in such a case? Do I need at all, or it is actually synchronous and I missed to notice it?
Use first function callback params with error, this is an default in node.js core and is google for your project functions.
And like #ShanSan commend, use console.log, console.error or console.trace for debug.
Example:
var request = require('request'); //using of npm "request"
exports.FooB = function(error, data){ /*operations on data here*/ };
exports.FooA = function(callback){
var url = some_link;
request(url, function (error, response, body) {
if (error || response.statusCode != 200) {
// pass error to callback and if use return you dont need other code block bellow
console.error('Error in request', error);
return callback(error, null);
}
// here run if dont have errors
// if need more info use the console.log(request); or console.log(body);
// use error in first param in callback functions
callback(null, body);
});
};

Accessing request parameters from the response callback in Request module

How could we access the request parameters from the response callback when using Request module?
For example, the following "dog" (and so on) value could be passed via looping a list:
var u = require('util');
var url = "http://example.com/animals/%s";
request.get({uri: u.format(url, "dog")}, function (error, response, body) {
if (!error && response.statusCode == 200) {
//how could we access the value 'dog' here?
//something like this: console.log(uri.params.animal);
}
}
You can use closure property of js.
var u = require('util');
var url = "http://example.com/animals/%s";
var param = "dog";
request.get({uri: u.format(url, param )}, function (error, response, body) {
if (!error && response.statusCode == 200) {
//how could we access the value 'dog' here?
console.log(param);
}
}

Inconsistent method result in node js

I have been trying to figure the following for the last couple of days and just can't seem to figure out the answer. I am new to node and JS (only experience is online tutorials).
I am trying to create a class (function) to scrape the source code from websites. I want to read in a url from the command line and return the html content. However, I seem to be getting different results when running the code different ways (which I think I should be getting the same results).
I have been reading about events in node and so I have used them a little in the code. One listener event prompts the me for the url and then after setting the url it (the listener function) emits a message, which is picked up by another listener which goes out and fetches the html content.
The problem I am having is that when I create an instance of the object, it seems like the request portion of the code does not execute. However, if I call the method from the instance I get the print out of the html content of the page.
Any help is appreciated. Thanks.
function test() {
var events = require('events').EventEmitter;
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
console.log("Input the URL: ");
process.stdin.resume();
process.stdin.setEncoding('utf8');
process.stdin.on('data', function (text) {
that.url = util.inspect(text);
that.url = that.url.substr(1, that.url.length - 4);
that.eventEmitter.emit('Get url html');
process.exit();
});
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body) // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
scrapper.httpGet(); // This gives the desired results from that.httpGet
I solved using the Prompt library https://www.npmjs.com/package/prompt
function test() {
var events = require('events').EventEmitter;
var prompt = require('prompt');
var request = require('request');
var util = require('util');
var that = this;
that.eventEmitter = new events();
that.url = 'http://www.imdb.com/';
that.eventEmitter.on('setURL',that.setUrl = function(){
prompt.start();
process.stdin.setEncoding('utf8');
prompt.get(['url'], function( err, result ) {
that.url = result.url;
that.eventEmitter.emit('Get url html');
} );
});
that.eventEmitter.on('Get url html',that.httpGet = function() {
console.log("Fetching... " + that.url);
request(that.url, function (error, response, body) {
if (!error && response.statusCode == 200) {
console.log(body); // Show the HTML for the Google homepage.
} else {
console.log("Error Encountered");
}
});
});
that.eventEmitter.emit('setURL');
}
var scrapper = new test(); //This asks me for the url and then only executes to first line of that.httpGet.
// scrapper.httpGet(); // This gives the desired results from that.httpGet
I ran the script from the commandline, input http://www.google.com and it retrieved the results without the additional call to scrapper.httpGet();

node js cheerio xml

I have the below code and it is working fine to get:
<troveUrl>http://trove.nla.gov.au/work/23043869</troveUrl>
But I would like to get the value after 'id' in the following from the same page and cannot get it!
<work id="23043869" url="/work/23043869">
here is the code that i currently have
var request = require ('request'),
cheerio = require ('cheerio');
request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('troveurl').each(function(i, element){
var id = $(this);
console.log(id.text());
});
}
});
Any assistance appreciated.
You should pass xmlMode: true in the options object, then you can parse it as XML.
You can then grab the tag and data with $('tag').attr('attribute') and $('tag').text() to get the data between the tags as you've done.
var request = require('request'),
cheerio = require('cheerio');
request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1', function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html, {
xmlMode: true
});
console.log($('work').attr('id'))
}
});
The real issue lies in the syntax you used to get the value after 'id'. The following code will not console.log the id out.
var id = $(this);
console.log(id.text());
The correct syntax should be $('your element').attr('id') like is mentioned in Ben Fortune's answer above. However, passing xmlMode: true in the options object is not a necessity.
Passing xmlMode: false in the options will also work if you are using the correct syntax.

Resources