Can't get text from a div - node.js

I want to get the content of the div mw-content-text from some Wikipedia pages (these are just examples to learn node.js). I have made this:
var fetch = require('node-fetch');
var cheerio = require('cheerio');
var fs = require('fs');
// Pages to scrape.
var vv = [
'https://en.wikipedia.org/wiki/Ben_Silbermann',
'https://en.wikipedia.org/wiki/List_of_Internet_entrepreneurs'
];
var bo=[],
$;
vv.forEach((t)=>{
fetch(t)
.then(res => res.text())
.then((body) => {
$ = cheerio.load(body);
var finded = $('#mw-content-text').text();
bo.push(finded);
});
});
// BUG (the subject of this question): fetch() is asynchronous, so this line
// runs before any .then() callback has pushed into `bo` — it always logs [].
console.log(bo);
If I output body, it is filled with a string containing the whole html page (so, this step is ok),
If I output $ it contains a collection (but I'm not sure if it's populated, I use the node.js command prompt but it looks that it's not the right tool, any advice on that too?)
Anyway, variable bo returns me an empty array

The issue here is that we're logging bo before the fetch call is complete. I'd suggest using the async/await syntax to ensure we wait for all the gets to return, then we can log the result.
You could follow with some more processing like removing empty lines, whitespace etc, but that shouldn't be too hard.
const fetch = require('node-fetch');
const cheerio = require('cheerio');
const vv = [
'https://en.wikipedia.org/wiki/Ben_Silbermann',
'https://en.wikipedia.org/wiki/List_of_Internet_entrepreneurs'
];

// Fetch every page in parallel; each task resolves to the text content of
// that page's #mw-content-text div. Resolves to an array in input order.
async function getDivcontent() {
  const tasks = vv.map(async (pageUrl) => {
    const response = await fetch(pageUrl);
    const html = await response.text();
    const $ = cheerio.load(html);
    return $('#mw-content-text').text();
  });
  return Promise.all(tasks);
}

// Await the scraper so we only log after every fetch has completed.
async function test() {
  const result = await getDivcontent();
  console.log("Result:" + result);
}

test();

Related

Scrape single column from table on Wikipedia using NodeJS Cheerio

I am trying to scrape a column from the table located on Wikipedia. I am trying to get the first column, Symbol, and use those symbols in an array in NodeJS. I attempted to scrape only this column using Cheerio and Axios. For some reason, after running the function I do not get any syntax errors but I also do not get any result after execution. I'm not sure if the elements I have loaded are correct or not, but any advice on how I can scrape the Symbol column into an array would be helpful. Below is my code:
const express = require('express');
const app = express();
const http = require('http');
const server = http.createServer(app);
const cheerio = require('cheerio');
const axios = require("axios");
// Question code: tries to print the Symbol column of the S&P 500 table.
async function read_fortune_500() {
try {
const url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
const { data } = await axios({
method: "GET",
url: url,
})
const $ = cheerio.load(data)
// NOTE(review): this selector matches every <td> of the FIRST ROW only
// (tr:nth-child(1)), not the first column of every row.
const elemSelector = '#constituents > tbody > tr:nth-child(1) > td'
$(elemSelector).each((parentIndex, parentElem) => {
if (parentIndex <= 9){
$(parentElem).children().each((childIndex, childElem) => {
console.log($(childElem).text())
})
}
})
} catch (err) {
console.error(err)
}
}
// NOTE(review): an async function called without awaiting its returned
// promise — the script can exit before any output is produced.
read_fortune_500()
Result
[Finished in 1.238s]
To help with your original issue:
For some reason, after running the function I do not get any syntax
errors but I also do not get any result after execution.
The reason for this is that you are calling an async function in javascript. Because read_fortune_500 has the async keyword, you need to 'wait' for this function to complete. In javascript world, the read_fortune_500 is actually returning a promise and you need to wait until the promise resolves. You can do that in a couple of ways:
The easiest way to handle this is to wrap your function call inside an IIFE:
// Immediately-invoked async wrapper so the scraper's promise is awaited.
(async function () {
  await read_fortune_500();
})();
In future versions of node you can use await without the need for wrapping it but hopefully that helps.
For the second issue, getting a list of symbols. You need to update the query selector you are using:
const $ = cheerio.load(data)
const elements = $("#constituents > tbody > tr > td:nth-child(1)")
elements.each((parentIndex, parentElem) => {
The CSS selector is slightly different but the selector above tells cheerio to look inside each table row in the DOM and then select the first column in that row.
Full working code below:
const cheerio = require('cheerio');
const axios = require("axios");

// Fetch the S&P 500 constituents table from Wikipedia and log the text of
// the first column (the ticker symbol) for the first ten table rows.
async function read_fortune_500() {
  try {
    const url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    const response = await axios.get(url)
    const $ = cheerio.load(response.data)
    // One cell per row: the first <td> of every <tr> in the table body.
    const firstColumnCells = $("#constituents > tbody > tr > td:nth-child(1)")
    firstColumnCells.each((rowIndex, cell) => {
      if (rowIndex <= 9) {
        $(cell).children().each((childIndex, child) => {
          console.log($(child).text())
        })
      }
    })
  } catch (err) {
    console.error(err)
  }
}
(async () => {
  await read_fortune_500()
})();

Puppeteer : use second match with page.evaluate

i'm using puppeteer to retrieve datas online, and facing an issue.
Two functions have the same name and return serialized object, the first one returns an empty object, but the second one does contains the datas i'm targeting.
My question is, how can I proceed to select the second occurrence of the function instead of the first one, which returns an empty object?
Thanks.
My code :
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const Variants = require('./variants.js');
const Feedback = require('./feedback.js');
async function Scraper(productId, feedbackLimit) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
/** Scrape page for details */
await page.goto(`${productId}`);
// NOTE(review): page.evaluate() is called with no function argument, and
// .match() is a string method — this should operate on the page HTML
// (e.g. await page.content()) instead.
const data = (await page.evaluate()).match(/window.runParams = {"result/)
// NOTE(review): `data` is declared a second time with const here — this is
// a SyntaxError; the file will not even load.
const data = data.items
await page.close();
await browser.close();
console.log(data);
return data;
}
module.exports = Scraper;
Website source code :
window.runParams = {};
window.runParams = {"resultCount":19449,"seoFeaturedSnippet":};
Please try this, it should work.
// Grab the rendered page HTML, then iterate every occurrence of the literal
// "window.runParams" so the caller can pick the second assignment (the one
// that actually carries the data).
const data = await page.content();
// Dots are escaped so we match the literal property path, not "any char".
const regexp = /window\.runParams/g;
// BUG FIX: the original called string.matchAll(), but no variable named
// `string` exists — the HTML we want to search is in `data`.
const matches = data.matchAll(regexp);
for (const match of matches) {
  console.log(match);
  // match.index is the offset of this occurrence within the HTML.
  console.log(match.index)
}

All my scraped text ends up in one big object instead of separate objects with Cheerio

I'm following a web scraping course that uses Cheerio. I practice on a different website than the one used in the course, and now I run into the problem that all my scraped text ends up in one big object. But every title should end up in its own object. Can someone see what I did wrong? I already bumped my head for 2 hours on this problem.
const request = require('request-promise');
const cheerio = require('cheerio');
const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];
async function scrapeHuurgoed() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
// NOTE(review): "div.aanbod" is the single wrapper around ALL listings, so
// this .each() runs only once and every title/characteristic is
// concatenated into one object — iterate the individual "div.item"
// elements instead.
$("div.aanbod").each((index, element) => {
const result = $(element).children(".item");
const title = result.find("h2").text().trim();
const characteristics = result.find("h4").text();
const scrapeResult = {title, characteristics};
scrapeResults.push(scrapeResult);
});
console.log(scrapeResults);
} catch(err) {
console.error(err);
}
}
scrapeHuurgoed();
This is the link to the repo: https://github.com/danielkroon/huurgoed-scraper/blob/master/index.js
Thanks!
That is because of the way you used selectors. I've modified your script to fetch the content as you expected. Currently the script is collecting titles and characteristics. Feel free to add the rest within your script.
This is how you can get the required output:
const request = require('request-promise');
const cheerio = require('cheerio');
const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

// Download the listings page and collect a {title, characteristics} object
// for every "div.item" card into scrapeResults, then print the array.
async function scrapeHuurgoed() {
  try {
    const html = await request.get(url);
    const $ = cheerio.load(html);
    $("div.item").each((idx, card) => {
      const item = $(card);
      const title = item.find(".kenmerken > h2").text().trim();
      const characteristics = item.find("h4").text().trim();
      scrapeResults.push({ title, characteristics });
    });
    console.log(scrapeResults);
  } catch (err) {
    console.error(err);
  }
}

scrapeHuurgoed();

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');
// Question code: tries the jQuery-style :contains selector in cheerio.
function rei(product) {
//Request page from rei.com and follow the redirect
return needle("get", "https://rei.com/product/" + product, {
follow_max: 5
}).then(function(response) {
var $ = cheerio.load(response.body);
// NOTE(review): cheerio only parses the raw HTML in response.body —
// content rendered client-side in the browser is not present here, which
// is why this selector works in Chrome's console but yields undefined.
var test = $("th:contains(Weight)").parent()[0];
console.log(test);
}).catch(function(error) {
console.log(error);
})
};
rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this:
var needle = require('needle');
var cheerio = require('cheerio');
var fs = require('fs');

// Look up a product's Weight spec on rei.com. The page embeds its spec data
// as JSON inside a <script data-client-store="product-details"> tag, so we
// parse that JSON instead of querying the rendered table (which cheerio
// never sees, because it is produced client-side).
function rei(product) {
  //Request page from rei.com and follow the redirect
  return needle("get", "https://rei.com/product/" + product, {
    follow_max: 5
  }).then(function(response) {
    var $ = cheerio.load(response.body);
    // your data in script
    var content = $('script[data-client-store="product-details"]').html();
    // .html() yields null when the tag is missing; fail with a clear message
    // instead of the confusing TypeError that JSON.parse(null).specs causes.
    if (content === null) {
      console.log('product-details script tag not found');
      return;
    }
    var details = JSON.parse(content);
    for (const spec of details.specs) {
      // Strict comparison: spec.name is a string from the parsed JSON.
      if (spec.name === 'Weight') {
        console.log(spec.values)
      }
    }
  }).catch(function(error) {
    console.log(error);
  })
};
rei(893905);

nodejs request running last

I am trying to figure out how to make the callback function in request run in order. Currently, my loop runs 10 times but does not wait for the callback function in request to finish before moving to the next iteration. My output is nothing like what I'd expect it to be and I'm not sure why certain things are being printed before others. Here is how my code is as of now:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var app = express();
var url;
// NOTE(review): request() is asynchronous — all 10 iterations fire
// immediately and the callbacks run later, in whatever order responses
// arrive. The shared `url` variable is overwritten by the loop, so by the
// time any callback logs it, it always holds the LAST value (tid=9).
for(var i=0; i < 10; i++ ){
url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
request(url, function(err, resp, body){
console.log("hello");
var $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
return;
}else{
console.log($('.error').text().substring(0, 14) );
var pfname = $('.pfname');
var plname = $('.plname');
var professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
console.log(url);
return;
}
});
}
Here is the output I am getting:
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
Michael Beeson
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
hello
Sami Khuri
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
hello
aaa aaa
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
Here is the proper output:
aaa aaa
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1
Sami Khuri
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=2
Michael Beeson
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=3
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
There are multiple issues in your code, but the main issue is that you're running an async operation inside the for loop so your for loop will start all the async operations and then they will, one-by-one complete later. Any variables shared between the loop invocations will tromp one another.
So, in a nutshell, I did:
Removed all shared variables so each loop invocation has its own variables (no conflicts).
Switched over to request-promise so we can use Promise.all() to more easily tell us when they are all done.
Returned the value we want from each .then() handler so that will be collected by Promise.all() as the final values for each invocation of the loop.
Because there appears to be no reason to sequence your operations, I let them all run in a parallel (that's faster) and then let Promise.all() put the results in order for us in the final array of results.
Here's the code:
const express = require('express');
const path = require('path');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const app = express();

// Kick off all ten page fetches in parallel; each promise resolves to the
// professor's name, or null when the page does not exist.
const promises = Array.from({ length: 10 }, (_, i) => {
  const url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
  return rp(url).then((body) => {
    console.log(url);
    const $ = cheerio.load(body);
    if ($('.error').text().substring(0, 14) == "Page Not Found") {
      console.log("sorry page not found");
      return null;
    }
    console.log($('.error').text().substring(0, 14) );
    const firstName = $('.pfname');
    const lastName = $('.plname');
    const professorName = firstName.text().replace(/\s/g, '') + " " + lastName.text().replace(/\s/g, '');
    console.log(professorName);
    return professorName;
  });
});

// see when they are all done
Promise.all(promises).then((results) => {
  // array of results, some entries that were not found may be null
  console.log(results);
}).catch((err) => {
  console.log(err);
});
If you want to sequence them one at a time so the second request doesn't start until the first one is done, that could be done like this using async/await:
const express = require('express');
const path = require('path');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const app = express();

// Sequential variant: request tid=0..9 one at a time, so each request only
// starts after the previous one has finished. Resolves to an array of
// professor names, with null for missing pages or failed requests.
async function run() {
  const results = [];
  for (let i = 0; i < 10; i++) {
    const url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
    try {
      const body = await rp(url);
      console.log("hello");
      const $ = cheerio.load(body);
      if ($('.error').text().substring(0, 14) == "Page Not Found") {
        console.log("sorry page not found");
        results.push(null);
        continue;
      }
      console.log($('.error').text().substring(0, 14) );
      const firstName = $('.pfname');
      const lastName = $('.plname');
      const professorName = firstName.text().replace(/\s/g, '') + " " + lastName.text().replace(/\s/g, '');
      console.log(professorName);
      console.log(url);
      results.push(professorName);
    } catch (e) {
      console.log(url, e);
      results.push(null);
    }
  }
  return results;
}

run().then((results) => {
  console.log(results);
}).catch((err) => {
  console.log(err);
});
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.

Resources