scraping a page that redirects - node.js

i try to scrape a simple page (require cheerio and request):
https://www.ishares.com/uk/individual/en/products/251824/
The code fails. I believe it is because, in order to get to the above, users are prompted on previous page for "individual" or "institutional" so are being redirected.
I have tried different variations of the url, but all fail.
how can i get the raw HTML using node.js ?
here is the code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio'); // fast flexible implement of jQuery for server.
var fs = require('fs');
var app = express();
var port = 8000;
var timeLog = []; // for dl to measure the time of events.
// var startTime = Date.now();
timeLog[0] = Date.now();
console.log('program initiated at time: '+new Date());
// example 1: pull the webpage and print to console
var url ="https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf";
url = "https://www.ishares.com/uk/individual/en/products/251824/";
url="https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";
request(url,function functionName(err,resp,body) {
var $ = cheerio.load(body);
var distYield = $('.col-distYield');
var distYieldText = distYield.text();
console.log('we got to line 24');
console.log(distYieldText);
timeLog[2] = Date.now();
console.log('data capture time: '+(timeLog[2] - timeLog[0])/1000+' seconds');
if (err) {
console.log(err);
}else {
//console.log(body);
console.log('the body was written: success');
}
});
// example 2: download webpage and save file
var destination = fs.createWriteStream('./downloads/iSharesSEMB.html');
request(url)
.pipe(destination);
// example 3:
var destination = fs.createWriteStream('./downloads/iSharesSEMB2.html');
request(url)
.pipe(destination)
.on("finish",function () {
console.log('done');
})
.on('error',function (err) {
console.log(err);
});
timeLog[1] = Date.now();
console.log('program completed at time: '+new Date());
console.log('Asynchronous program run time: '+(timeLog[1] - timeLog[0])/1000+' seconds');

Alright, I got it to work. I enabled cookie support for request but then got into a redirect loop. Adding a promise worked it out. Here's only the relevant HTML request part:
const request = require('request'),
cheerio = require('cheerio');
const url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";
options = {
jar: true
}
const getDistYield = url => {
return new Promise((resolve, reject) => {
request(url, options, function(err,resp,body) {
if (err) reject(err);
let $ = cheerio.load(body);
resolve($('.col-distYield'));
})
})
}
getDistYield(url)
.then((tag) => {
console.log(tag.text())
}).catch((e) => {
console.error(e)
})
Outputs:
Distribution Yield
The distribution yield represents the ratio of distributed income over the last 12 months to the fund’s current NAV.
as of 20-Feb-2018
4.82
Also, notice I've used the last URL you provided.
I hope this works it out for you :)

have amended the resolve part to just get the value (and not the text) which is a nested class.
resolve($('.col-distYield > span:nth-child(2)'));

Related

Send Multiple HTTP Request with Axios

I am trying to crawl this website to get university names using Node JS Axios. I notice that the website uses Paginated API so to crawl all the university name I have to send multiple requests.
const url = 'https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table&_page=1;
const url = 'https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table&_page=2;
const url = 'https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table&_page=3;
...
const url = 'https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table&_page=55;
I have written code to crawl only one page. I do not know how to crawl more than 1 page.
Here is my code
const axios = require('axios');
const cheerio = require('cheerio');
var request = require('request');
fs = require('fs');
_sort=rank&_sortDirection=asc&study=Engineering";
// table view
page= 1;
const url = 'https://www.usnews.com/best-colleges/search?_sort=rank&_sortDirection=asc&study=Engineering&_mode=table&_page=' +page;
fetchData(url).then((res) => {
const html = res.data;
const $ = cheerio.load(html);
const unilist = $('.TableTabular__TableContainer-febmbj-0.guaRKP > tbody > tr >td ');
unilist.each(function() {
let title = $(this).find('div').attr("name");
if (typeof(title) == 'string') {
console.log(title);
fs.appendFileSync('universityRanking.txt', title+'\n', function (err) {
if (err) return console.log(err);
});
}
});
})
async function fetchData(url){
console.log("Crawling data...")
// make http call to url
let response = await axios(url).catch((err) => console.log(err));
if(response.status !== 200){
console.log("Error occurred while fetching data");
return;
}
return response;
}
I would like help on how to make 55 Axios requests? I checked that the page has 55 pages. I need to append all the university name from each page to a text file.
The axios.all() method can help your use case.
axios.all([]) // Pass the array of axios requests for all the 55 pages here
.then({
// Multiple requests complete
});

Get express/node to loop through request sent to NOAA API

So I am making a kind of API middleware for my company that will grab information from the NOAA API and then store in in my database. It does more then but that a separate part. I have set it up so that it works it will get the information and store it in my sql database perfectly The issue is the information I get is based off of zipcode. One request is the information for one zipcode. I need to be able to 'loop" through a list of zipcode one at a time and store the information in the database. I am not sure how to properly get it to work. I have tested a couple of ways but have not been able to get it to work so if someone can get me pointed in the right direction it would be appreciated.
Sorry in advance my code is not cleaned up.
Everything below apiRequest.end() has little function for the question. I keep it for context.
let mysql = require('mysql');
let config = require('./config.js');
var https = require("https");
var express = require("express");
var app = express();
const port = 3000;
var fs= require('fs');
var csv = require('fast-csv');
//last test
//array will replace this zip variable
let zip = '90012';
api(zip);
function api(zips){
//All of the parts for building the get requests url
app.get("/", function(req, response) {
var apiKey = "gPaEVizejLlbRVbXexyWtXYkfkWkoBhd";
let webapi = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data?';
let datasetid="datasetid=GHCND";
let datatypeid="&datatypeid=TMAX";
let location="&locationid=ZIP:";
const zipcode = zips;
let startdate="&startdate=2019-01-01";
let enddate="&enddate=2020-01-01";
let units = "&units=standard";
let limit="&limit=1000";
let url = webapi + datasetid + datatypeid + location + zipcode + startdate + enddate + units + limit;
var options = {
port: 443,
method: "GET",
headers: {
"token": apiKey
}
};
let data = "";
//request to grab from NOAA api
let apiRequest = https.request(url, options, function(res) {
console.log("Connected");
//grabing all data
res.on("data", chunk => {
data += chunk;
});
res.on("end", () => {
console.log("data collected");
//Format JSON data
response.send(JSON.parse(data));
var getData = JSON.parse(data);
if(isEmpty(getData)){
emptyCorrect();
}
dataFormat(getData);
});
});
apiRequest.end();
});
//fix format for date Can add more formating if needed here
function dataFormat(formData){
for(x in formData.results){
let date = formData.results[x].date;
formData.results[x].date = date.slice(0,10);
}
jsonToSQL(formData.results);
}
//test function is going to be used for inserting the zip
function test(){
var content = "";
console.log("your test worked see ***************");
return "92507";
}
//function to add grabed JSON data into the SQL database
function jsonToSQL(datafin){
var zipcode = zips;
let connection = mysql.createConnection(config);
// insert statment
let stmt = `INSERT INTO test1(ZIPCODE,DATE, TEMP) VALUES ? `;
let values = [];
for(let x in datafin){
values.push([zipcode,datafin[x].date,datafin[x].value]);
}
// execute the insert statment
connection.query(stmt, [values], (err, results, fields) => {
if (err) {
return console.error("error");
}
// get inserted rows
console.log('Row inserted:' + results.affectedRows);
});
// close the database connection
connection.end();
}
function emptyCorrect(){
console.log("Eror correction");
var zipcode = zips;
let connection = mysql.createConnection(config);
// insert statment
let stmt = `INSERT INTO test1(ZIPCODE,DATE, TEMP) VALUES ? `;
let valueE = [];
valueE.push([zipcode,"0","No Data"]);
// execute the insert statment
connection.query(stmt, [valueE], (err, results, fields) => {
if (err) {
return console.error("error");
}
// get inserted rows
console.log('Row inserted:' + results.affectedRows);
});
// close the database connection
connection.end();
}
function isEmpty(obj) {
for(var key in obj) {
if(obj.hasOwnProperty(key))
return false;
}
return true;
}
app.listen(port, () => console.log(`Example app listening on port ${port}!`))
}
As I understand your problem can roughly be summarized as "How to loop through asynchronous evaluations in Nodejs".
There are some options for you. I would recommend wrapping call to the NOAA API with a promise and then chain those promises. This can be done as follows:
app.get('/', async function(req, response) {
var apiKey = 'some value';
let webapi = 'https://www.ncdc.noaa.gov/cdo-web/api/v2/data?';
let datasetid = 'datasetid=GHCND';
let datatypeid = '&datatypeid=TMAX';
let location = '&locationid=ZIP:';
let startdate = '&startdate=2019-01-01';
let enddate = '&enddate=2020-01-01';
let units = '&units=standard';
let limit = '&limit=1000';
var options = {
port: 443,
method: 'GET',
headers: {
token: apiKey
}
};
const zipCodes = ['90012', '90013']; // Place a call to your function for fetching zip codes here
let datas = [];
prom = Promise.resolve();
zipCodes.forEach(zipcode => {
prom = prom.then(() =>
new Promise((resolve, reject) => {
let url =
webapi +
datasetid +
datatypeid +
location +
zipcode +
startdate +
enddate +
units +
limit;
let apiRequest = https.request(url, options, function(res) {
console.log('Connected');
let data = '';
res.on('data', chunk => {
data += chunk;
});
res.on('end', () => {
console.log('data collected for zip ' + zipcode);
datas.push(data);
resolve();
});
});
apiRequest.end();
})
);
});
prom.then(() => {
// All requests have now been handled sequentially
response.send(/* You'll need to figure out what to do here */);
});
});
An alternative is to use something like the async library for dealing with sequentially calling callbacks. The async library (https://github.com/caolan/async) describes itself as:
Async is a utility module which provides straight-forward, powerful functions for working with asynchronous JavaScript.
See e.g. Node.js: How do you handle callbacks in a loop? for a similar problem (not with regards to callign an API, but dealing with asynchronous function in a loop).

Requests suddenly not working despite no apparent change from scraping source or code

I am currently trying to get better at scraping in JS and use request and cheerio. About two weeks ago I got a basic amazon scrape to work but this morning when I loaded my files it's no longer working. I made sure Cheerio and Request was installed on node and tried picking up requests from wikipedia and it worked fine. On Amazon my original source the code no longer works. Nothing on their webpage seems to have changed so I have no clue why none of my targets are working.
const request = require('request');
const cheerio = require('cheerio');
request(`http://amazon.com/dp/B07R7DY911`, (error,response,html) =>{
if (!error && response.statusCode ==200) {
const $ = cheerio.load(html);
const productTitle = $("#productTitle").html()
const price = $("#priceblock_ourprice").text();
const rating = $('#centerCol #acrPopover').text().replace(/\s\s+/g, '');
const numReviews = $('#centerCol #acrCustomerReviewText').text().replace(/\s\s+/g, '');
const prodImg = $('#landingImage').attr('data-old-hires');
console.log(productTitle);
console.log(price);
console.log(rating);
console.log(numReviews);
console.log(prodImg)
} else {
console.log(error);
}
})
Some playing around and I get null and undefined where I simply didn't before.
Help me stack overflow. You're my only hope!
Update:
Switched code to axios. Much better now.
app.get("/",(req,res)=>{
axios.get(`${link}`)
.then((response)=> {
const html = response.data;
const $ = cheerio.load(html);
const productName = $("#productTitle").html().replace(/\s\s+/g, '');
const amznPrice = $("#priceblock_ourprice").text();
const rating = $('#centerCol #acrPopover').text().replace(/\s\s+/g, '');
const numReviews = $('#centerCol #acrCustomerReviewText').text().replace(/\s\s+/g, '');
const prodImg = $('#landingImage').attr('data-old-hires');
res.render("home", {
productTitle: productName,
price:amznPrice,
prod_Img:prodImg,
azLink:links,
});
});
});
It appears that you're getting a compressed output in a format that the request() library does not understand. If you add the gzip: true option in the request() call, then the code starts working for me.
const request = require('request');
const cheerio = require('cheerio');
request({url: 'http://amazon.com/dp/B07R7DY911', gzip: true}, (error,response,html) => {
if (!error && response.statusCode == 200) {
const $ = cheerio.load(html);
const productTitle = $("#productTitle").html()
const price = $("#priceblock_ourprice").text();
const rating = $('#centerCol #acrPopover').text().replace(/\s\s+/g, '');
const numReviews = $('#centerCol #acrCustomerReviewText').text().replace(/\s\s+/g, '');
const prodImg = $('#landingImage').attr('data-old-hires');
console.log("productTitle", productTitle);
console.log("price", price);
console.log("rating", rating);
console.log("numReviews", numReviews);
console.log("prodImg", prodImg)
} else {
console.log(error);
}
});

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');
function rei(product) {
//Request page from rei.com and follow the redirect
return needle("get", "https://rei.com/product/" + product, {
follow_max: 5
}).then(function(response) {
var $ = cheerio.load(response.body);
var test = $("th:contains(Weight)").parent()[0];
console.log(test);
}).catch(function(error) {
console.log(error);
})
};
rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this:
var needle = require('needle');
var cheerio = require('cheerio');
var fs = require('fs');
function rei(product) {
//Request page from rei.com and follow the redirect
return needle("get", "https://rei.com/product/" + product, {
follow_max: 5
}).then(function(response) {
var $ = cheerio.load(response.body);
// your data in script
var content = $('script[data-client-store="product-details"]').html();
content = JSON.parse(content);
for (var spec of content.specs) {
if (spec.name == 'Weight') {
console.log(spec.values)
}
}
}).catch(function(error) {
console.log(error);
})
};
rei(893905);

nodejs request running last

I am trying to figure out how to make the callback function in request run in order. Currently, my loop runs 10 times but does not wait for the callback function in request to finish before moving to the next iteration. My output is nothing like what I'd expect it to be and I'm not sure why certain things are being printed before others. Here is how my code is as of now:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var app = express();
var url;
for(var i=0; i < 10; i++ ){
url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
request(url, function(err, resp, body){
console.log("hello");
var $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
return;
}else{
console.log($('.error').text().substring(0, 14) );
var pfname = $('.pfname');
var plname = $('.plname');
var professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
console.log(url);
return;
}
});
}
Here is the output I am getting:
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
sorry page not found
hello
Michael Beeson
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
hello
Sami Khuri
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
hello
aaa aaa
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=9
Here is the proper output:
aaa aaa
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=1
Sami Khuri
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=2
Michael Beeson
http://www.ratemyprofessors.com/ShowRatings.jsp?tid=3
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
sorry page not found
There are multiple issues in your code, but the main issue is that you're running an async operation inside the for loop so your for loop will start all the async operations and then they will, one-by-one complete later. Any variables shared between the loop invocations will tromp one another.
So, in a nutshell, I did:
Removed all shared variables so each loop invocation has its own variables (no conflicts).
Switched over to request-promise so we can use Promise.all() to more easily tell us when they are all done.
Returned the value we want from each .then() handler so that will be collected by Promise.all() as the final values for each invocation of the loop.
Because there appears to be no reason to sequence your operations, I let them all run in a parallel (that's faster) and then let Promise.all() put the results in order for us in the final array of results.
Here's the code:
const express = require('express');
const path = require('path');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const app = express();
let promises = [];
for (let i = 0; i < 10; i++ ) {
let url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
promises.push(rp(url).then(function(body) {
console.log(url);
let $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
return null;
} else {
console.log($('.error').text().substring(0, 14) );
let pfname = $('.pfname');
let plname = $('.plname');
let professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
return professorName;
}
}));
}
// see when they are all done
Promise.all(promises).then(results => {
// array of results, some entries that were not found may be null
console.log(results);
}).catch(err => {
console.log(err);
});
If you want to sequence them one at a time so the second request doesn't start until the first one is done, that could be done like this using async/await:
const express = require('express');
const path = require('path');
const rp = require('request-promise');
const cheerio = require('cheerio');
const fs = require('fs');
const app = express();
async function run() {
let results = [];
for (let i = 0; i < 10; i++ ) {
let url = "http://www.ratemyprofessors.com/ShowRatings.jsp?tid=" + i;
try {
let body = await rp(url);
console.log("hello");
let $ = cheerio.load(body);
if($('.error').text().substring(0, 14) == "Page Not Found"){
console.log("sorry page not found");
results.push(null);
} else {
console.log($('.error').text().substring(0, 14) );
let pfname = $('.pfname');
let plname = $('.plname');
let professorName = pfname.text().replace(/\s/g, '') + " " +plname.text().replace(/\s/g, '');
console.log(professorName);
console.log(url);
results.push(professorName);
}
} catch(e) {
console.log(url, e);
results.push(null);
}
}
return results;
}
run().then(results => {
console.log(results);
}).catch(err => {
console.log(err);
});
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.

Resources