Web scraping & web navigation simulation - Node.js

I'm making a web scraper and I already know how to scrape some data and convert it to JSON with this code I made:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

var url = 'http://www.footmercato.net/';
request(url, function(err, resp, body) {
    if (!err) {
        var $ = cheerio.load(body);
        var data = [];
        $('.text').each(function(i, element) {
            // scope the lookup to the current element instead of the whole page
            var article = $(element).find('p');
            var jsObject = { title: "", article: "", date: "" };
            jsObject.article = article.text();
            data.push(jsObject);
        });
        // stringify once; stringifying the resulting string again would escape it
        fs.writeFile('output.json', JSON.stringify(data, null, 4), function(err) {
            if (!err) {
                console.log('File successfully written!');
            }
        });
    }
});
app.listen(8080);
But I would like to navigate the website I'm scraping, fill out forms, and go to other pages.
Does somebody know if I can do that with cheerio, or how I can add it to my existing code?
Thanks

You can use webdriverio. It will actually open a browser window, and then you can manipulate the DOM through the webdriverio API to handle forms and mouse clicks, and navigate from one page to another.
var webdriverio = require('webdriverio');
var options = {
    desiredCapabilities: {
        browserName: 'firefox'
    }
};

webdriverio
    .remote(options)
    .init()
    .url('http://www.google.com')
    .getTitle().then(function(title) {
        console.log('Title was: ' + title);
    })
    .end();
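For the form-filling part of your question, the same chained API can type into fields, click buttons, and follow the resulting navigation. A minimal sketch, assuming webdriverio's setValue/click/getUrl commands; the URL and selectors are placeholders:
webdriverio
    .remote(options)
    .init()
    .url('http://www.example.com/search')           // placeholder URL
    .setValue('input[name="q"]', 'some query')      // fill in a form field
    .click('button[type="submit"]')                 // submit the form
    .getUrl().then(function(currentUrl) {
        console.log('Navigated to: ' + currentUrl); // we are on the next page now
    })
    .end();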

Related

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from a product page on rei.com (the URL is built in the code below).
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
    // Request the page from rei.com and follow the redirect
    return needle("get", "https://rei.com/product/" + product, {
        follow_max: 5
    }).then(function(response) {
        var $ = cheerio.load(response.body);
        var test = $("th:contains(Weight)").parent()[0];
        console.log(test);
    }).catch(function(error) {
        console.log(error);
    });
}

rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
    // Request the page from rei.com and follow the redirect
    return needle("get", "https://rei.com/product/" + product, {
        follow_max: 5
    }).then(function(response) {
        var $ = cheerio.load(response.body);
        // the data you want is embedded as JSON in a script tag
        var content = $('script[data-client-store="product-details"]').html();
        content = JSON.parse(content);
        for (var spec of content.specs) {
            if (spec.name == 'Weight') {
                console.log(spec.values);
            }
        }
    }).catch(function(error) {
        console.log(error);
    });
}

rei(893905);
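The :contains lookup that works in Chrome finds nothing in cheerio because the spec table appears to be built client-side from that embedded JSON; cheerio only sees the raw HTML. If you want the weight back as a value rather than a console log, the same approach can resolve it through the promise chain. A sketch reusing the requires above, assuming the same script tag and spec layout:
function getWeight(product) {
    return needle("get", "https://rei.com/product/" + product, { follow_max: 5 })
        .then(function(response) {
            var $ = cheerio.load(response.body);
            var content = JSON.parse($('script[data-client-store="product-details"]').html());
            // find the spec named 'Weight' and hand back its values
            var spec = content.specs.find(function(s) { return s.name === 'Weight'; });
            return spec ? spec.values : null;
        });
}

getWeight(893905).then(function(values) {
    console.log(values);
});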

scraping a page that redirects

I'm trying to scrape a simple page (using cheerio and request):
https://www.ishares.com/uk/individual/en/products/251824/
The code fails. I believe it is because, in order to get to the page above, users are prompted on a previous page to choose "individual" or "institutional", so they are being redirected.
I have tried different variations of the URL, but they all fail.
How can I get the raw HTML using Node.js?
Here is the code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio'); // fast, flexible implementation of jQuery for the server
var fs = require('fs');
var app = express();
var port = 8000;
var timeLog = []; // to measure the timing of events
timeLog[0] = Date.now();
console.log('program initiated at time: ' + new Date());

// example 1: pull the webpage and print to console
var url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf";
url = "https://www.ishares.com/uk/individual/en/products/251824/";
url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";

request(url, function(err, resp, body) {
    if (err) {
        console.log(err);
        return;
    }
    var $ = cheerio.load(body);
    var distYield = $('.col-distYield');
    var distYieldText = distYield.text();
    console.log(distYieldText);
    timeLog[2] = Date.now();
    console.log('data capture time: ' + (timeLog[2] - timeLog[0]) / 1000 + ' seconds');
    console.log('the body was written: success');
});

// example 2: download the webpage and save to a file
var destination = fs.createWriteStream('./downloads/iSharesSEMB.html');
request(url)
    .pipe(destination);

// example 3: same, with finish/error handlers
var destination2 = fs.createWriteStream('./downloads/iSharesSEMB2.html');
request(url)
    .pipe(destination2)
    .on("finish", function() {
        console.log('done');
    })
    .on('error', function(err) {
        console.log(err);
    });

timeLog[1] = Date.now();
console.log('program completed at time: ' + new Date());
console.log('Asynchronous program run time: ' + (timeLog[1] - timeLog[0]) / 1000 + ' seconds');
Alright, I got it to work. I enabled cookie support for request, but then got into a redirect loop; wrapping the request in a promise sorted it out. Here's only the relevant HTML request part:
const request = require('request'),
    cheerio = require('cheerio');

const url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";
const options = {
    jar: true // enable cookie support so the site-entry redirect sticks
};

const getDistYield = url => {
    return new Promise((resolve, reject) => {
        request(url, options, function(err, resp, body) {
            if (err) return reject(err);
            let $ = cheerio.load(body);
            resolve($('.col-distYield'));
        });
    });
};

getDistYield(url)
    .then((tag) => {
        console.log(tag.text());
    }).catch((e) => {
        console.error(e);
    });
Outputs:
Distribution Yield
The distribution yield represents the ratio of distributed income over the last 12 months to the fund’s current NAV.
as of 20-Feb-2018
4.82
Also, notice I've used the last URL you provided.
I hope this works out for you :)
I have amended the resolve part to just get the value (and not the label text), which sits in a nested element:
resolve($('.col-distYield > span:nth-child(2)'));
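As an aside, jar: true uses request's single shared cookie jar. If you want an isolated jar per scrape (for example, to hit both userType values in parallel), you can pass an explicit one; a sketch using request's jar() helper:
const j = request.jar();
request(url, { jar: j }, function(err, resp, body) {
    // cookies set by the siteEntryPassthrough redirect are kept in j,
    // so parallel scrapes with different userType values won't collide
});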

How to concurrently download files using cheerio and Node.js?

I have a website with multiple pages; each page lists download links which I want to scrape and download.
I have a few issues with it:
My script only downloads about 4-5 files and gets stuck.
I would like to download concurrently as many files as my machine can handle.
I got stuck on the maximum event emitters warning; I didn't understand why, so I just raised the limit.
How to follow redirects purely using the request module (without follow-redirects)?
How to download the file like the browser does, without specifying its name? There is no Content-Disposition header, but I think the browser follows redirects and the redirected URL has the filename in its path.
My current code looks like so:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs'); // needed for fs.createWriteStream below
var https = require('follow-redirects').https;
require('events').EventEmitter.prototype._maxListeners = 1000;

for (var i = 1; i <= 10000; i++) {
    (function(i) {
        var url = 'http://mywebsite.com/files?page=' + i;
        request(url, gotHTML);
    })(i);
}

function gotHTML(err, resp, html) {
    var $ = cheerio.load(html);
    $('.file-header').each(function() {
        var data = $(this);
        var fileLink = data.children().first().children().first().attr('href');
        var fileName = fileLink.substring(10);
        var downloadLink = 'https://mywebsite.com/api/download/' + fileName;
        download(downloadLink, function() {
            console.log('downloaded');
        });
    });
}

function download(url, cb) {
    var request = https.get(url, function(response) {
        var location = request.res.headers.location;
        console.log(location);
        location = location.split('/').pop(); // the redirected URL carries the filename in its path
        console.log(location);
        var file = fs.createWriteStream(location);
        response.pipe(file);
        file.on('finish', function() {
            file.close(cb);
        });
    });
}
The default HTTP/HTTPS Agent only uses a maximum of 5 sockets (maxSockets) for requests to the same origin, which matches your script stalling after about 4-5 downloads.
Try changing this:
var request = https.get(url, function(response) {
to this:
var options = require('url').parse(url);
options.agent = false; // or use a custom https.Agent with a higher `maxSockets`
var request = https.get(options, function(response) {
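For the custom-agent variant mentioned in the comment, something like this should work (a sketch; the maxSockets value of 20 is an arbitrary choice):
var https = require('https');

// one shared agent with a larger socket pool, instead of disabling pooling entirely
var downloadAgent = new https.Agent({ maxSockets: 20 });

function download(url, cb) {
    var options = require('url').parse(url);
    options.agent = downloadAgent; // all downloads share the bigger pool
    https.get(options, function(response) {
        // ... same piping logic as before
    });
}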

Node appendFile not appending data chunk to file on filesystem

I have a program that tries to get values from a request (sent using curl), store them in a file, and serve the stored content back. The decision to overwrite or append the contents of the file is based on a query parameter, appendFlag.
Now when I run this program, what I get in the console is "true" and "appending". This suggests that it indeed reads the flag and goes into the if branch, but somehow the appendFile function is not working.
var http = require('http');
var url = require('url');
var querystring = require('querystring');
var fs = require('fs');

http.createServer(function(request, response) {
    var str = request.url.split('?')[1];
    var query = querystring.parse(str);
    var writeStream = fs.createWriteStream(query['fileName']);
    console.log("query - ");
    console.log(query["appendFlag"]);
    request.on('data', function(chunk) {
        if (query["appendFlag"] == "true") {
            console.log("appending");
            fs.appendFile(query['fileName'], chunk.toString(), function(err) {
                if (err) throw err;
            });
        } else {
            var bufferGood = writeStream.write(chunk);
            if (!bufferGood) request.pause();
        }
    });
    request.on('end', function() {
        response.writeHead(200);
        response.write("\n Content with this url is - \n");
        var readStream = fs.createReadStream(query['fileName'], { bufferSize: 64 * 1024 });
        readStream.on('data', function(chunk) {
            response.write(chunk.toString());
        });
        readStream.on('end', function() {
            response.write("\n");
            response.end();
        });
    });
    writeStream.on('drain', function() {
        request.resume();
    });
}).listen(8080);
Then, after reading an answer on SO (How to create appending writeStream in Node.js), I tried:
// Program to extract the file name from the request URL and write to that file
var http = require('http');
var url = require('url');
var querystring = require('querystring');
var fs = require('fs');

http.createServer(function(request, response) {
    var str = request.url.split('?')[1];
    var query = querystring.parse(str);
    var writeStream = fs.createWriteStream(query['fileName']);
    options = {
        'flags': 'a',
        'encoding': null,
        'mode': 0666
    }
    var appendStream = fs.createWriteStream(query['fileName'], [options]);
    console.log("query - ");
    console.log(query["appendFlag"]);
    request.on('data', function(chunk) {
        if (query["appendFlag"] == "true") {
            console.log("appending");
            var bufferGood = appendStream.write(chunk.toString());
            if (!bufferGood) request.pause();
        } else {
            var bufferGood = writeStream.write(chunk.toString());
            if (!bufferGood) request.pause();
        }
    });
    request.on('end', function() {
        response.writeHead(200);
        response.write("\n Content with this url is - \n");
        var readStream = fs.createReadStream(query['fileName'], { bufferSize: 64 * 1024 });
        readStream.on('data', function(chunk) {
            response.write(chunk.toString());
        });
        readStream.on('end', function() {
            response.write("\n");
            response.end();
        });
    });
    writeStream.on('drain', function() {
        request.resume();
    });
}).listen(8080);
That is, I changed the flag to 'a', and it still did not append the data.
You can use your first variant. But before appendFile(), you've opened a writeStream for the same query['fileName'], so the file is already open (and truncated) here:
var writeStream = fs.createWriteStream(query['fileName']);
options = {
    'flags': 'a',
    'encoding': null,
    'mode': 0666
}
var appendStream = fs.createWriteStream(query['fileName'], [options]);
Maybe it's better to do something like:
var options = {
    flags: query.appendFlag === 'true' ? 'a' : 'w'
    ...
Next: why [options]? You should remove the brackets; createWriteStream expects an options object, not an array.
Next: there is no guarantee you'll have a fileName param in the query string. Please handle that situation.
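Putting those three points together, a corrected handler might look like this (a sketch; error handling kept minimal, and request.pipe replaces the manual pause/resume bookkeeping):
var http = require('http');
var querystring = require('querystring');
var fs = require('fs');

http.createServer(function(request, response) {
    var query = querystring.parse(request.url.split('?')[1]);
    if (!query.fileName) {
        // guard against a missing fileName param
        response.writeHead(400);
        return response.end('fileName query parameter is required');
    }
    // a single stream, opened with the right flag: 'a' appends, 'w' overwrites
    var stream = fs.createWriteStream(query.fileName, {
        flags: query.appendFlag === 'true' ? 'a' : 'w'
    });
    request.pipe(stream); // pipe handles backpressure (pause/resume) for us
    stream.on('finish', function() {
        response.writeHead(200);
        response.write("\n Content with this url is - \n");
        fs.createReadStream(query.fileName).pipe(response);
    });
}).listen(8080);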

How to create a global array in Node.js?

Do I need to keep each link in an array within request({..}) and then display it, or can I work with it outside of request({..})? This is my code, but it does not work. Any idea?
var request = require("request");
var cheerio = require("cheerio");

var arrayLinks = [];

request({
    uri: "http://www.some-url.com",
}, function(error, response, body) {
    var $ = cheerio.load(body);
    $("a").each(function() {
        var link = $(this);
        arrayLinks.push(link.attr("href"));
    });
});

arrayLinks.forEach(function(link) { console.log(link); });
For example:
var request = require("request");
var cheerio = require("cheerio");

var arrayLinks = [];

request({
    uri: "http://www.some-url.com",
}, function(error, response, body) {
    // Some logic.
    linkTheArray();
});

function linkTheArray() {
    arrayLinks.forEach(function(link) { console.log(link); });
}
Now you can run it after the request is done. There is one other way, but it is pretty ugly: you can run a timeout function that polls until there is some data in the array.
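Alternatively, you can wrap the request in a promise so the array is only consumed once it has been filled. A sketch using the same request/cheerio pair:
var request = require("request");
var cheerio = require("cheerio");

function getLinks(uri) {
    return new Promise(function(resolve, reject) {
        request({ uri: uri }, function(error, response, body) {
            if (error) return reject(error);
            var $ = cheerio.load(body);
            var arrayLinks = [];
            $("a").each(function() {
                arrayLinks.push($(this).attr("href"));
            });
            resolve(arrayLinks); // the array is complete by the time we resolve
        });
    });
}

getLinks("http://www.some-url.com").then(function(links) {
    links.forEach(function(link) { console.log(link); });
});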
