How to concurrently download files using cheerio and Node.js?

I have a website with multiple pages; each page lists download links which I want to scrape and download.
I have a few issues with it:
My script only downloads about 4-5 files and then gets stuck.
I would like to download as many files concurrently as my machine can handle.
I ran into the maximum-event-listeners warning; I don't understand why it happens, so I just raised the limit.
How do I follow redirects using the request module alone (without follow-redirects)?
How do I save a file under the name the browser would give it, without specifying the name myself? There is no Content-Disposition header, but I think the browser follows the redirect and the redirected URL has the filename in its path.
My current code looks like so:
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs'); // needed for createWriteStream below
var https = require('follow-redirects').https;
require('events').EventEmitter.prototype._maxListeners = 1000;

for (var i = 1; i <= 10000; i++) {
  (function (i) {
    var url = 'http://mywebsite.com/files?page=' + i;
    request(url, gotHTML);
  })(i);
}

function gotHTML(err, resp, html) {
  var $ = cheerio.load(html);
  $('.file-header').each(function () {
    var data = $(this);
    var fileLink = data.children().first().children().first().attr('href');
    var fileName = fileLink.substring(10);
    var downloadLink = 'https://mywebsite.com/api/download/' + fileName;
    download(downloadLink, function () {
      console.log('downloaded');
    });
  });
}

function download(url, cb) {
  var request = https.get(url, function (response) {
    // derive a filename from the redirect Location header
    var location = request.res.headers.location;
    console.log(location);
    location = location.split('/').pop();
    console.log(location);
    var file = fs.createWriteStream(location);
    response.pipe(file);
    file.on('finish', function () {
      file.close(cb);
    });
  });
}

The default HTTP/HTTPS Agent only uses a maximum of 5 sockets (maxSockets) for requests to the same origin. So this could be causing some issues for you.
Try changing this:
var request = https.get(url, function(response) {
to this:
var options = require('url').parse(url);
options.agent = false; // or use a custom https.Agent with a higher `maxSockets`
var request = https.get(options, function(response) {
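For reference, here is a minimal sketch of the custom-Agent variant mentioned above; the maxSockets value of 50 and the helper name are my own, not from the original answer:

var https = require('https');
var urlModule = require('url');

// One shared Agent for all downloads; raising maxSockets lifts the
// default cap of 5 concurrent connections per origin.
var agent = new https.Agent({ maxSockets: 50 });

function downloadWith(urlString, onResponse) {
  var options = urlModule.parse(urlString);
  options.agent = agent; // reuse the shared, wider connection pool
  return https.get(options, onResponse);
}

Note that this only widens the connection pool; with 10000 pages requested in a tight loop you will still queue a lot of work at once, so some form of batching is worth considering.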

Related

How to create a Video URL Blob in NodeJS?

Can you please help me create a Video URL Blob in NodeJS?
var xhr = new XMLHttpRequest();
xhr.responseType = 'blob';
xhr.onload = function() {
  var reader = new FileReader();
  reader.onloadend = function() {
    var byteCharacters = atob(reader.result.slice(reader.result.indexOf(',') + 1));
    var byteNumbers = new Array(byteCharacters.length);
    for (var i = 0; i < byteCharacters.length; i++) {
      byteNumbers[i] = byteCharacters.charCodeAt(i);
    }
    var byteArray = new Uint8Array(byteNumbers);
    var blob = new Blob([ byteArray ], { type: 'video/ogg' });
    var url = URL.createObjectURL(blob);
    console.log(url);
  };
  reader.readAsDataURL(xhr.response);
};
xhr.open('GET', '...Video URL...');
xhr.send();
Error Output:
throw new Error('cannot read as File: ' + JSON.stringify(file));
Error: cannot read as File: undefined
I have used the packages XMLHttpRequest, URL, FileReader and Blob
Please Help, Thanks
You can use Azure Cloud for storing the videos and then use Azure Blob Storage.
Node.JS does not have XMLHttpRequest, FileReader, Blob, etc. Instead, it uses the fs and http modules and other built-in classes like Buffer.
Trying to use these browser features on the server, even through packages that try to emulate them, may not always work; many of those libraries do not implement everything. For example, the xmlhttprequest node package does not support .response, only the text response.
In Node.JS there is no need for blob URLs (where would you even use one?); you just interact with the Buffers directly.
// making an HTTPS request to fetch something
const https = require("https");

const request = https.request({
  method: "GET",
  hostname: "example.com",
  path: "/video.ogg"
}, (res) => {
  const chunks = [];
  res.on("data", (chunk) => {
    chunks.push(chunk);
  });
  res.on("end", () => {
    // raw byte data
    const buffer = Buffer.concat(chunks);
    /*
     * Interact with the Buffer here
     */
  });
});
request.end();
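As a concrete example of interacting with the Buffer directly, you could persist it to disk; a small sketch meant to sit inside the "end" handler above, with the file name being a placeholder:

const fs = require("fs");

// inside the "end" handler, once the full body has been buffered:
fs.writeFile("video.ogg", buffer, (err) => {
  if (err) throw err;
  console.log("saved " + buffer.length + " bytes to video.ogg");
});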

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from a product page on rei.com (see the code below).
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
  // Request the page from rei.com and follow the redirect
  return needle("get", "https://rei.com/product/" + product, {
    follow_max: 5
  }).then(function (response) {
    var $ = cheerio.load(response.body);
    var test = $("th:contains(Weight)").parent()[0];
    console.log(test);
  }).catch(function (error) {
    console.log(error);
  });
}

rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
The row you see in Chrome is added client-side, so it never appears in the raw HTML that needle fetches; the product specs are embedded as JSON in a script tag instead. Try this:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
  // Request the page from rei.com and follow the redirect
  return needle("get", "https://rei.com/product/" + product, {
    follow_max: 5
  }).then(function (response) {
    var $ = cheerio.load(response.body);
    // the product data is embedded as JSON in a script tag
    var content = $('script[data-client-store="product-details"]').html();
    content = JSON.parse(content);
    for (var spec of content.specs) {
      if (spec.name == 'Weight') {
        console.log(spec.values);
      }
    }
  }).catch(function (error) {
    console.log(error);
  });
}

rei(893905);

scraping a page that redirects

I am trying to scrape a simple page (using cheerio and request):
https://www.ishares.com/uk/individual/en/products/251824/
The code fails. I believe it is because, in order to reach the page above, users are first asked on a previous page whether they are "individual" or "institutional", so they are being redirected.
I have tried different variations of the URL, but all fail.
How can I get the raw HTML using Node.js?
Here is the code:
var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio'); // fast, flexible implementation of jQuery for the server
var fs = require('fs');

var app = express();
var port = 8000;
var timeLog = []; // to measure the timing of events
timeLog[0] = Date.now();
console.log('program initiated at time: ' + new Date());

// example 1: pull the webpage and print to console
var url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf";
url = "https://www.ishares.com/uk/individual/en/products/251824/";
url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";

request(url, function (err, resp, body) {
  if (err) {
    console.log(err);
    return;
  }
  var $ = cheerio.load(body);
  var distYield = $('.col-distYield');
  var distYieldText = distYield.text();
  console.log(distYieldText);
  timeLog[2] = Date.now();
  console.log('data capture time: ' + (timeLog[2] - timeLog[0]) / 1000 + ' seconds');
  console.log('the body was written: success');
});

// example 2: download the webpage and save it to a file
var destination = fs.createWriteStream('./downloads/iSharesSEMB.html');
request(url).pipe(destination);

// example 3: same, but with finish/error handlers
var destination2 = fs.createWriteStream('./downloads/iSharesSEMB2.html');
request(url)
  .pipe(destination2)
  .on('finish', function () {
    console.log('done');
  })
  .on('error', function (err) {
    console.log(err);
  });

timeLog[1] = Date.now();
console.log('program completed at time: ' + new Date());
console.log('Asynchronous program run time: ' + (timeLog[1] - timeLog[0]) / 1000 + ' seconds');
Alright, I got it to work. I enabled cookie support for request, but then got into a redirect loop; wrapping the request in a promise sorted it out. Here is just the relevant HTTP request part:
const request = require('request'),
  cheerio = require('cheerio');

const url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";
const options = {
  jar: true // enable cookie support
};

const getDistYield = url => {
  return new Promise((resolve, reject) => {
    request(url, options, function (err, resp, body) {
      if (err) return reject(err);
      let $ = cheerio.load(body);
      resolve($('.col-distYield'));
    });
  });
};

getDistYield(url)
  .then((tag) => {
    console.log(tag.text());
  }).catch((e) => {
    console.error(e);
  });
Outputs:
Distribution Yield
The distribution yield represents the ratio of distributed income over the last 12 months to the fund’s current NAV.
as of 20-Feb-2018
4.82
Also, notice I've used the last URL you provided.
I hope this works it out for you :)
I have amended the resolve part to get just the value (and not the label text), which sits in a nested element:
resolve($('.col-distYield > span:nth-child(2)'));
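As an aside, if the cookies need to survive across more than one call (for instance one request to pass the site-entry screen and a second for the data page), request also accepts an explicit jar in place of jar: true; a small sketch, with the flow assumed rather than taken from the answer:

var jar = request.jar(); // a dedicated cookie jar shared across calls

request({ url: url, jar: jar }, function (err, resp, body) {
  // cookies set by this response are stored in `jar` and will be
  // sent again on any later request that passes the same jar
});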

WebScraping & web navigation simulation

I'm making a web scraper and I already know how to scrape some data and convert it to JSON with this code I made:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();
var url = 'http://www.footmercato.net/';

request(url, function (err, resp, body) {
  if (!err) {
    var $ = cheerio.load(body);
    var data = [];
    $('.text').each(function (i, element) {
      // scope the lookup to the current element rather than the whole page
      var article = $(element).find('p');
      var jsObject = { title: "", article: "", date: "" };
      jsObject.article = article.text();
      data.push(jsObject);
    });
    fs.writeFile('output.json', JSON.stringify(data, null, 4), function (err) {
      console.log('File successfully written!');
    });
  }
});
app.listen('8080');
But I would like to navigate around the website I'm scraping, fill out forms, and go to other pages.
Does somebody know if I can do this with cheerio, or how I can add it to my existing code?
Thanks
You can use webdriverio: it will actually open a browser window, and you can then manipulate the DOM through the webdriverio API to handle forms and mouse clicks, and to navigate from one page to another.
var webdriverio = require('webdriverio');
var options = {
  desiredCapabilities: {
    browserName: 'firefox'
  }
};

webdriverio
  .remote(options)
  .init()
  .url('http://www.google.com')
  .getTitle().then(function (title) {
    console.log('Title was: ' + title);
  })
  .end();
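Filling out a form works with the same chained API; a rough sketch, where the URL and the selectors are made up for illustration:

webdriverio
  .remote(options)
  .init()
  .url('http://www.example.com/search')
  .setValue('input[name="q"]', 'football news') // type into the search field
  .click('button[type="submit"]')               // submit the form
  .getTitle().then(function (title) {
    console.log('Landed on: ' + title);
  })
  .end();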

How can I pipe external html content in jade?

I'm trying to get some HTML from a page online and place it inside my jade template, so I can style it without copying and pasting every time I need it.
var request = require("request");
var cheerio = require("cheerio");
var loadContent = function() {
request({
uri: "http://www.mywebsite.com.br/test"
}, function(error, response, body) {
var $ = cheerio.load(body);
var result;
$('.content').each(function(){
result={"content":$(this).html()};
});
placeContent(result);
return true;
});
};
var placeContent = function(content) {
return content;
};
module.exports = loadContent;
Inside my gulpfile.js, besides the right requirements, I have:
gulp.task('jadeBuild', function() {
  var options = {
    pretty: true
  };
  return gulp.src(src + '/*.jade')
    .pipe(data(function() {
      return loadContent();
    }))
    .pipe(jade(options))
    .pipe(gulp.dest(build))
    .pipe(connect.reload());
});
And my jade file:
.mycontent
  #{content}
What am I missing?
Try changing #{content} to !{content} in your jade file. This tells jade not to escape any of the characters (which can be dangerous, depending on where the input comes from!).
See http://jade-lang.com/reference/interpolation/
Also, when you loop over each .content you overwrite result every time. You need to append to result if you want to aggregate all of the content together. Something like:

var result = { content: '' };
$('.content').each(function() {
  result.content += $(this).html();
});
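One more caveat: as written, loadContent kicks off an asynchronous request and returns nothing, so gulp-data receives undefined before the HTML ever arrives. gulp-data can wait on a returned promise, so a promise-returning variant (a sketch; the structure is assumed, not taken from the question) might look like:

var loadContent = function() {
  return new Promise(function(resolve, reject) {
    request({ uri: "http://www.mywebsite.com.br/test" }, function(error, response, body) {
      if (error) return reject(error);
      var $ = cheerio.load(body);
      var result = { content: '' };
      $('.content').each(function() {
        result.content += $(this).html(); // aggregate, as above
      });
      resolve(result); // gulp-data will wait for this promise
    });
  });
};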
