I have seen many web scraping tutorials, but I can't find a pattern that scrapes a site with subpages.
Here is the sequence:
1. Scrape the first page to find several URLs.
2. Go to each URL and find several more URLs.
3. Go to another layer of URLs and read the content from the table.
I can find many tutorials that teach how to do step 1, but further down I can't find any good examples. In addition, I tried X-ray, but it doesn't work well because my URL is part of the parent item.
Here is some sample code:
var request = require('request');
var cheerio = require('cheerio');
var url = 'https://news.ycombinator.com';

request(url, function (error, response, html) {
  if (!error && response.statusCode == 200) {
    var $ = cheerio.load(html);
    $('span.comhead').each(function(i, element){
      // Obtain the URL of the news
      var a = $(this).prev();
      var subUrl = a.attr('href');
      // Go to that news and obtain the title
      request(subUrl, function (error, response, html) {
        if (!error && response.statusCode == 200) {
          var $ = cheerio.load(html);
          var title = $("title").text();
          console.log(title);
        }
      });
    });
  }
});
But cheerio's each seems to run sequentially. Is there any way I can scrape the pages in parallel?
Thanks for the help
You can do this easily with x-ray. See the code below for an example:
var Xray = require('x-ray');
var x = Xray();

var baseUrl = 'https://news.ycombinator.com'; // set base url

x(baseUrl, {                                  // scrape base url
  title1: x('a', [{ links1: '@href' }])       // store links in an array of objects
})(function (err, obj1) {                     // pass the result to the next step
  obj1.title1.forEach(function (item) {
    // assuming item.links1 stores '/sample-link-to-crawl-83792',
    x(baseUrl + item.links1, {                // append base url to link and crawl
      title2: x('a', [{ links2: '@href' }])
    })(function (err, obj2) {
      obj2.title2.forEach(function (item2) {  // for each link in obj2
        console.log(item2.links2);            // should print the link to the console
      });
    });
  });
});
You can continue like this, or simply create a function that returns a promise and pass the scraped URLs to it at any time. Then you wait for the promise to resolve and do what you want with the returned data.
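For instance, here is a minimal sketch of that idea (the scrapeLinks helper below is illustrative, not part of x-ray's API): wrap an x-ray scrape in a promise-returning function, then use Promise.all to wait for several scrapes at once.

var Xray = require('x-ray');
var x = Xray();

// Sketch: a promise-returning wrapper around an x-ray scrape
function scrapeLinks(url) {
  return new Promise(function (resolve, reject) {
    x(url, { links: x('a', [{ href: '@href' }]) })(function (err, obj) {
      if (err) return reject(err);
      resolve(obj.links); // an array of { href: '...' } objects
    });
  });
}

// Usage: scrape several pages in parallel and handle all the results together
var urls = ['https://news.ycombinator.com'];
Promise.all(urls.map(scrapeLinks))
  .then(function (results) { console.log(results); })
  .catch(function (err) { console.error(err); });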
Related
I am trying to scrape three levels of a webpage that link to each other, e.g. Home -> Jobs -> Open Positions. I then want to write the scraped data into an output.json file. The scraping works just fine, but the writing of the file finishes before the requests do, because of their asynchronous nature.
The code below, using plain request, scrapes all the data, but it is too "late", and thus the info does not get written into the file.
request(url, function(error, response, html){
  var $ = cheerio.load(html);
  $("tr").each(function(i, elem){
    var club_url = $(this).children().first().children().attr("href");
    club_url = url.substring(0,25) + club_url;
    request(club_url, function(error, response, html){
      if(!error){
        var $ = cheerio.load(html);
        var club_name = $("h1.masthead-title").first().text().trim();
        console.log(club_name);
        clubs[i] = club_name;
        var teams = {};
        $("tr").each(function(i, elem){
          var team_url = $(this).children().first().children().attr("href");
          team_url = url.substring(0,25) + team_url;
          request(team_url, function(error, response, html){
            if(!error){
              var $ = cheerio.load(html);
              var team = $(".team-name").text().trim();
              console.log(team);
              teams[i] = team;
            }
          });
        });
      }
    });
  });
});

fs.writeFile('output.json', JSON.stringify(clubs, null, 4), function(err){
  console.log('File successfully written! - Check your project directory for the output.json file');
});
Therefore I tried to use request-promise and rewrite the code with it, so the writing would be executed after the request promises are resolved.
app.get('/scrape', function(req, res){
  var clubs = {};
  url = 'https://norcalpremier.com/clubs/';
  var options = {
    uri: 'https://norcalpremier.com/clubs/',
    transform: function (body) {
      return cheerio.load(body);
    }
  };
  rp(options).then(($) => {
    var ps = [];
    $("tbody tr").each(function(i, elem){
      var club_url = $(this).children().first().children().attr("href");
      club_url = url.substring(0,25) + club_url;
      console.log(club_url);
      var club_options = {
        uri: club_url,
        transform: function (body) {
          return cheerio.load(body);
        }
      };
      ps.push(rp(club_options));
    });
    Promise.all(ps).then((results) => {
      results.forEach((club) => {
        var $ = cheerio.load(club);
        var club_name = $("h1.masthead-title").first().text().trim();
        console.log(club_name);
        clubs[i] = club_name;
      })
    }).then(() => {
      fs.writeFile('output.json', JSON.stringify(clubs, null, 4), function(err){
        console.log('File successfully written! - Check your project directory for the output.json file');
      });
      res.send('Scraping is done, check the output.json file!');
    }).catch(err => console.log(err));
  })
})
However, I just can't get it to work: I get a bad gateway error, oddly after the console logs that the file was written. So I assume that the scraping is not working now, and that it is not waiting for the requests to finish either.
Note: the third request is cut from this version, because I need to get the second level running first.
What I want to achieve is to get information from each of the sites on levels 2 and 3, basically the name, put it into a JSON object, and then write that object to a file. As said previously, the scraping of the relevant data on levels 2 and 3 worked in the former version, but not the writing to the file.
Thanks, your help is so much appreciated!
Here's what I would do: make the function async and then do:
url = 'https://norcalpremier.com/clubs/'
// put the request code in a function so we don't repeat it
let $ = await get(url)
// get the club urls
let club_urls = $('td:nth-child(1) a[href*="/club/"]').map((i, a) => new URL($(a).attr('href'), url).href).get()
// await the responses. I used slice because I think this much concurrency will cause problems
let resolved = await Promise.all(club_urls.slice(0,2).map(club_url => get(club_url)))
// get the club names
let club_names = resolved.map($ => $("h1.masthead-title").first().text().trim())
// write the file, I think synchronously is a good idea here.
fs.writeFileSync('output.json', JSON.stringify(club_names))
I'll let you figure out the get function, since I don't like to use request-promise.
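For reference, one possible shape for that get function (a sketch only, not the answerer's code): a small promise wrapper around request that resolves with a loaded cheerio instance, so the snippet above can await it.

const request = require('request');
const cheerio = require('cheerio');

// Sketch: resolve with a cheerio instance so callers can do `let $ = await get(url)`
function get(url) {
  return new Promise((resolve, reject) => {
    request(url, (error, response, html) => {
      if (error) return reject(error);
      if (response.statusCode !== 200) {
        return reject(new Error('Request failed with status ' + response.statusCode));
      }
      resolve(cheerio.load(html));
    });
  });
}

Keep in mind that the awaits in the snippet above have to live inside an async function, as the answer says.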
I've written a script in Node using two different functions, getPosts() and getContent(), supplying callbacks within them in order to print the result by calling a standalone function, getResult(). The selectors defined within my script are flawless.
However, when I execute my script, it prints nothing. It doesn't throw any error either. I tried to mimic the logic provided by Neil in this post.
How can I make it work?
This is what I've written so far:
var request = require('request');
var cheerio = require('cheerio');

const url = 'https://stackoverflow.com/questions/tagged/web-scraping';

function getPosts(callback){
  request(url, function (error, response, html) {
    if (!error && response.statusCode == 200){
      var $ = cheerio.load(html);
      $('.summary .question-hyperlink').each(function(){
        var items = $(this).text();
        var links = $(this).attr("href");
        callback(items, links);
      });
    }
  });
}

function getContent(item, link, callback){
  request(link, function (error, response, html) {
    if (!error && response.statusCode == 200){
      var $ = cheerio.load(html);
      var proLink = $('.user-details > a').eq(0).attr("href");
      callback({item, link, proLink});
    }
  });
}

function getResult() {
  getPosts(function(item, link) {
    getContent(item, link, function(output){
      console.log(output);
    });
  });
}

getResult();
The link value that you receive from getPosts is a relative link, which means that the request fails. You can extract the hostname into its own variable and create the full URL from the hostname plus the relative link.
const host = 'https://stackoverflow.com';
const url = '/questions/tagged/web-scraping';
// ...
function getContent(item, link, callback){
  // Here we use the absolute URL
  request(host + link, function (error, response, html) {
    if (!error && response.statusCode == 200){
      var $ = cheerio.load(html);
      var proLink = $('.user-details > a').eq(0).attr("href");
      callback({item, link, proLink});
    }
  });
}
I want to use a crawler in Node.js to crawl all the links in a website (internal links) and get the title of each page. I saw the crawler plugin on npm; if I check the docs, there is the following example:
var Crawler = require("crawler");
var c = new Crawler({
maxConnections : 10,
// This will be called for each crawled page
callback : function (error, res, done) {
if(error){
console.log(error);
}else{
var $ = res.$;
// $ is Cheerio by default
//a lean implementation of core jQuery designed specifically for the server
console.log($("title").text());
}
done();
}
});
// Queue just one URL, with default callback
c.queue('http://balenol.com');
But what I really want is to crawl all the internal URLs of the site. Is that built into this plugin, or does it need to be written separately? I don't see any option in the plugin to visit all the links in a site; is this possible?
The following snippet crawls all the URLs it finds in every page it visits.
const Crawler = require("crawler");
let obselete = []; // Array of what was crawled already
let c = new Crawler();
function crawlAllUrls(url) {
console.log(`Crawling ${url}`);
c.queue({
uri: url,
callback: function (err, res, done) {
if (err) throw err;
let $ = res.$;
try {
let urls = $("a");
Object.keys(urls).forEach((item) => {
if (urls[item].type === 'tag') {
let href = urls[item].attribs.href;
if (href && !obselete.includes(href)) {
href = href.trim();
obselete.push(href);
// Slow down the
setTimeout(function() {
href.startsWith('http') ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`) // The latter might need extra code to test if its the same site and it is a full domain with no URI
}, 5000)
}
}
});
} catch (e) {
console.error(`Encountered an error crawling ${url}. Aborting crawl.`);
done()
}
done();
}
})
}
crawlAllUrls('https://github.com/evyatarmeged/');
In the above code, just change the following to restrict the crawl to the internal links of a website...
from
href.startsWith('http') ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`)
to
href.startsWith(url) ? crawlAllUrls(href) : crawlAllUrls(`${url}${href}`)
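If you need a stricter same-site check than startsWith, a possible refinement (a sketch using Node's built-in URL class; the isInternal name is made up here) is to resolve each href against the page URL and compare hosts before queueing it:

// Sketch: treat a link as internal only if it resolves to the same host as the page
function isInternal(href, pageUrl) {
  try {
    const resolved = new URL(href, pageUrl);        // also resolves relative hrefs
    return resolved.host === new URL(pageUrl).host; // same-site check
  } catch (e) {
    return false; // the href could not be parsed as a URL
  }
}

new URL(href, pageUrl) also takes care of the relative-versus-absolute distinction that the startsWith check approximates.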
I have been trying to figure out the following for the last couple of days and just can't seem to find the answer. I am new to Node and JS (my only experience is online tutorials).
I am trying to create a class (function) to scrape the source code from websites. I want to read in a URL from the command line and return the HTML content. However, I seem to be getting different results when running the code in different ways (when I think I should be getting the same results).
I have been reading about events in Node, so I have used them a little in the code. One listener prompts me for the URL and then, after setting the URL, it (the listener function) emits a message, which is picked up by another listener that goes out and fetches the HTML content.
The problem I am having is that when I create an instance of the object, it seems like the request portion of the code does not execute. However, if I call the method on the instance, I get a printout of the HTML content of the page.
Any help is appreciated. Thanks.
function test() {
  var events = require('events').EventEmitter;
  var request = require('request');
  var util = require('util');

  var that = this;

  that.eventEmitter = new events();
  that.url = 'http://www.imdb.com/';

  that.eventEmitter.on('setURL', that.setUrl = function(){
    console.log("Input the URL: ");
    process.stdin.resume();
    process.stdin.setEncoding('utf8');
    process.stdin.on('data', function (text) {
      that.url = util.inspect(text);
      that.url = that.url.substr(1, that.url.length - 4);
      that.eventEmitter.emit('Get url html');
      process.exit();
    });
  });

  that.eventEmitter.on('Get url html', that.httpGet = function() {
    console.log("Fetching... " + that.url);
    request(that.url, function (error, response, body) {
      if (!error && response.statusCode == 200) {
        console.log(body); // Show the HTML for the page
      } else {
        console.log("Error Encountered");
      }
    });
  });

  that.eventEmitter.emit('setURL');
}

var scrapper = new test(); // This asks me for the URL and then only executes the first line of that.httpGet.
scrapper.httpGet();        // This gives the desired results from that.httpGet
I solved it using the prompt library: https://www.npmjs.com/package/prompt
function test() {
  var events = require('events').EventEmitter;
  var prompt = require('prompt');
  var request = require('request');
  var util = require('util');

  var that = this;

  that.eventEmitter = new events();
  that.url = 'http://www.imdb.com/';

  that.eventEmitter.on('setURL', that.setUrl = function(){
    prompt.start();
    process.stdin.setEncoding('utf8');
    prompt.get(['url'], function( err, result ) {
      that.url = result.url;
      that.eventEmitter.emit('Get url html');
    });
  });

  that.eventEmitter.on('Get url html', that.httpGet = function() {
    console.log("Fetching... " + that.url);
    request(that.url, function (error, response, body) {
      if (!error && response.statusCode == 200) {
        console.log(body); // Show the HTML for the page
      } else {
        console.log("Error Encountered");
      }
    });
  });

  that.eventEmitter.emit('setURL');
}

var scrapper = new test(); // This asks for the URL and then runs that.httpGet once the URL is entered.
// scrapper.httpGet(); // This extra call is no longer needed
I ran the script from the command line, entered http://www.google.com, and it retrieved the results without the additional call to scrapper.httpGet().
I have the code below and it works fine to get:
<troveUrl>http://trove.nla.gov.au/work/23043869</troveUrl>
But I would like to get the value of the 'id' attribute in the following element from the same page, and I cannot get it!
<work id="23043869" url="/work/23043869">
Here is the code that I currently have:
var request = require('request'),
    cheerio = require('cheerio');
request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('troveurl').each(function(i, element){
var id = $(this);
console.log(id.text());
});
}
});
Any assistance appreciated.
You should pass xmlMode: true in the options object; then you can parse the response as XML.
You can then grab the attribute with $('tag').attr('attribute'), and $('tag').text() gets the data between the tags, as you've done.
var request = require('request'),
cheerio = require('cheerio');
request('http://api.trove.nla.gov.au/result?key=6k6oagt6ott4ohno&zone=book&q-year1-date=2000&l-advformat=Thesis&l-australian=y&q-term2=&q-term3=&q-term0=&q-field1=title%3A&q-type2=all&q-field0=&q-term1=&q-type3=all&q-field3=subject%3A&q-type0=all&q-field2=creator%3A&q-type1=all&l-availability=y%2Ff&q=+date%3A[2000+TO+2014]&q-year2-date=2014&n=1', function(error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html, {
xmlMode: true
});
console.log($('work').attr('id'))
}
});
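For completeness, a small sketch (reusing the same html variable from the response) that reads both the element text and the id attribute in one pass with xmlMode enabled:

var $ = cheerio.load(html, { xmlMode: true });
console.log($('troveUrl').text());  // http://trove.nla.gov.au/work/23043869
console.log($('work').attr('id'));  // 23043869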
The real issue lies in the syntax you used to get the value of 'id'. The following code will not log the id to the console:
var id = $(this);
console.log(id.text());
The correct syntax is $('your element').attr('id'), as mentioned in Ben Fortune's answer above. However, passing xmlMode: true in the options object is not a necessity.
Passing xmlMode: false in the options will also work if you are using the correct syntax.
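For example, a minimal sketch with cheerio's default (non-XML) parsing:

var $ = cheerio.load(html); // xmlMode defaults to false
$('work').each(function () {
  console.log($(this).attr('id')); // 23043869
});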