I looked at other questions regarding this topic but can't wrap my head around how to implement it in this case.
What I am trying to achieve:
Visit site and get content (body)
Visit matching test site and get content (body)
Compare content
Crawl links on page1
Crawl links on page2
Continue
The problem I am having at the moment is that I cannot compare the content because the requests are not waiting for each other.
Here's what my code looks like at the moment.
require('colors');
var request = require('request');
var cheerio = require('cheerio');
var jsdiff = require('diff');
var URL = require('url-parse');
var PROD_START_URL = "https://www.somesite.org";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var globalProdContent;
var globalTestContent;
var url = new URL(PROD_START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(PROD_START_URL);
crawl();
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the document body
var $ = cheerio.load(body);
globalProdContent = $("#wrapper").text();
// Build new URL for test site
var testURL = url.replace("https://www.somesite.org", "http://matching.testsite");
// Scrape test site
scrapeTestContent(testURL);
collectInternalLinks($);
callback();
});
}
function collectInternalLinks($) {
var relativeLinks = [];
relativeLinks = $("a[href]");
console.log("Found " + relativeLinks.length + " relative links on page");
relativeLinks.each(function() {
pagesToVisit.push(baseUrl + "/" + $(this).attr('href'));
});
}
function scrapeTestContent(testURL) {
console.log("Visiting matching testpage " + testURL);
request(testURL, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
var $ = cheerio.load(body);
globalTestContent = $("#wrapper").text();
console.log(globalTestContent);
});
}
Is there an easier way to do this or am I completely off the track?
This can be done in two ways:
1. Add callback to scrapeTestContent
function scrapeTestContent(testURL, cb) {
...
request(testURL, function(error, response, body) {
cb();
});
In visitPage,
function visitPage(url, callback) {
...
scrapeTestContent(testURL, () => collectInternalLinks($));
}
Using ES6 promises. In scrapeTestContent() return new Promise((resolve, reject) => {}. Then in visitPage, use following construct: scrapeTestContent(testUrl).then(() => collectInternalLinks($))
Related
I'm a beginner of nodejs, async bothers me.
I want my code run sequencely or it will breaks.
I have a for loop, and it simply doesn't work...
Here are all the codes:
const util = require('util');
const request = require('request');
const cheerio = require('cheerio');
var host = "http://www.nicotv.me";
var url = "http://www.nicotv.me/video/play/57838-1-%s.html";
var len = 99;
var tab = /-(\d)-/.exec(url);
tab = tab[1] // '1' not '-1-'
function getLen(url) {
//you can ignore this function, it gives len=2
request(url, function (err, response, html) {
if (err) {
console.log('url:', url);
console.log('error:', err);
console.log('statusCode:', response && response.statusCode);
}
else{
var $ = cheerio.load(html);
var cls = '.ff-playurl-dropdown-%s';
$(util.format(cls, tab)).filter(function (){
var data = $(this);
len = data.html().match(/<a href=/g).length;
console.log("episode:", len);
});
getLink(len, function(){
});
}
});
}
getLen(util.format(url, 1)); //len = 2
var getLink = function(lengths, callback){
for (let i = 1; i <= lengths; i++) {
var tmp = util.format(url, i);
try {
request(tmp, function (err, res, html){
console.log('url:', tmp);
if(err){
console.log("error:", err);
console.log("statusCode:", res && res.statusCode);
}else{
var reg = /src="(\/player.php?.{1,})"/;
var result = reg.exec(html);
console.log(result[1]);
}
});
callback();
} catch (error) {
console.log(error);
break;
}
}
}
here is my output:
episode: 2
url: http://www.nicotv.me/video/play/57838-1-2.html
/player.php?u=aHR0cDovL3R5angyLmtpbmdzbnVnLmNuLzM2MHl1bi0xNS5waHA/dmlkPTE1NzkxMzU2MzEyNDAwNTQ5&p=360biaofan&c=0&j=aHR0cDovL2ppZXhpLmtpbmdzbnVnLmNuLzM2MGJpYW9mYW4ucGhwP3VybD0=&x=10&y=&z=
url: http://www.nicotv.me/video/play/57838-1-2.html
/player.php?u=aHR0cDovL3R5angyLmtpbmdzbnVnLmNuLzM2MHl1bi0xNS5waHA/dmlkPTE1Nzg1MDQyMDYyNDAwNTgx&p=360biaofan&c=0&j=aHR0cDovL2ppZXhpLmtpbmdzbnVnLmNuLzM2MGJpYW9mYW4ucGhwP3VybD0=&x=10&y=&z=aHR0cDovL3R5angyLmtpbmdzbnVnLmNuLzM2MHl1bi0xNS5waHA/dmlkPTE1NzkxMzU2MzEyNDAwNTQ5
First problem is these two /player*** link are from 57838-1-1.html
And one of them are not complete.
Second problem is the url output shows 57838-1-2.html twice.
Thanks for your kindly help.
Yesterday had the same problem, so I solved with:
Using request-promise
Replace the loop method arrTitles.Each with for (const jt of arrTitles)
Here a sample:
const request = require('request-promise');
const cheerio = require('cheerio');
var getUrlData =
async function (url) {
console.log(url);
try {
return await request.get(url);
}
catch (err) {
console.error(`${err}: ${url}`);
}
return;
};
var run =
async function (pageUrl) {
var arrData =
await fn.getUrlData(pageUrl)
.then(response => readTable(response));
console.log(arrData);
};
var readTable =
function (document) {
var $;
let arrData = [];
try {
$ = cheerio.load(document);
$('table tr')
.each(
function (trN) {
$(this)
.children('td')
.each(
function (tdN) {
arrData.push($(this).text().trim());
}
)
});
}
catch { }
return arrData;
};
run();
Its probably a stupid mistake, but its taking me too long to find the answer
When i run this simple program (node index.js) i get an error:
SyntaxError: missing ) after argument list (line 55)
You can see sublimelinter found nothing either
I've tried to delete and install all packages again
I've triple checked everything
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var readLine = require('readline');
var URL_SEED= "https://g1.globo.com/";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = []; //array, {objeto}
var numPagesVisited = 0;
var pagesToVisit = [];
var allAbsoluteLinks = [];
var url = new URL(URL_SEED); //pesq
var urlBase = url.protocol + "//" + url.hostname; //pesq
pagesToVisit.push(URL_SEED);
crawl();
//pegar href vs a
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Limites de páginas que posso visitar atingido (100)");
return;
}
var nextPage = pagesToVisit.pop(); //peq
if (nextPage in pagesVisited) { //peq
// pagina ja visitada
crawl();
} else {
// pagina nao visitada
visitaPagina(nextPage, crawl);
}
}
function visitaPagina(url, callback) {
// Add page to our set
pagesVisiteu[url] = true;
numPagesVisited++;
console.log("Visitando a página " + url);
// Faz requisicao
request(url, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
} else {
// Parse do body
var $ = cheerio.load(body);
coletaLinks($);
}
}
}
function coletaLinks($) {
//var linksRelativos = $("a\[href^='/'\]"); //NAO TA PEGANDO DO JEITO QUE
var linksRelativos = $("a\[href^='/'\]");
console.log("Achei " + linksRelativos.length + " links relativos nessa página");
linksRelativos.each(function() {
pagesToVisit.push(baseUrl + $(this).attr('href'));
});
console.log("\n\nPAGES TO VISIT = " + pagesToVisit + "\n\n");
var key = [];
for (key in linksRelativos) {
if(linksRelativos.hasOwnProperty(key)) { //realmente preciso ver essa property?
console.log(key, linksRelativos\[key\].attribs.href);
}
}
fs.writeFile('relativos.txt', linksRelativos, function(err) {
if(err) {
return console.log(err);
//throw err;
}
});
}
I'm not sure why your linter isn't complaining, but you are missing a closing bracket in the request call inside visitaPagina. Second line from the bottom:
function visitaPagina(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
console.log("Visitando a página " + url);
// Faz requisicao
request(url, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
} else {
// Parse do body
var $ = cheerio.load(body);
coletaLinks($);
}
} // should be })
}
Also, something went wrong with copying the code on SO, you might want to check what's up with that for any future submissions. In this case the screenshots helped, but you should also know that images of code are generally frowned upon on SO.
Good luck!
I'm trying to scrape a web page, place all the URLS in an array and then scrape the next page in the array. But it's just looping the firs URL rather than following the next URL in the array. How do I change it so it scrapes each page?
Thanks for your help.
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
var fs = require('fs');
var i = 0;
var array = [];
var q = async.queue(function (task, done) {
request(task.url, function(err, res, body) {
if (err) return done(err);
if (res.statusCode != 200) return done(res.statusCode);
var $ = cheerio.load(body);
links = $('a');
$(links).each(function(i, link){
var href = $(link).attr('href');
array.push(href);
console.log(array);
});
done();
i++
q.push({ url: array[i] });
});
}, 5);
q.push({ url: 'http://www.hobo-web.co.uk/' });
It appears done() returns the function, which means the i++ never happens. Move the i++ and q.push({ url: array[i] }); to the next code block. That should solve your problem.
I have this code and I'm not able to create a new array for further use:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
});
for (var i=0; i<2; i++){
console.log(pag[i]);
}
When I run the code it is listing undefined. But if I put the code like this:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
for (var i=0; i<2; i++){
console.log(pag[i]);
}
});
Then it is displaying correct result but still undefined when i'd like to use it later.
Can someone help me up with this.
Node.js is async, that means the scrape hasn't finished yet when you go to print out the array.
I'm not totally sure what your end goal is, but here is a way to do what you are trying with minimal changes:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
var scrape = function( callback ) {
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
if (callback) callback()
});
}
scrape(function() {
for (var i=0; i<2; i++){
console.log(pag[i]);}
})
Catalyst is right, the problem is that you are not waiting for the async request call to complete. Here is my solution:
function getLinks(callback){
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
callback(new Error('upload failed:', error),null);
}
var pag = [];
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag.push(sop); //aici pun val gasite in locuri in array
});
callback(null, pag);
});
}
getLinks(function(err,links){
if(err) return console.log(err);
console.log(links.join(','));
})
here I am defining a functions that makes the request call and it accepts a callback in the standard node callback convention on putting the error message as the first parameter and the results as the second parameter. Then calling that method with a callback that will print the results.
I have the code below and am trying to access the all_records array once the _.each function has completed. However as it is asynchronous I was wondering if was possible to force a callback onto the underscores each?
var request = require('request'),
cheerio = require('cheerio'),
_ = require('underscore');
var all_records = [];
_.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
});
});
});
// Need to run this once _.each has completed final iteration.
console.log(all_records);
Here is a simple solution using a simple synchronization method:
var count = 101;//there are 101 numbers between 0 and 100 including 0 and 100
_.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
count--;
if(count===0){//101 iterations done
console.log(all_records);
}
});
});
});
A more elegant solution can be accomplied by using async's .parallel method.
var requests = []; //an array for all the requests we will be making
for(var i=0;i<=100;i++){
requests.push((function(done){ //create all the requests
//here you put the code for a single request.
//After the push to all_records you make a single done() call
//to let async know the function completed
}).bind(null,i));//the bind is that so each function gets its own value of i
}
async.parallel(requests,function(){
console.log(all_records);
});
async.each ended up being the easiest to implement.
async.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
});
});
}, function(err){
console.log(all_records);
});