Stuck with looping in Request + Cheerio (NODEJS) - node.js

I'm trying to scrape a web page, place all the URLS in an array and then scrape the next page in the array. But it's just looping the firs URL rather than following the next URL in the array. How do I change it so it scrapes each page?
Thanks for your help.
var request = require('request');
var cheerio = require('cheerio');
var async = require('async');
var fs = require('fs');
var i = 0;
var array = [];
var q = async.queue(function (task, done) {
request(task.url, function(err, res, body) {
if (err) return done(err);
if (res.statusCode != 200) return done(res.statusCode);
var $ = cheerio.load(body);
links = $('a');
$(links).each(function(i, link){
var href = $(link).attr('href');
array.push(href);
console.log(array);
});
done();
i++
q.push({ url: array[i] });
});
}, 5);
q.push({ url: 'http://www.hobo-web.co.uk/' });

It appears done() returns the function, which means the i++ never happens. Move the i++ and q.push({ url: array[i] }); to the next code block. That should solve your problem.

Related

Nodejs: Comparing results of two async requests

I looked at other questions regarding this topic but can't wrap my head around how to implement it in this case.
What I am trying to achieve:
Visit site and get content (body)
Visit matching test site and get content (body)
Compare content
Crawl links on page1
Crawl links on page2
Continue
The problem I am having at the moment is that I cannot compare the content because the requests are not waiting for each other.
Here's what my code looks like at the moment.
require('colors');
var request = require('request');
var cheerio = require('cheerio');
var jsdiff = require('diff');
var URL = require('url-parse');
var PROD_START_URL = "https://www.somesite.org";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var globalProdContent;
var globalTestContent;
var url = new URL(PROD_START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(PROD_START_URL);
crawl();
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the document body
var $ = cheerio.load(body);
globalProdContent = $("#wrapper").text();
// Build new URL for test site
var testURL = url.replace("https://www.somesite.org", "http://matching.testsite");
// Scrape test site
scrapeTestContent(testURL);
collectInternalLinks($);
callback();
});
}
function collectInternalLinks($) {
var relativeLinks = [];
relativeLinks = $("a[href]");
console.log("Found " + relativeLinks.length + " relative links on page");
relativeLinks.each(function() {
pagesToVisit.push(baseUrl + "/" + $(this).attr('href'));
});
}
function scrapeTestContent(testURL) {
console.log("Visiting matching testpage " + testURL);
request(testURL, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
var $ = cheerio.load(body);
globalTestContent = $("#wrapper").text();
console.log(globalTestContent);
});
}
Is there an easier way to do this or am I completely off the track?
This can be done in two ways:
1. Add callback to scrapeTestContent
function scrapeTestContent(testURL, cb) {
...
request(testURL, function(error, response, body) {
cb();
});
In visitPage,
function visitPage(url, callback) {
...
scrapeTestContent(testURL, () => collectInternalLinks($));
}
Using ES6 promises. In scrapeTestContent() return new Promise((resolve, reject) => {}. Then in visitPage, use following construct: scrapeTestContent(testUrl).then(() => collectInternalLinks($))

Write array object to JSON in node.js

I am trying to write some items I pushed into an array into a JSON file in node.js but I can't figure out how to wait for the array to contain the items before writing the JSON file. As a result the file is always empty. Do i need to have a callback? If so, how? NB:I'm still new to node.js
This is the code below:
var getLinks = require('./news_archive/news_links.js');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var saveNews = './news_archive/news.json';
var jsonObj = [];
var i;
var number_of_links = getLinks.links.length;
for(i=0; i<number_of_links; i++){
//GET ARTICLE LINK FROM link.js
var url = "http://www.times.co.sz/"+getLinks.links[i];
request(url, function(err, resp, body){
var $ = cheerio.load(body);
//GET ARTICLE HEADLINE
var storyHeadline = $('#article_holder h1');
var storyHeadlineText = storyHeadline.text();
//GET DATE POSTED
var datePosted = $('.metadata_time');
var datePostedText = datePosted.text();
//GET ARTICLE REPORTER'S NAME
var reporterName = $('.article_metadata a');
var reporterNameText = reporterName.text();
//GET ARTICLE SUMMARY
var fullStory = $('#article_body span');
var fullStoryText = fullStory.text();
//PUSH ITEMS TO jsonObj ARRAY
jsonObj.push({
id: i,
storyHeadline: storyHeadlineText,
datePosted: datePostedText,
reporterName: reporterNameText,
fullStory: fullStoryText
})
});
} //END for LOOP
//WRITE TO news.json file
fs.writeFile(saveNews, JSON.stringify(jsonObj, null, 4), function(err) {
if(err) {
console.log(err);
} else {
console.log("JSON saved to " + saveNews);
}
});
The issue is that request is asyncronous and you cannot use syncronous loop to iterate through. You can use async lib for that
var getLinks = require('./news_archive/news_links.js');
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var saveNews = './news_archive/news.json';
var number_of_links = getLinks.links.length;
var async = require('async');
async.times(number_of_links, function (i, next) {
var url = "http://www.times.co.sz/"+getLinks.links[i];
request(url, function(err, resp, body){
var $ = cheerio.load(body);
//GET ARTICLE HEADLINE
var storyHeadline = $('#article_holder h1');
var storyHeadlineText = storyHeadline.text();
//GET DATE POSTED
var datePosted = $('.metadata_time');
var datePostedText = datePosted.text();
//GET ARTICLE REPORTER'S NAME
var reporterName = $('.article_metadata a');
var reporterNameText = reporterName.text();
//GET ARTICLE SUMMARY
var fullStory = $('#article_body span');
var fullStoryText = fullStory.text();
//PUSH ITEMS TO jsonObj ARRAY
next(err, {
id: i,
storyHeadline: storyHeadlineText,
datePosted: datePostedText,
reporterName: reporterNameText,
fullStory: fullStoryText
});
});
}, function (err, res) {
// do not forget to handle error
fs.writeFile(saveNews, JSON.stringify(res, null, 4), function(err) {
if(err) {
console.log(err);
} else {
console.log("JSON saved to " + saveNews);
}
});
})

Need to send response after an action has completed

I am trying to make a web crawler which crawls IMDB and lists the movie name and rating. This is my index.js file.
Suppose i am crawling for 10 movies. I am then saving the crawled results in a different file say 'message.txt'. Now i want to send this message.txt file as a response to any request. But whenever I make a request it always send me an empty file to my browser initially. Then i notice that it takes some time before the crawled results are saved in the message.txt file. I think this is because all actions are asynchronous in nodejs. So is there a way to send the message.txt file only after crawling is complete?
var express = require('express');
var app = express();
var cheerio = require('cheerio');
var request = require('request');
var fs = require('fs');
app.listen(8080);
console.log('Running');
app.get('/', function(req, res) {
console.log('Recieved the get Request');
var i = 1;
var count = 0;
while (count < 10) {
var url = 'http://www.imdb.com/title/tt' + i + '/';
console.log(url);
count = count + 1;
i = i + 1;
request(url, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html);
var title, ratings, released;
var json = {
title: '',
ratings: '',
released: ''
};
$('.title_wrapper').filter(function() {
var data = $(this);
json.title = data.children().first().text().trim();
json.released = data.children().last().children().last().text().trim();
});
$('.ratingValue').filter(function() {
var data = $(this);
json.ratings = parseFloat(data.text().trim());
});
console.log(json);
fs.appendFile('message.txt', JSON.stringify(json, null, 4) + '\n', function(err) {});
};
});
};
res.sendFile(__dirname + '/index.js');
});
You can use the async package which is great for controlling flow, something like:
console.log('Recieved the get Request');
var i = 1;
var count = 0;
while (count < 10) {
var url = 'http://www.imdb.com/title/tt' + i + '/';
console.log(url);
count = count + 1;
i = i + 1;
async.waterfall([
function sendRequest (callback) {
if (!error) {
var $ = cheero.load(html);
var json = {
title: '',
ratings: '',
released: ''
}
}
$('.title_wrapper').filter(function() {
var data = $(this);
json.title = data.children().first().text().trim();
json.released = data.children().last().children().last().text().trim();
});
$('.ratingValue').filter(function() {
var data = $(this);
json.ratings = parseFloat(data.text().trim());
});
callback(null, JSON.stringify(json, null, 4) + '\n');
},
function appendFile (json, callback) {
fs.appendFile('message.txt', json, function(err) {
if (err) { callback(err); }
callback();
});
}
], function(err) {
res.sendFile(__dirname + '/index.js');
});
fs.appendFile('message.txt', JSON.stringify(json, null, 4) + '\n', function(err) {
//This part is executed after the process has been completed
});
You have to make a callback there as that part will be only called when your operation has been performed.
We are utilizing the callback feature here although there isn't any concrete callback except the err in our case, we don't need any other badly though.
Please try.
fs.appendFile() is asynchronous so the stuff you append to the file won't be there right away when the function returns. So if you want to read send that file to the user, you'll need to do it inside the callback you supply to fs.appendFile().
app.get('/', function(req, res) {
...
fs.appendFile(
'message.txt',
JSON.stringify(json, null, 4) + '\n',
function(err) {
if (err) {
// Log the error and send a message to the user here
return;
}
res.sendFile(__dirname + '/index.js')
}
);
};
});
};
});
You may be tempted to use fs.appendFileSync() instead. That would be fine for a command line tool, but since this is a web server, do not do that. It will lock up the thread while the I/O happens.

Save array from cheerio in node.js

I have this code and I'm not able to create a new array for further use:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
});
for (var i=0; i<2; i++){
console.log(pag[i]);
}
When I run the code it is listing undefined. But if I put the code like this:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
for (var i=0; i<2; i++){
console.log(pag[i]);
}
});
Then it is displaying correct result but still undefined when i'd like to use it later.
Can someone help me up with this.
Node.js is async, that means the scrape hasn't finished yet when you go to print out the array.
I'm not totally sure what your end goal is, but here is a way to do what you are trying with minimal changes:
var request = require("request");
var cheerio = require("cheerio");
var pag = [];
var scrape = function( callback ) {
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
return console.error('upload failed:', error);
}
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag[i] = sop; //aici pun val gasite in locuri in array
});
pag.push(', ');
if (callback) callback()
});
}
scrape(function() {
for (var i=0; i<2; i++){
console.log(pag[i]);}
})
Catalyst is right, the problem is that you are not waiting for the async request call to complete. Here is my solution:
function getLinks(callback){
request('http://www.tastez.ro/tv.php?query=sopcast', function(error, response, body) {
if (error) {
callback(new Error('upload failed:', error),null);
}
var pag = [];
var $ = cheerio.load(body);
links = $(".page a"); //use your CSS selector here
$(links).each(function(i, link){
var sop = $(this).attr('href');
pag.push(sop); //aici pun val gasite in locuri in array
});
callback(null, pag);
});
}
getLinks(function(err,links){
if(err) return console.log(err);
console.log(links.join(','));
})
here I am defining a functions that makes the request call and it accepts a callback in the standard node callback convention on putting the error message as the first parameter and the results as the second parameter. Then calling that method with a callback that will print the results.

Dealing with asynchronous functions. Custom callback?

I have the code below and am trying to access the all_records array once the _.each function has completed. However as it is asynchronous I was wondering if was possible to force a callback onto the underscores each?
var request = require('request'),
cheerio = require('cheerio'),
_ = require('underscore');
var all_records = [];
_.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
});
});
});
// Need to run this once _.each has completed final iteration.
console.log(all_records);
Here is a simple solution using a simple synchronization method:
var count = 101;//there are 101 numbers between 0 and 100 including 0 and 100
_.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
count--;
if(count===0){//101 iterations done
console.log(all_records);
}
});
});
});
A more elegant solution can be accomplied by using async's .parallel method.
var requests = []; //an array for all the requests we will be making
for(var i=0;i<=100;i++){
requests.push((function(done){ //create all the requests
//here you put the code for a single request.
//After the push to all_records you make a single done() call
//to let async know the function completed
}).bind(null,i));//the bind is that so each function gets its own value of i
}
async.parallel(requests,function(){
console.log(all_records);
});
async.each ended up being the easiest to implement.
async.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
});
});
}, function(err){
console.log(all_records);
});

Resources