Scrape paginated pages using Node.js and Cheerio

How can I scrape data from paginated pages?
My code works well for a single page, but I need to scrape the data from page 2, page 3, ... and push it all into an ebooks array.
Here is my code
const fetch = require('node-fetch');
const cheerio = require('cheerio');

function searchEbooks(query) {
  return fetch(getUrl(1, query))
    .then(res => res.text())
    .then(body => {
      const $ = cheerio.load(body);
      const ebooks = [];
      $('article').each(function(i, element) {
        const $element = $(element);
        const $title = $element.find('.entry-title a');
        const $image = $element.find('.attachment-post-thumbnail');
        const $description = $element.find('.entry-summary');
        const authors = [];
        $element.find('.entry-author a').each(function(j, author) {
          authors.push($(author).text());
        });
        const ebook = {
          image: $image.attr('src'),
          title: $title.text(),
          description: $description.text(),
          authors: authors,
        };
        ebooks.push(ebook);
      });
      return ebooks;
    });
}
I have no idea how to do this. Please give me a hint or an example.
I'm using the cheerio and node-fetch packages.
Thank you.

Try this to get the URL of the next page:
var href = $('.current+a').attr('href');
if (href) {
  // you can fetch this URL to scrape the next page
} else {
  console.log('You got all the pages');
}
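Putting the two together, here is a rough sketch (untested) that assumes your existing getUrl helper and that '.current+a' selects the next-page link; it keeps fetching until there is no next link and collects everything into a single ebooks array:

const fetch = require('node-fetch');
const cheerio = require('cheerio');

async function searchEbooks(query) {
  const ebooks = [];
  let url = getUrl(1, query); // getUrl is your existing helper
  while (url) {
    const res = await fetch(url);
    const $ = cheerio.load(await res.text());

    $('article').each(function (i, element) {
      const $element = $(element);
      ebooks.push({
        image: $element.find('.attachment-post-thumbnail').attr('src'),
        title: $element.find('.entry-title a').text(),
        description: $element.find('.entry-summary').text(),
        authors: $element.find('.entry-author a').map((j, a) => $(a).text()).get(),
      });
    });

    // '.current+a' is the link right after the highlighted page number;
    // when it is missing we are on the last page.
    url = $('.current+a').attr('href') || null;
  }
  return ebooks;
}

If the site uses relative hrefs for its pagination links, resolve them against the base URL first (for example with new URL(href, url).href) before fetching.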

Related

How to define the end of a function when I rerun the same function?

I am using the following function to crawl entire websites, extracting all links from each page using cheerio.
const crawlUrl = async (urlCrawl) => {
  try {
    // Check if url was already crawled
    if (urlsCrawleadas[urlCrawl]) return;
    urlsCrawleadas[urlCrawl] = true;
    console.log('Crawling', urlCrawl);
    const response = await fetch(urlCrawl);
    const html = await response.text();
    // Extract links from every page
    const $ = cheerio.load(html);
    const links = $("a").map((i, link) => link.attribs.href).get();
    const { host } = urlParser.parse(urlCrawl);
    // Filter links from same website, and external sites
    const newLinks = links
      .map(link => {
        if (!link.includes(host.hostname) && link.includes('http://') || !link.includes(host.hostname) && link.includes('https://')) {
          const nueva = urlParser.parse(link);
          othersUrls.push(nueva.host.hostname);
        }
        return link;
      })
      .filter(link => link.includes(host.hostname))
      .map(async link => await crawlUrl(link));
    // remove duplicate urls
    const test = [...othersUrls.reduce((map, obj) => map.set(obj, obj), new Map()).values()];
    // export urls
    let writer = FS.createWriteStream('data.txt');
    writer.write(test.toString());
    return 'End';
  } catch (error) {
    console.log('function crawler', error);
  }
};
The problem is that I cannot tell when the crawler has finished going through all the URLs, so I have no way of telling the frontend that the full task is done.
Regards
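A minimal sketch (not tested) of one way to make the recursion awaitable, so the outer call only resolves after every page has been visited; it assumes the same node-fetch/cheerio setup as above, swaps urlParser for Node's built-in URL, and leaves out the external-link bookkeeping:

const fetch = require('node-fetch');
const cheerio = require('cheerio');

const visited = {};

const crawlUrl = async (urlCrawl) => {
  if (visited[urlCrawl]) return;
  visited[urlCrawl] = true;

  const response = await fetch(urlCrawl);
  const html = await response.text();
  const $ = cheerio.load(html);

  const { hostname } = new URL(urlCrawl);
  const internalLinks = $('a')
    .map((i, link) => link.attribs.href)
    .get()
    .filter(link => link && link.includes(hostname));

  // Await every child crawl, so this promise only resolves
  // once the whole subtree below urlCrawl has been visited.
  await Promise.all(internalLinks.map(link => crawlUrl(link)));
};

// The caller can now await the full crawl and then notify the frontend.
crawlUrl('https://example.com').then(() => console.log('Crawl finished'));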

Asynchronous function in Node.js API not working as intended

As an exercise, I'm creating a simple API that allows users to provide a search term to retrieve links to appropriate news articles across a collection of resources. The relevant function and the route handler that uses it are as follows:
function GetArticles(searchTerm) {
  const articles = [];
  // Loop through each resource
  resources.forEach(async resource => {
    const result = await axios.get(resource.address);
    const html = result.data;
    // Use Cheerio: load the html document and create Cheerio selector/API
    const $ = cheerio.load(html);
    // Filter html to retrieve appropriate links
    $(`a:contains(${searchTerm})`, html).each((i, el) => {
      const title = $(el).text();
      let url = $(el).attr('href');
      articles.push({
        title: title,
        url: url,
        source: resource.name
      });
    });
  });
  return articles; // Empty array is returned
}
And the route handler that uses the function:
app.get('/news/:searchTerm', async (req, res) => {
  const searchTerm = req.params.searchTerm;
  const articles = await GetArticles(searchTerm);
  res.json(articles);
});
The problem I'm getting is that the returned "articles" array is empty. However, if I'm not "looping over each resource" as commented in the beginning of GetArticles, but instead perform the main logic on just a single "resource", "articles" is returned with the requested data and is not empty. In other words, if the function is the following:
async function GetArticles(searchTerm) {
  const articles = [];
  const result = await axios.get(resources[0].address);
  const html = result.data;
  const $ = cheerio.load(html);
  $(`a:contains(${searchTerm})`, html).each((i, el) => {
    const title = $(el).text();
    let url = $(el).attr('href');
    articles.push({
      title: title,
      url: url,
      source: resources[0].name
    });
  });
  return articles; // Populated array
}
Then "articles" is not empty, as intended.
I'm sure this has to do with how I'm dealing with the asynchronous nature of the code. I've tried refreshing my knowledge of asynchronous programming in JS but I still can't quite fix the function. Clearly, the "articles" array is being returned before it's populated, but how?
Could someone please help explain why my GetArticles function works with a single "resource" but not when looping over an array of "resources"?
Try this
function GetArticles(searchTerm) {
  return Promise.all(resources.map(resource => axios.get(resource.address)))
    .then(responses => responses.flatMap((result, index) => {
      const html = result.data;
      // Use Cheerio: load the html document and create Cheerio selector/API
      const $ = cheerio.load(html);
      let articles = [];
      // Filter html to retrieve appropriate links
      $(`a:contains(${searchTerm})`, html).each((i, el) => {
        const title = $(el).text();
        let url = $(el).attr('href');
        articles.push({
          title: title,
          url: url,
          source: resources[index].name
        });
      });
      return articles;
    }));
}
The problem in your implementation was here:
resources.forEach(async resource...
You defined the callback as async, but forEach does not wait for the promises those async callbacks return; GetArticles returns before any of them finish.
So your array will always be empty.
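For comparison, an equivalent async/await version (same resources, axios and cheerio as in the question) that awaits each request in turn with a for...of loop:

const axios = require('axios');
const cheerio = require('cheerio');

async function GetArticles(searchTerm) {
  const articles = [];
  // Unlike forEach, for...of lets us await each request before moving on.
  for (const resource of resources) {
    const result = await axios.get(resource.address);
    const $ = cheerio.load(result.data);
    $(`a:contains(${searchTerm})`).each((i, el) => {
      articles.push({
        title: $(el).text(),
        url: $(el).attr('href'),
        source: resource.name
      });
    });
  }
  return articles;
}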

Fetching an array of URLs: not receiving all data

I have an array of URLs. I want to fetch every element of that array in order, but when I run the code I get the results in the wrong order and some elements are missing.
const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var chocolateList = [];
urlArray = [...] // 95 elements
urlArray.forEach((url, index) => {
  fetch(url).then(res => res.text())
    .then(async (success) => {
      const dom = new JSDOM(success); // convert response into DOM
      dom.window.document.querySelectorAll('h1')
        .forEach((htmlH1, i) => { // get all h1 html tags
          if (htmlH1.includes('chocolate')) {
            chocolateList.push({ name: htmlH1 });
          }
        });
      console.log(chocolateList);
    });
});
It outputs an array of approximately 20 elements instead of 95.
What am I doing wrong?
Try this code, it works. You made a few mistakes in the code above; I've explained them in the comments.
const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
var chocolateList = [];
urlArray = ['https://www.example.com', 'https://www.example2.com'];
urlArray.forEach((url) => {
  fetch(url).then(res => res.text())
    .then(async (success) => {
      const dom = new JSDOM(success);
      // h1 tags array length
      let len = dom.window.document.querySelectorAll('h1').length;
      // h1 tags array
      let h1Arr = dom.window.document.querySelectorAll('h1');
      // Check the length of the h1 tags array
      if (len) {
        h1Arr.forEach((htmlH1, i) => {
          let h1Text = htmlH1.textContent;
          // The content you receive contains a lot of new-line and carriage-return characters,
          // so sanitize the data first and convert it to lower case, because includes() is case sensitive.
          h1Text = h1Text.replace(/[\r\n]/g, '').trim().toLowerCase();
          if (h1Text.includes('chocolate')) {
            chocolateList.push({ name: h1Text });
          }
        });
      } else {
        console.log('No h1 element present on DOM');
      }
      console.log(chocolateList);
    });
});
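If the order of the results matters as well, a small sketch (same urlArray and JSDOM setup as above) that collects everything with Promise.all, whose resolved values come back in the same order as the input array:

const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

async function getChocolateList(urlArray) {
  // Promise.all resolves to the responses in the same order as urlArray.
  const pages = await Promise.all(
    urlArray.map(url => fetch(url).then(res => res.text()))
  );

  const chocolateList = [];
  pages.forEach(html => {
    const dom = new JSDOM(html);
    dom.window.document.querySelectorAll('h1').forEach(htmlH1 => {
      const h1Text = htmlH1.textContent.replace(/[\r\n]/g, '').trim().toLowerCase();
      if (h1Text.includes('chocolate')) {
        chocolateList.push({ name: h1Text });
      }
    });
  });
  return chocolateList;
}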

All my scraped text ends up in one big object instead of separate objects with Cheerio

I'm following a web scraping course that uses Cheerio. I'm practicing on a different website than the one used in the course, and now I've run into the problem that all my scraped text ends up in one big object, while every title should end up in its own object. Can someone see what I did wrong? I've already bumped my head against this problem for two hours.
const request = require('request-promise');
const cheerio = require('cheerio');
const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];
async function scrapeHuurgoed() {
  try {
    const htmlResult = await request.get(url);
    const $ = await cheerio.load(htmlResult);
    $("div.aanbod").each((index, element) => {
      const result = $(element).children(".item");
      const title = result.find("h2").text().trim();
      const characteristics = result.find("h4").text();
      const scrapeResult = {title, characteristics};
      scrapeResults.push(scrapeResult);
    });
    console.log(scrapeResults);
  } catch(err) {
    console.error(err);
  }
}
scrapeHuurgoed();
This is the link to the repo: https://github.com/danielkroon/huurgoed-scraper/blob/master/index.js
Thanks!
That is because of the way you used your selectors: $("div.aanbod") matches the surrounding wrapper, so result.find("h2") collects the h2 of every item at once and .text() concatenates them into one string. I've modified your script to loop over the individual div.item elements instead. Currently the script collects titles and characteristics; feel free to add the rest within your script.
This is how you can get the required output:
const request = require('request-promise');
const cheerio = require('cheerio');
const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];
async function scrapeHuurgoed() {
  try {
    const htmlResult = await request.get(url);
    const $ = await cheerio.load(htmlResult);
    $("div.item").each((index, element) => {
      const title = $(element).find(".kenmerken > h2").text().trim();
      const characteristics = $(element).find("h4").text().trim();
      scrapeResults.push({title, characteristics});
    });
    console.log(scrapeResults);
  } catch(err) {
    console.error(err);
  }
}
scrapeHuurgoed();
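For reference, the Cheerio behaviour behind the bug: .text() on a selection that matches several elements returns the combined text of all of them, so the lookups have to be scoped to one div.item per iteration. A tiny illustration:

const cheerio = require('cheerio');

const $ = cheerio.load('<div class="item"><h2>A</h2></div><div class="item"><h2>B</h2></div>');

// One selection over all items: .text() concatenates every match.
console.log($('.item').find('h2').text()); // "AB"

// One object per item: scope each lookup to a single element.
$('.item').each((i, el) => {
  console.log($(el).find('h2').text()); // "A", then "B"
});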

node.js Get.Request & Pagination & Async

I'm having a tremendously tough time organizing the flow here, as I'm self-taught, so I'm wondering if someone might be able to assist.
var channelIds = ['XYZ','ABC','QRS']
var playlistIds = [];
var videoIds = [];
ORDER OF PROCESS
1. Get all playlist IDs: if the returned GET request JSON contains a nextPageToken, run the GET request again with that page token before going to (2).
2. Get all video IDs: if the returned GET request JSON contains a nextPageToken, run the GET request again with that page token before going to (3).
3. Aggregate into a final array: I need to put everything into an array such as:
var ArrFinal = [{channelId, playlistID, videoId}, {channelId, playlistID, videoId}, {channelId, playlistID, videoId}];
I don't necessarily need someone to write the whole thing. I'm trying to better understand the most efficient way to know when the previous step is done, but also handle the nextPageToken iteration.
I'm not familiar with the YouTube API, but what you basically need is a get function for each endpoint. That function should also take care of the nextPageToken.
Something like this (not tested):
'use strict';

const Promise = require('bluebird');
const request = Promise.promisifyAll(require('request'));

const playlistEndpoint = '/youtube/v3/playlists';
const baseUrl = 'https://www.googleapis.com';

const channelIds = ['xy', 'ab', 'cd'];

const getPlaylist = async (channelId, pageToken, playlists) => {
  const url = `${baseUrl}${playlistEndpoint}`;
  const qs = {
    channelId,
    maxResults: 25,
    pageToken
  };
  try {
    const playlistRequest = await request.getAsync({ url, qs });
    const nextPageToken = playlistRequest.body.nextPageToken;
    // if we already had items, combine with the new ones
    const items = playlists ? playlists.concat(playlistRequest.body.items) : playlistRequest.body.items;
    if (nextPageToken) {
      // if there is a token, do the same again and pass the results along
      return getPlaylist(channelId, nextPageToken, items);
    }
    // if there is no token, we are finished
    return items;
  } catch (e) {
    console.log(e.message);
  }
};

const getVideos = async (playlistId, pageToken, videos) => {
  // pretty much the same as above
};

async function awesome(channelIds) {
  const fancyArray = [];
  await Promise.map(channelIds, async (channelId) => {
    const playlists = await getPlaylist(channelId);
    const videos = await Promise.map(playlists, async (playlistId) => {
      const videos = await getVideos(playlistId);
      videos.forEach(videoId => {
        fancyArray.push({ channelId, playlistId, videoId });
      });
    });
  });
  return fancyArray;
}

awesome(channelIds);
UPDATE
This may be a lot of concurrent requests; you can limit them with Bluebird's concurrency option:
Promise.map(items, item => { somefunction(); }, { concurrency: 5 });
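Applied to the awesome function above, that could look like the following sketch, where processChannel is a hypothetical name for the per-channel body shown earlier:

async function awesome(channelIds) {
  const fancyArray = [];
  // processChannel is a hypothetical helper wrapping the per-channel work from the sketch above.
  await Promise.map(
    channelIds,
    channelId => processChannel(channelId, fancyArray),
    { concurrency: 5 } // no more than 5 channels are processed at the same time
  );
  return fancyArray;
}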
