How to scrape multi-level links using puppeteer js? - node.js

I am scraping table rows of site page using Puppeteer. I have the code to scrape content and assign them to an object for each in the table. In each table row there is a link that I need to open in a new page (puppeteer) and then scrape for a particular element then assign it to the same object and return the whole object with the new keys to puppeteer. How is that possible with Puppeteer?
/**
 * Opens the tokenmarket.net blockchain table, builds one object per matched
 * row link, logs the objects and hands them to UpcomingIco.insertMany.
 */
async function run() {
const browser = await puppeteer.launch({
headless: false
})
const page = await browser.newPage()
await page.goto('https://tokenmarket.net/blockchain/', {waitUntil: 'networkidle0'})
await page.waitFor(5000)
// NOTE(review): this variable is declared in the Node script, while the $$eval
// callback below is executed inside the page — the `onlink` used there is
// presumably not this variable; confirm.
var onlink = ''
var result = await page.$$eval('table > tbody tr .col-actions a:first-child', (els) => Array.from(els).map(function(el) {
//running ajax requests to load the inner page links.
// NOTE(review): the $.get callback can only run after the request completes,
// i.e. after the object below has already been returned, so `assets` cannot
// hold the text fetched here.
$.get(el.children[0].href, function(response) {
onlink = $(response).find('#page-wrapper > main > div.container > div > table > tbody > tr > td:nth-child(2)').text()
})
return {
icoImgUrl: el.children[0].children[0].children[0].currentSrc,
// null when the first line of the date cell reads 'To be announced', otherwise an ISO timestamp
icoDate: el.children[2].innerText.split('\n').shift() === 'To be announced' ? null : new Date( el.children[2].innerText.split('\n').shift() ).toISOString(),
icoName:el.children[1].children[0].innerText,
link:el.children[1].children[0].children[0].href,
description:el.children[3].innerText,
assets :onlink
}
}))
console.log(result)
// The callback body is empty, so an insert error is silently ignored.
UpcomingIco.insertMany(result, function(error, docs) {})
// Not awaited, and issued without waiting for the insertMany callback.
browser.close()
}
run()

If you try opening a new tab for each ICO page in parallel you might end up with 100+ pages loading at the same time.
So the best thing you could do is to first collect the URLs and then visit them one by one in a loop.
This also allows keeping the code simple and readable.
For example (please, see my comments):
// Walks blockchain list -> assets pages -> ICO pages sequentially on a single
// tab and collects one flat record per ICO into `results`.
const browser = await puppeteer.launch({ headless: false });
const results = [];
try {
  const page = await browser.newPage();
  await page.goto('https://tokenmarket.net/blockchain/');

  // Gather assets page urls for all the blockchains
  const assetUrls = await page.$$eval(
    '.table-assets > tbody > tr .col-actions a:first-child',
    assetLinks => assetLinks.map(link => link.href)
  );

  // Visit each assets page one by one
  for (const assetsUrl of assetUrls) {
    await page.goto(assetsUrl);

    // Now collect all the ICO urls.
    const icoUrls = await page.$$eval(
      '#page-wrapper > main > div.container > div > table > tbody > tr > td:nth-child(2) a',
      links => links.map(link => link.href)
    );

    // Visit each ICO one by one and collect the data.
    for (const icoUrl of icoUrls) {
      await page.goto(icoUrl);
      const icoImgUrl = await page.$eval('#asset-logo-wrapper img', img => img.src);
      const icoName = await page.$eval('h1', h1 => h1.innerText.trim());
      // TODO: Gather all the needed info like description etc here.

      // Push the record itself (not a one-element array) so `results` is a
      // flat array of objects, ready for e.g. insertMany.
      results.push({
        icoName,
        icoUrl,
        icoImgUrl
      });
    }
  }
} finally {
  // Close the browser even when a navigation or selector lookup throws.
  await browser.close();
}

// Results are ready
console.log(results);

Related

Page returning same results even though the results are different each time. (Node, puppeteer, cheerio)

Im scraping a dynamic website, which doesn't instantly load a list full of items.
I'm waiting until that content loads then i'm getting the pagination list and extracting the last items value for the page count.
once this is done i'm using a for loop to go through this website x amount of times based on the page count for that list of items.
once the first loop is complete and the data is gathered... i click the next button to move on to the next page, i await the data to change, then go back through the for loop to gather the new data in that list.
For some reason, the results returned is the same for each page, even though they're not the same, and i can see with using headless: false that the data is in-fact changing so why isn't it returning that new data?
`
// Loads `url`, snapshots the page HTML into cheerio once, then loops
// `pageCount` times printing item names and clicking the "next" button.
try {
const browser = await puppeteer.launch(
{
headless: false
}
);
const page = await browser.newPage();
await page.goto(url, {
waitUntil: "networkidle0",
}).catch((err) => console.log("error loading url", err));
await page.waitForSelector(selector);
// The HTML is captured exactly once, here, before the loop starts.
const pageData = await page.evaluate(() => {
return {
html: document.documentElement.innerHTML,
}
});
// `$` is built from that single snapshot and is never rebuilt inside the loop,
// so every iteration below queries the same first-page markup.
const $ = cheerio.load(pageData.html);
for (let i = 0; i < pageCount; i++) {
await page.waitForSelector(selector);
// The callback's `i` shadows the loop counter; nothing in it is awaited.
$('.itemDivWrapper > div').each(async (i, g) => {
let itemName = $(g).find('a > div > h3').text();
console.log(itemName);
});
await Promise.all([
page.$eval('div.itemList > nav > ul > li.nextButton > a', element =>
element.click()
),
// NOTE(review): a statement terminated with `;` inside an array literal does
// not parse; as posted this snippet is not valid JavaScript.
await page.waitForTimeout(5000);
]);
}
await browser.close();
} catch (error) {
console.log(error);
}
`
Maybe i'm just doing something silly wrong :'D and fresh eyes are needed :D.
Expect each loop to provide the new page data, which 100% showed before it tried to get that data. but returns the same values each time.

Using Puppeteer to collect links of a page and open those links to scrape data

I have to scrape data of products from a list, but the data resides in the pages of those products. For example, let's say I'd like to fetch the .product-image of the product page.
The code so far successfully fetches each URL and adds to an array, but I'm not sure where to go from here, as using page returns Error: Evaluation failed: ReferenceError: page is not defined when using it outside of urls for some reason.
const page = await browser.newPage();
page.waitForNavigation({ timeout: 0, waitUntil: "domcontentloaded" });
await page.goto(siteSearchUrl + 1);
await page.screenshot({ path: "carpet.png" });
const urls = await page.evaluate(() => {
const items = document.querySelectorAll("li.product-item");
const productLinks = [];
console.log(page);
items.forEach((item) => {
const productLink = item
.querySelector("a.product-image") // NOT the product image, it's the link.
.getAttribute("href");
productLinks.push(`${siteUrl + productLink}`);
});
productLinks.forEach((link) => {
page.screenshot({ path: link + ".png" });
}, page);
});
await browser.close();
return Promise.resolve(urls);
})();
How exactly do I collect the data from each of the links?
It is not possible to use page inside of page.evaluate.
You get ReferenceError: page is not defined because page.evaluate executes the script in the page context (e.g. your console.log(page) is logged to Chromium's console, not to Node's console). Other Puppeteer methods (like page.screenshot) cannot be run on the client side; they are only available in the Node script. So you need to move them outside of page.evaluate.
E.g.:
// Sketch only: the `...` lines stand for code elided from the question.
// Inside page.evaluate just build and return the list of links.
const urls = await page.evaluate(() => {
const productLinks = []
...
productLinks.push(...)
...
return productLinks
})
// `page` is used here, outside the evaluate callback: visit each returned URL
// in turn and take a screenshot of it.
for (const url of urls) {
await page.goto(url)
await page.screenshot({ path: url + '.png' })
}
Edit
I fixed the example above. You may find you are able to solve the problem by using page.$eval, page.$$eval more concisely.
// The `href` property (unlike the raw attribute) is already resolved to an
// absolute URL by the browser, so it must not be prefixed with siteUrl again.
const urls = await page.$$eval('li.product-item > a.product-image', elements => elements.map(el => el.href))
for (const url of urls) {
  await page.goto(url)
  // A URL contains ':' and '/', which cannot be used as-is in a file name,
  // so collapse every run of non-alphanumeric characters into '_'.
  const fileName = url.replace(/[^a-z0-9]+/gi, '_') + '.png'
  await page.screenshot({ path: fileName })
}

puppeteer to cheerio scraping from dynamic website for specific data

i wanted to scrape certain data from a mutual fund website where i can track only selective funds instead of all of them.
so i tried to puppeteer to scrape the dynamic table generated by the website. I manage to get the table but when i try to parse it to cheerio, seems like nothing happen
/**
 * Opens the Public Mutual fund-prices page, pulls the innerHTML of the matched
 * container(s) out of the browser, then tries to print every table cell via
 * cheerio.
 * `username` is accepted but never read in this function.
 */
const scrapeImages = async (username) => {
console.log("test");
const browser = await puppeteer.launch({
args: ['--no-sandbox']
});
const page = await browser.newPage();
await page.goto('https://www.publicmutual.com.my/Our-Products/UT-Fund-Prices');
// Fixed 5 s pause before reading the DOM.
await page.waitFor(5000);
const data = await page.evaluate( () => {
const tds = Array.from(document.querySelectorAll('div.form-group:nth-child(4) > div:nth-child(1) > div:nth-child(1)'))
// Returns an ARRAY of innerHTML strings, one per matched element.
return tds.map(td => td.innerHTML)
});
await browser.close();
console.log(data);
// NOTE(review): `data` is an array here; cheerio.load presumably expects a
// single HTML string — confirm (e.g. pass data[0] or data.join('')).
let $ = cheerio.load(data);
$('table > tbody > tr > td').each((index, element) => {
console.log($(element).text());
});
};
scrapeImages("test");
ultimately i am not sure how can i do this directly with puppeteer only instead of directing to cheerio for the scraping and also i would like to scrape only selected funds for instance, if you visit the web here https://www.publicmutual.com.my/Our-Products/UT-Fund-Prices
i would like to get only funds from abbreviation
PAIF
PAGF
PCIF
instead of all of them. not sure how can i do this with only puppeteer?
That page has jQuery already which is even better than cheerio:
// Runs inside the page and uses the page's own jQuery: every row of the fund
// table becomes an array holding the text of each of its cells.
const rows = await page.evaluate(() => {
  return $('.fundtable tr').get().map(tr => $(tr).find('td').get().map(td => $(td).text()))
})
// Keep only the funds asked about.
// NOTE(review): assumes one cell of a row holds exactly the abbreviation
// (ignoring surrounding whitespace) — confirm against the live table.
const wantedFunds = new Set(['PAIF', 'PAGF', 'PCIF'])
const selectedRows = rows.filter(cells => cells.some(cell => wantedFunds.has(cell.trim())))

Click on random Google Search result using NodeJS and Puppeteer?

I'm attempting on making a small script to click on a random Google Search result after searching "'what is ' + Word." Nothing I've done has been able to get me the results I want, heck, I can't even get the script to click a single Google Search result!
I've tried doing multiple things here, such as collecting all search results in an array and clicking a random one (didn't collect into an array), clicking an element by partial text (https:// brought no results), and many other solutions that work in Python, but don't work here.
const puppeteer = require('puppeteer');
const searchbar = "#tsf > div:nth-child(2) > div > div.RNNXgb > div > div.a4bIc > input"
/**
 * Opens Google, types "what is <random word from words.txt>" into the search
 * box and then tries to click the results from inside the page.
 */
async function gsearch() {
const browser = await puppeteer.launch({headless:false, args:['--no-sandbox', '--disable-setuid-sandbox']});
const page = await browser.newPage();
await page.goto('https://google.com');
var fs = require("fs");
// One candidate word per line of words.txt.
var array = fs.readFileSync("words.txt").toString().split('\n');
var random = array[Math.floor(Math.random() * array.length)]
await page.click(searchbar)
// The query is typed, but nothing below presses Enter or clicks a search button.
await page.keyboard.type("what is " + random);
await page.waitFor(1000);
await page.evaluate(() => {
// 'LC20lb' has no leading '.', so it is a tag-name selector rather than a class selector.
// NOTE(review): `$` here is whatever the page itself defines — confirm it is jQuery.
let elements = $('LC20lb').toArray();
// Clicks every match instead of a random one; `i` is assigned without a declaration.
for (i = 0; i < elements.length; i++) {
$(elements[i]).click();
}
})
}
gsearch();
(ignore any indent-inheritant errors, I swear it looks cleaner in VSC)
Expected to click a random search result. End up getting nothing done, maybe an error or two but that's about it.
LC20lb is not an HTML tag; it is the class name of the h3 result titles, so the selector should be h3.LC20lb. Also, by using $() are you trying to select elements with jQuery? Use document.querySelectorAll() instead.
const puppeteer = require('puppeteer');
const fs = require("fs");
/**
 * Searches Google for "what is <word>", where <word> is a random non-blank
 * line of words.txt, then clicks one randomly chosen result title.
 *
 * @returns {Promise<void>} resolves once the click has been dispatched in the page.
 */
async function gsearch() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  const page = await browser.newPage();
  await page.goto('https://google.com');

  // Drop blank lines (e.g. the one produced by a trailing newline) so the
  // query is never just "what is ".
  const words = fs.readFileSync("words.txt").toString()
    .split('\n')
    .map((word) => word.trim())
    .filter((word) => word !== '');
  const random = words[Math.floor(Math.random() * words.length)];

  // simple selector for search box
  await page.click('[name=q]');
  await page.keyboard.type("what is " + random);
  // submit the query
  await page.keyboard.press('Enter');
  // wait for search results
  await page.waitForSelector('h3.LC20lb', {timeout: 10000});

  await page.evaluate(() => {
    const elements = document.querySelectorAll('h3.LC20lb');
    // Valid indexes are 0 .. length - 1. Adding 1 here would skip the first
    // result and could index one past the end of the list.
    const randomIndex = Math.floor(Math.random() * elements.length);
    elements[randomIndex].click();
  });
}

Overcoming pagination when using puppeteer (library) for web-scraping

I am using Puppeteer to build a basic web-scraper and so far I can return all the data I require from any given page, however when pagination is involved my scraper comes unstuck (only returning the 1st page).
See example - this returns Title/Price for 1st 20 books, but doesn't look at the other 49 pages of books.
Just looking for guidance on how to overcome this - I can't see anything in the docs.
Thanks!
const puppeteer = require('puppeteer');
/**
 * Opens the books.toscrape.com landing page and returns the title and price
 * of every book listed on that one page.
 *
 * @returns {Promise<Array<{title: string, price: string}>>}
 */
let scrape = async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('http://books.toscrape.com/');

  // Executed in the browser: map each product card to a plain record.
  const books = await page.evaluate(() =>
    Array.from(document.querySelectorAll('.product_pod'), (pod) => ({
      title: pod.childNodes[5].innerText,
      price: pod.childNodes[7].children[0].innerText,
    }))
  );

  browser.close();
  return books;
};
// Run the scraper and print whatever array it resolves with.
scrape().then((value) => {
console.log(value);
});
To be clear. I am following a tutorial here - this code comes from Brandon Morelli on codeburst.io!! https://codeburst.io/a-guide-to-automating-scraping-the-web-with-javascript-chrome-puppeteer-node-js-b18efb9e9921
I was following same article in order to educate myself on how to use Puppeteer.
The short answer to your question is that you need to introduce one more loop to iterate over all the available pages in the online book catalogue.
I've done following steps in order to collect all book titles and prices:
Extracted page.evaluate part in separate async function that takes page as argument
Introduced for-loop with hardcoded last catalogue page number (you can extract it with help of Puppeteer if you wish)
Placed async function from step one inside a loop
Same exact code from Brandon Morelli article, but now with one extra loop:
const puppeteer = require('puppeteer');
/**
 * Walks the books.toscrape.com catalogue page by page and returns the title
 * and price of every book, in catalogue order.
 *
 * Follows the "next" link until it disappears (or until `maxPages` pages have
 * been read), and waits for each navigation to finish instead of sleeping for
 * a fixed time.
 *
 * @returns {Promise<Array<{title: string, price: string}>>}
 */
let scrape = async () => {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto('http://books.toscrape.com/');

    const nextLinkSelector =
      '#default > div > div > div > div > section > div:nth-child(2) > div > ul > li.next > a';
    // Safety cap on the number of catalogue pages read; the loop normally
    // stops earlier, as soon as a page has no "next" link.
    const maxPages = 50;
    const results = []; // flat collection of all book titles and prices

    for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
      // Spread into push so the result stays flat rather than a list of lists.
      results.push(...(await extractedEvaluateCall(page)));

      if (pageNumber === maxPages) {
        break;
      }
      const nextLink = await page.$(nextLinkSelector);
      if (nextLink === null) {
        // no next button on last page
        break;
      }
      // Start listening for the navigation before clicking, so the load
      // triggered by the click cannot be missed.
      await Promise.all([page.waitForNavigation(), nextLink.click()]);
    }
    return results;
  } finally {
    // Release the browser even when a navigation or extraction fails.
    await browser.close();
  }
};
/**
 * Reads the title and price of every book on the page currently loaded in
 * `page`.
 *
 * @param page object exposing Puppeteer's `evaluate` method
 * @returns promise of an array of `{ title, price }` records, in document order
 */
async function extractedEvaluateCall(page) {
  // This callback is handed to page.evaluate, so it must rely on DOM globals only.
  const collectBooks = () =>
    [...document.querySelectorAll('.product_pod')].map((pod) => ({
      title: pod.childNodes[5].innerText,
      price: pod.childNodes[7].children[0].innerText,
    }));
  return page.evaluate(collectBooks);
}
// Run the scraper, then print the full collection, its size, and its first
// and last entries.
scrape().then((value) => {
console.log(value);
console.log('Collection length: ' + value.length);
console.log(value[0]);
console.log(value[value.length - 1]);
});
Console output:
...
{ title: 'In the Country We ...', price: '£22.00' },
... 900 more items ]
Collection length: 1000
{ title: 'A Light in the ...', price: '£51.77' }
{ title: '1,000 Places to See ...', price: '£26.08' }

Resources