Puppeteer : use second match with page.evaluate - node.js

I'm using Puppeteer to retrieve data online, and I'm facing an issue.
Two functions have the same name and return a serialized object; the first one returns an empty object, but the second one contains the data I'm targeting.
My question is: how can I select the second occurrence of the function instead of the first one, which returns an empty object?
Thanks.
My code :
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const Variants = require('./variants.js');
const Feedback = require('./feedback.js');
/**
 * Scrape a product page and return the items from the window.runParams payload.
 *
 * @param {string} productId - full product-page URL to visit.
 * @param {number} feedbackLimit - unused here; kept for callers that pass it.
 * @returns {Promise<*>} the `items` field of the last runParams payload, or null.
 */
async function Scraper(productId, feedbackLimit) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    /** Scrape page for details */
    await page.goto(`${productId}`);

    // page.evaluate() requires a function argument; to search the raw HTML,
    // fetch the serialized page with page.content() instead.
    const html = await page.content();

    // The site assigns window.runParams twice — first an empty {}, then the
    // real payload — so collect every assignment and keep the LAST match.
    const matches = [...html.matchAll(/window\.runParams = (\{.*?\});/gs)];
    if (matches.length === 0) {
      return null; // payload not present on this page
    }

    // NOTE(review): assumes the captured text is valid JSON — confirm against
    // the live page, since the quoted source above looks truncated.
    const result = JSON.parse(matches[matches.length - 1][1]);
    const data = result.items; // original redeclared `const data` here (SyntaxError)

    console.log(data);
    return data;
  } finally {
    // Always release the browser, even when navigation or parsing throws.
    await browser.close();
  }
}
module.exports = Scraper;
Website source code :
window.runParams = {};
window.runParams = {"resultCount":19449,"seoFeaturedSnippet":};

Please try this, it should work.
// Grab the page's serialized HTML, then walk every occurrence of the
// window.runParams assignment; match.index distinguishes the first (empty)
// assignment from the second (populated) one.
const data = await page.content();
const regexp = /window\.runParams/g; // escape the dot so it matches literally
const matches = data.matchAll(regexp); // was `string.matchAll` — `string` is undefined
for (const match of matches) {
  console.log(match);
  console.log(match.index);
}

Related

Trying to webscrape using Puppeteer in NodeJS. I keep getting this error while scraping however. I copied the XPATH of the element exactly how it is

TimeoutError: Waiting for selector .//*[#id="main-content"]/section[1]/div/div/section[1]/div/div[2]/section/div/ul[2]/li[1] failed: Waiting failed: 30000ms exceeded
This is the error I keep getting. I am copying the XPath exactly as it is. I am currently trying to scrape job postings for fun on LinkedIn, and this is the "job requirements" section. I am only trying to get the first list item, and it keeps saying there's an error.
Here is my code:
const puppeteer = require('puppeteer');
const xlsx = require('xlsx');
// Wait for an XPath, then return its textContent. Resolves to '' when the
// element never appears, so one missing field skips instead of aborting the
// whole scrape (the original crashed with a TimeoutError on the last XPath).
async function textByXPath(page, xpath, timeout = 30000) {
  try {
    await page.waitForXPath(xpath, { timeout });
    const [handle] = await page.$x(xpath);
    const src = await handle.getProperty('textContent');
    return await src.jsonValue();
  } catch (err) {
    return ''; // selector timed out or vanished — skip this field
  }
}

/**
 * Scrape one LinkedIn job posting.
 *
 * NOTE: XPath attribute tests must use "@" (e.g. //*[@id="main-content"]);
 * the original used "#", which is not valid XPath syntax. The stray ** bold
 * markers that had leaked into the code were removed — they are syntax errors.
 *
 * @param {string} url - job-posting URL.
 * @returns {Promise<object>} the scraped posting fields ('' for missing ones).
 */
async function webScraper(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const JobLink = url;
    await page.goto(url);

    const positionName = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h1');
    const JobType = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[3]/span');
    const companyName = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h4/div[1]/span[1]/a');
    const industryType = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[4]/span');
    const committment = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/div/section[1]/div/ul/li[2]/span');
    const joblocation = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/section[2]/div/div[1]/div/h4/div[1]/span[2]');
    const jobRequirements = await textByXPath(page, '//*[@id="main-content"]/section[1]/div/div/section[1]/div/div[2]/section/div/ul[2]/li[1]');

    return { positionName, JobType, JobLink, companyName, industryType, committment, joblocation, jobRequirements };
  } finally {
    await browser.close(); // the original never closed the browser (leak)
  }
}
I tried going to another job posting, but the XPath is exactly the same. Also, I was wondering if there is any way to make it so that if the browser isn't able to wait for an XPath, it will just skip over it and continue on to the next piece of data, since I have to restart my script every time something like this happens.

How to evaluate a relative XPath inside another XPath in Puppeteer?

Here is my code where I have got the element Handle of some target divs
const puppeteer = require("puppeteer");
(async () => {
  // NOTE(review): Google Maps URLs use "@lat,lng,zoom" — the "#" here looks
  // like a copy/paste artifact; confirm against the original URL.
  const searchString = 'https://www.google.com/maps/search/restaurants/@-6.4775265,112.057849,3.67z';
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(searchString);
    // XPath attribute tests use "@aria-label"; "#" is not valid XPath syntax.
    const xpath_expression = '//div[contains(@aria-label, "Results for")]/div/div[./a]';
    await page.waitForXPath(xpath_expression);
    const targetDivs = await page.$x(xpath_expression);
  } finally {
    await browser.close(); // was missing — the browser process leaked
  }
})();
I have two relative XPath links inside these target Divs which contain related data
'link' : './a/#href'
'title': './a/#aria-label'
I have a sample of similar python code like this
from parsel import Selector
response = Selector(page_content)
results = []
for el in response.xpath('//div[contains(#aria-label, "Results for")]/div/div[./a]'):
results.append({
'link': el.xpath('./a/#href').extract_first(''),
'title': el.xpath('./a/#aria-label').extract_first('')
})
How to do it in puppeteer?
I think you can get the href and ariaLabel property values with e.g.
const targetDivs = await page.$x(xpath_expression);
// Use for...of, not forEach: an async forEach callback is fire-and-forget,
// so the awaits inside were never actually waited on and rejections were lost.
let pos = 0;
for (const div of targetDivs) {
  // XPath attribute test is [@href]; "#" is not valid XPath syntax.
  const links = await div.$x('./a[@href]');
  const href = await (await links[0].getProperty('href')).jsonValue();
  const ariaLabel = await (await links[0].getProperty('ariaLabel')).jsonValue();
  console.log(pos, href, ariaLabel);
  pos += 1;
}
These are the element properties, not the attribute values, which, in the case of href, might for instance mean you get an absolute instead of a relative URL but I haven't checked for that particular page whether it makes a difference. I am not sure the $x allows direct attribute node or even string value selection, the documentation only talks about element handles.

Error: Evaluation failed: ReferenceError: _babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_1___default is not defined

I am getting this error, when I try to run the script (which uses webpack)
Error: Evaluation failed: ReferenceError: _babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_1___default is not defined at __puppeteer_evaluation_script__:2:27
but when I run same code which doesn't use webpack I got the expected result.
here is my function.
/**
 * Scrape product cards from MEENACLICK matching `title` and keep only the
 * ones that have a price.
 *
 * @param {string} title - search term, also used to filter results.
 * @returns {Promise<Array<{title:string,price:string,image:string,link:string}>>}
 */
const getMeenaClickProducts = async (title) => {
  const url = ` ${MEENACLICK}/${title}`;
  console.log({ url });
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    await page.waitForSelector('.ant-pagination-total-text');
    const products = await page.evaluate(() => {
      // Use Array.from instead of the spread operator: babel/webpack rewrites
      // [...x] into a runtime helper (_babel_runtime_helpers_toConsumableArray...)
      // that does not exist inside the page context — that is the ReferenceError.
      const cards = Array.from(document.querySelectorAll('.card-thumb'));
      console.log({ cards }); // logs in the PAGE console, not the Node console
      return cards.map((card) => {
        const productTitle = card.querySelector('.title').innerText;
        const priceElement = card.querySelector('.reg-price');
        const price = priceElement ? priceElement.innerText : '';
        const image = card.querySelector('.img').src;
        const link = card.querySelector('.main-link').href;
        return {
          title: productTitle,
          price,
          image,
          link,
        };
      });
    });
    const filteredProducts = products
      .filter((product) =>
        product.title.toLowerCase().includes(title.toLowerCase())
      )
      .filter((item) => item.price); // drop cards without a price
    return filteredProducts;
  } finally {
    await browser.close(); // original leaked the browser when evaluate threw
  }
};
what could be the reason?
The problem is with Babel, and with this part:
// Excerpt of the failing code: everything inside page.evaluate() is first
// transformed by babel/webpack BEFORE being shipped to the page context.
const products = await page.evaluate(() => {
// This spread is the culprit — babel rewrites it into a webpack runtime
// helper that is not defined inside the browser page, hence the ReferenceError.
const cards = [...document.querySelectorAll('.card-thumb')];
console.log({ cards });
return cards.map((card) => {
const productTitle = card.querySelector('.title').innerText;
const priceElement = card.querySelector('.reg-price');
const price = priceElement ? priceElement.innerText : '';
const image = card.querySelector('.img').src;
const link = card.querySelector('.main-link').href;
return {
title: productTitle,
price,
image,
link,
};
});
});
The inside of the page.evaluate() script you are passing as a function parameter, is not the actual code that is being passed to the page instance, because first you are using babel to transform it.
The array spread operator you have in this part:
const cards = [...document.querySelectorAll('.card-thumb')];
Is most likely being transformed in your build to a function named _babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_1___default, which is then passed to the puppeteer page context, and ultimately executed in that page. But such function is not defined in that context, that's why you get a ReferenceError.
Some options to fix it:
Don't use the spread operator combined with the current Babel config you are using, so the transformed build doesn't include a polyfill/replacement of it. Think of a replacement with an equivalent effect, such as:
const cards = Array.from(document.querySelectorAll('.card-thumb'));
Or use more traditional for / forEach() loops and build up the array yourself; that will get the job done.
Update your babel config / target language level to support the spread operator natively.

All my scraped text ends up in one big object instead of separate objects with Cheerio

I'm following a web-scraping course that uses Cheerio. I practice on a different website than the one they use in the course, and now I run into the problem that all my scraped text ends up in one big object. But every title should end up in its own object. Can someone see what I did wrong? I have already bumped my head against this problem for two hours.
const request = require('request-promise');
const cheerio = require('cheerio');
const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];
// Scrape huurgoed.nl listings into scrapeResults.
// BUG this question is about: "div.aanbod" is the ONE wrapper around all
// listings, so the loop body runs once; `result` is then the set of EVERY
// ".item", and .text() concatenates the text of all of them — producing a
// single merged object. Iterating "div.item" instead gives one per listing.
async function scrapeHuurgoed() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
// Matches the single container, not each listing — see note above.
$("div.aanbod").each((index, element) => {
const result = $(element).children(".item");
// find()/text() over a multi-element set concatenates all matches.
const title = result.find("h2").text().trim();
const characteristics = result.find("h4").text();
const scrapeResult = {title, characteristics};
scrapeResults.push(scrapeResult);
});
console.log(scrapeResults);
} catch(err) {
console.error(err);
}
}
scrapeHuurgoed();
This is the link to the repo: https://github.com/danielkroon/huurgoed-scraper/blob/master/index.js
Thanks!
That is because of the way you used selectors. I've modified your script to fetch the content as you expected. Currently the script is collecting titles and characteristics. Feel free to add the rest within your script.
This is how you can get the required output:
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

// Collect one {title, characteristics} record per "div.item" property card
// on the listing page, then print the collected array.
async function scrapeHuurgoed() {
  try {
    const body = await request.get(url);
    const $ = await cheerio.load(body);
    $("div.item").each((i, card) => {
      const node = $(card);
      const title = node.find(".kenmerken > h2").text().trim();
      const characteristics = node.find("h4").text().trim();
      scrapeResults.push({ title, characteristics });
    });
    console.log(scrapeResults);
  } catch (err) {
    console.error(err);
  }
}

scrapeHuurgoed();

Get href attribute in Puppeteer Node.js

I know the common methods such as evaluate for capturing the elements in puppeteer, but I am curious why I cannot get the href attribute in a JavaScript-like approach as
const page = await browser.newPage();
await page.goto('https://www.example.com');
// page.$$ resolves to Puppeteer ElementHandle objects, not DOM elements, so
// neither line in the loop behaves like in-page JavaScript:
// getAttribute is not a function on an ElementHandle, and .href is undefined.
let links = await page.$$('a');
for (let i = 0; i < links.length; i++) {
console.log(links[i].getAttribute('href'));
console.log(links[i].href);
}
await page.$$('a') returns an array of ElementHandles — these are objects with their own Puppeteer-specific API; they do not have the usual DOM API of HTML elements or DOM nodes. So you need to either retrieve attributes/properties in the browser context via page.evaluate() or use the rather complicated ElementHandle API. This is an example with both ways:
'use strict';
const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto('https://example.org/');

    // Way 1: read the attribute values inside the page context.
    const hrefs1 = await page.evaluate(() =>
      Array.from(document.querySelectorAll('a[href]'), (a) =>
        a.getAttribute('href')
      )
    );

    // Way 2: walk ElementHandles and pull the href property of each one.
    const anchors = await page.$$('a');
    const hrefs2 = await Promise.all(
      anchors.map(async (anchor) => {
        const prop = await anchor.getProperty('href');
        return prop.jsonValue();
      })
    );

    console.log(hrefs1, hrefs2);
    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
const yourHref = await page.$eval('selector', anchor => anchor.getAttribute('href'));
but if are working with a handle you can
const handle = await page.$('selector');
const yourHref = await page.evaluate(anchor => anchor.getAttribute('href'), handle);
I don't know why it's such a pain, but this was found when I encountered this a while ago.
// Resolve to the href property of every element on `page` matching `selector`.
async function getHrefs(page, selector) {
  return page.$$eval(selector, (anchors) =>
    Array.prototype.map.call(anchors, (a) => a.href)
  );
}
A Type safe way of returning an array of strings as the hrefs of the links by casting using the HTMLLinkElement generic for TypeScript users:
await page.$$eval('a', (anchors) => anchors.map((link) => (link as HTMLLinkElement).href));
A simple way to get an href from an anchor element
Say you fetched an anchor element with the following
const anchorElement = await page.$('a') // or page.$<HTMLAnchorElement>('a') if using typescript
You can get the href property with the following
const href = anchorElement.evaluate(element => element.href)

Resources