So I have the following code launching
this.browser = await puppeteer.launch( { headless: false, devtools: true, slowMo: 200});
this.page = await this.browser.newPage();
await this.page.goto(pageUrl);
let result = await this.page.evaluate(() => {
const labels = document.querySelectorAll("li.product-item");
let productList = [];
for(let product of labels) {
productList.push(product);
}
debugger;
//filter only product stacks that have a price
const productStacks = productList.filter(product => product.querySelector("span.price-wrapper") !== null);
let results = productStacks.map(product => {
return product.querySelector("span.price-wrapper").getAttribute("data-price-amount");
});
return results;
});
So chromium starts up and pauses at the appropriate code (as best as I can tell), I can even see the local variables populate with the expect result and step through the code, however the open file puppeteer_evaluation_script is not populated with the evaluation script and remains with the contents, so I'm stepping through blind.
//# sourceURL=__puppeteer_evaluation_script__
Occasionally after many minutes it sometimes does actually populate with the code. I have no idea what's wrong, I've tried updating the latest node lts and puppeteer but have the same behavior.
I don't know what causes this issue, but here is a pair of possible solutions.
To avoid getting:
//# sourceURL=__puppeteer_evaluation_script__
You can expose a function:
const puppeteer = require('puppeteer');
var browser = null;
var page = null;
(async () =>
{
browser = await puppeteer.launch(
{
headless: false,
devtools: true,
});
page = await browser.newPage();
await page.goto("https://google.com/");
// Expose a function
page.exposeFunction("nothing", () => null);
await page.evaluate(async function()
{
debugger;
console.log("Do task");
});
})();
In case that fails in the future. I made a wrapper for using eval() since the source appears using that.
It works with both sync and async functions and supports passing args and returning values.
function evaluateFixed(page, realFunction)
{
return page.evaluate(async function(realFunction, args)
{
var func = eval(`(${realFunction})`);
if(func.constructor.name === "AsyncFunction")
return await func(...args);
else
return func(...args);
},
realFunction.toString(), Array.from(arguments).slice(2));
}
(async () =>
{
browser = await puppeteer.launch(
{
headless: false,
devtools: true,
});
page = await browser.newPage();
await page.goto("https://google.com/");
console.log("Doing test");
let res = await evaluateFixed(page, async function(x, y, z)
{
debugger;
console.log("Do task", x, y, z);
function sleep(amount)
{
return new Promise((resolve) => setTimeout(resolve, amount));
}
for(let i = 0; i < 10; i++)
{
console.log("on seconds", i);
await sleep(1000);
}
return { "fee": "foo" };
}, 1, "two", { "three": 3});
console.log("Res 1", res);
res = await evaluateFixed(page, () =>
{
debugger;
return 1 + 2;
});
console.log("Res 2", res);
})();
I was experiencing the same problem.
I was thinking it could be the timeout to open external files was not sufficient.
So i have added the parameter 'slowMo: 1000' and it solved for me.
Good Luck.
Related
This question already has answers here:
Puppeteer - scroll down until you can't anymore
(11 answers)
Closed 2 years ago.
I am trying to scrape a website which has infinite scrolling.
I am controlling the scroll but still, it exits after reaching at the end of the webpage.
This is my code:
const puppeteer = require("puppeteer");
module.exports.scraper = async (url, callBack) => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setUserAgent(
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
);
await page.setViewport({ width: 1200, height: 768 });
function wait(ms) {
return new Promise((resolve) => setTimeout(() => resolve(), ms));
}
await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
waitUntil: "networkidle0",
});
// Get the height of the rendered page
const bodyHandle = await page.$("body");
const { height } = await bodyHandle.boundingBox();
await bodyHandle.dispose();
// Scroll one viewport at a time, pausing to let content load
const viewportHeight = page.viewport().height;
let viewportIncr = 0;
while (viewportIncr + viewportHeight < height) {
await page.evaluate((_viewportHeight) => {
window.scrollBy(0, _viewportHeight);
}, viewportHeight);
await wait(1600);
viewportIncr = viewportIncr + viewportHeight;
}
let data = await page.evaluate(() => {
window.scrollTo(0, 0);
let products = [];
let productElements = document.querySelectorAll(".product-wrap");
productElements.forEach((productElement) => {
let productJson = {};
try {
productJson.imageUrl = productElement.querySelector(".renderedImg").src;
productJson.brandName = productElement.querySelector(
".brand-name",
).innerText;
} catch (e) {
console.log(e);
}
products.push(productJson);
});
return products;
});
await wait(100);
callBack(data, true);
await browser.close();
};
How to scrape in such situation?
Here's one strategy to handle infinite scrolling. It repeats a scroll/compare in a loop until scrolling has no effect. i.e. when we tell it to scroll, but we're still at the same scrollTop value we were last iteration, consider it done. In extreme cases the browser will eventually run out of heap memory and crash, but this is our starting point for the average site:
const puppeteer = require('puppeteer');
const url = 'https://example.com';
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
page.on('console', async msg => {
const args = msg.args();
const vals = [];
for (let i = 0; i < args.length; i++) {
vals.push(await args[i].jsonValue());
}
console.log(vals.join('\t'));
});
await page.goto(url);
await page.evaluate(()=> {
const wait = (duration) => {
console.log('waiting', duration);
return new Promise(resolve => setTimeout(resolve, duration));
};
(async () => {
window.atBottom = false;
const scroller = document.documentElement; // usually what you want to scroll, but not always
let lastPosition = -1;
while(!window.atBottom) {
scroller.scrollTop += 1000;
// scrolling down all at once has pitfalls on some sites: scroller.scrollTop = scroller.scrollHeight;
await wait(300);
const currentPosition = scroller.scrollTop;
if (currentPosition > lastPosition) {
console.log('currentPosition', currentPosition);
lastPosition = currentPosition;
}
else {
window.atBottom = true;
}
}
console.log('Done!');
})();
});
await page.waitForFunction('window.atBottom == true', {
timeout: 900000,
polling: 1000 // poll for finish every second
});
await page.screenshot({path: 'example.png', fullPage: true});
await browser.close();
})();
My task is form submission with different data. So, I am using puppeteer and for of loop.
Code example:
const puppeteer = require('puppeteer')
const data = require('data.json') // ~30 products
(async () => {
try {
const browser = await puppeteer.launch({
headless: false,
ignoreHTTPSErrors: true,
defaultViewport: null,
});
const page2 = await browser.newPage();
await page2.goTo('mywebsite', {waitUntil: 'domcontentloaded'} )
for (let product of data) {
// Waiting for some selector, after that do something with it
await page2.waitForSelector("#someSelector", { visible: true });
await page2.type("#someSelector", product.someData);
//
//... A lot of code that similar to above is here
//
// Go back after all things done
await page2.waitFor(2000);
await page2.waitForSelector('[title="home"]', { visible: true });
await page2.click('[title="home"]', { clickCount: 1 });
counter++;
console.log(
`===========================================================================${counter}`
);
}
} catch (err) {
throw new Error(err);
}
})();
The problem is that this is works, but not always works, for example, a loop can work 15 times, and then fall off or go through a full cycle without failing.
The error is always the same:
UnhandledPromiseRejectionWarning: Error: TimeoutError: waiting for selector "#someSelector" failed: timeout 30000ms exceeded
However, if I check the page, then everything is there, the elements are on the page, but puppeteer does not see them. How Can I fix this?
My current solution for this is a retry function:
const chalk = require("chalk");
const util = require("util");
const delay = util.promisify(setTimeout);
async function retry(fn, retryDelay = 200, numRetries = 5) {
for (let i = 0; i < numRetries; i++) {
try {
return await fn();
} catch (err) {
console.log(
"$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
);
console.log(chalk.yellow(err));
if (i === numRetries - 1) throw err;
await delay(retryDelay);
retryDelay = retryDelay * 2;
}
}
}
For some reason my script will return once it reaches the "ticker" setinterval loop instead of doing whats inside of the loop and then returning.
Why is it returning early instead of doing the setinerval?
The javascript code
const puppeteer = require('puppeteer');
async function test(url3) {
let counter = 0;
try {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null
});
const page = await browser.newPage();
const url = 'https://yandex.com/images/';
await page.goto(url);
await page.evaluate(() => {
document.querySelector('.input__cbir-button').firstElementChild.click();
})
await page.focus('input[name="cbir-url"]')
await page.keyboard.type(url3)
await page.keyboard.press('Enter');
page
.on('response', response =>{
//console.log(`${response.url()}`)
if(`${response.url()}` == "https://yandex.com/clck/jclck/"){
counter++;
}
})
SetInterval function where it messes up
var ticker = setInterval(async () => {
if(counter === 2){
clearInterval(ticker)
if(page.$('.cbir-no-results__content' !== null)){
await browser.close();
return('no images found');
} else {
await page.screenshot({path: 'test.png', fullPage: true});
await browser.close();
}
}
})
} catch(err) {
//console.log(err)
}
}
Return
test("animage.jpg").then(ans => {
console.log(ans)
})
setInterval will always return a number: the id of the timer that you can then pass to clearInterval. What you likely want is to wrap the entire thing in a Promise which it then resolved when your condition is met:
// no need for async, we're explicitly returning a Promise
function test() {
return new Promise(resolve => {
const handle = setInterval(() => {
if (conditionMet) {
clearInterval(handle);
resolve();
}
}, 100); // or however often
});
}
This is one of the few times where it is correct to explicitly construct a Promise.
I ran the following and it appears to gather a large number of links, however on actual inspection of the site with collectLinks1 I get all valid links, but with collectLinks2 I got 59 iterations of http://pieroxy.net/blog/2014/11/18/[
I'm new to Puppeteer and I can't find out why with collectLinks2 I don't get the links.
const { parse, resolve } = require('url');
const trim = require('lodash/trim');
const startsWith = require('lodash/startsWith');
const includes = require('lodash/includes');
// https://github.com/GoogleChrome/puppeteer
const puppeteer = require('puppeteer');
// https://github.com/gwuhaolin/chrome-finder
const findChrome = require('chrome-finder');
function resolveUrl(url, baseUrl) {
url = trim(url);
if (!url) return null;
if (startsWith(url, '#')) return null;
const { protocol } = parse(url);
if (includes(['http:', 'https:'], protocol)) {
return url.split('#')[0];
} if (!protocol) {
return resolve(baseUrl, url).split('#')[0];
}
return null;
}
async function collectLinks1(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
const assetUrls = await htmlPage.$$eval('a[href]', assetLinks => assetLinks.map(link => link.href));
assetUrls.forEach(link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
return links;
}
async function collectLinks2(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
await htmlPage.exposeFunction('pushToLinks', link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
await htmlPage.evaluate(() => {
function findLinks(document) {
document.querySelectorAll('a[href]')
.forEach(link => {
window.pushToLinks(link.href);
});
}
findLinks(window.document);
});
return links;
}
const crawl = async url => {
try {
console.log(`Crawling ${url}`);
const browser = await puppeteer.launch({
headless: false,
executablePath: findChrome(),
});
const page = await browser.newPage();
await page.goto(url);
// OK
const links1 = await collectLinks1(page);
links1.forEach(link => { console.log(link); });
// KO
const links2 = await collectLinks2(page);
links2.forEach(link => { console.log(link); });
await browser.close();
} catch (err) {
console.log(err);
}
};
crawl('http://pieroxy.net/blog/2014/11/18/user_agent_detection_in_java.html');
You need to await the function defined via page.exposeFunction as it returns a Promise. As you are only calling the function but not awaiting its result, your page.evaluate call will resolve before your script finished executing.
Solution
Instead of the forEach, you should use a loop to iterate over all the items and communicate them to the page one after another.
async function collectLinks2(htmlPage) {
// ...
await htmlPage.evaluate(async () => {
async function findLinks(document) {
for (const link of document.querySelectorAll('a[href]')) {
await window.pushToLinks(link.href);
}
}
await findLinks(window.document);
});
return links;
}
Using puppeteer to collect data from 2 different webpages into arrays for later comparison. However the program does not wait for the returned array before carrying forward.
async function go(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++){
const td = tds[i];
const tdcontent = await page.evaluate(td => td.innerText, td);
if (tdcontent.length > 5) {
data[i] = {"content": tdcontent};
}
}
return data;
} catch (e) {
console.log(e);
}
};
(async function main(){
const returnedData = await go();
console.log(returnedData.length);
})();
The return data.length is 0. New to nodejs, and async programming structure. I think it is because the .length is logged before the data is returned?
how do I return the data in a way where can manipulate it and complete my comparisons?
I try to not use page.$$ in such cases. Instead I use document.querySelectorAll and map thru the elements and extract the text.
Here is the modified code:
const getTdData = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("http://example.com");
return page.evaluate(() => {
// get all td elements
const tdList = [...document.querySelectorAll("td")];
return tdList.map(element => ({ content: element.textContent }));
});
} catch (e) {
console.log(e);
}
};
(async function main() {
const returnedData = await getTdData();
console.log(returnedData.length);
})();
First of all, you are missing an apostrophe in your page.$$() function. You should change this to:
const tds = await page.$$('td');
Next, you are trying to pass a non-existent variable to page.evaluate(). You can fix this by passing tds[i] instead of td:
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
Your final result should look something like this:
const go = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++) {
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
if (tdcontent.length > 5) {
data[i] = {
content: tdcontent,
};
}
}
return data;
} catch (error) {
console.log(error);
}
};
(async function main() {
const returnedData = await go();
console.log(returnedData.length);
})();
If you are are still experiencing issues, you may want to wait until the page has loaded completely using page.goto( ... , { waitUntil: 'networkidle0' }), or wait until the element in question has been added to the DOM using page.waitForSelector():
await page.goto('www.webpage.com' , {
waitUntil: 'networkidle0',
});
// ...
await page.waitForSelector('td');