My task is form submission with different data. So, I am using puppeteer and for of loop.
Code example:
const puppeteer = require('puppeteer')
const data = require('data.json') // ~30 products
(async () => {
try {
const browser = await puppeteer.launch({
headless: false,
ignoreHTTPSErrors: true,
defaultViewport: null,
});
const page2 = await browser.newPage();
await page2.goTo('mywebsite', {waitUntil: 'domcontentloaded'} )
for (let product of data) {
// Waiting for some selector, after that do something with it
await page2.waitForSelector("#someSelector", { visible: true });
await page2.type("#someSelector", product.someData);
//
//... A lot of code that similar to above is here
//
// Go back after all things done
await page2.waitFor(2000);
await page2.waitForSelector('[title="home"]', { visible: true });
await page2.click('[title="home"]', { clickCount: 1 });
counter++;
console.log(
`===========================================================================${counter}`
);
}
} catch (err) {
throw new Error(err);
}
})();
The problem is that this is works, but not always works, for example, a loop can work 15 times, and then fall off or go through a full cycle without failing.
The error is always the same:
UnhandledPromiseRejectionWarning: Error: TimeoutError: waiting for selector "#someSelector" failed: timeout 30000ms exceeded
However, if I check the page, then everything is there, the elements are on the page, but puppeteer does not see them. How Can I fix this?
My current solution for this is a retry function:
const chalk = require("chalk");
const util = require("util");
const delay = util.promisify(setTimeout);
async function retry(fn, retryDelay = 200, numRetries = 5) {
for (let i = 0; i < numRetries; i++) {
try {
return await fn();
} catch (err) {
console.log(
"$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
);
console.log(chalk.yellow(err));
if (i === numRetries - 1) throw err;
await delay(retryDelay);
retryDelay = retryDelay * 2;
}
}
}
Related
So I have the following code launching
this.browser = await puppeteer.launch( { headless: false, devtools: true, slowMo: 200});
this.page = await this.browser.newPage();
await this.page.goto(pageUrl);
let result = await this.page.evaluate(() => {
const labels = document.querySelectorAll("li.product-item");
let productList = [];
for(let product of labels) {
productList.push(product);
}
debugger;
//filter only product stacks that have a price
const productStacks = productList.filter(product => product.querySelector("span.price-wrapper") !== null);
let results = productStacks.map(product => {
return product.querySelector("span.price-wrapper").getAttribute("data-price-amount");
});
return results;
});
So chromium starts up and pauses at the appropriate code (as best as I can tell), I can even see the local variables populate with the expect result and step through the code, however the open file puppeteer_evaluation_script is not populated with the evaluation script and remains with the contents, so I'm stepping through blind.
//# sourceURL=__puppeteer_evaluation_script__
Occasionally after many minutes it sometimes does actually populate with the code. I have no idea what's wrong, I've tried updating the latest node lts and puppeteer but have the same behavior.
I don't know what causes this issue, but here is a pair of possible solutions.
To avoid getting:
//# sourceURL=__puppeteer_evaluation_script__
You can expose a function:
const puppeteer = require('puppeteer');
var browser = null;
var page = null;
(async () =>
{
browser = await puppeteer.launch(
{
headless: false,
devtools: true,
});
page = await browser.newPage();
await page.goto("https://google.com/");
// Expose a function
page.exposeFunction("nothing", () => null);
await page.evaluate(async function()
{
debugger;
console.log("Do task");
});
})();
In case that fails in the future. I made a wrapper for using eval() since the source appears using that.
It works with both sync and async functions and supports passing args and returning values.
function evaluateFixed(page, realFunction)
{
return page.evaluate(async function(realFunction, args)
{
var func = eval(`(${realFunction})`);
if(func.constructor.name === "AsyncFunction")
return await func(...args);
else
return func(...args);
},
realFunction.toString(), Array.from(arguments).slice(2));
}
(async () =>
{
browser = await puppeteer.launch(
{
headless: false,
devtools: true,
});
page = await browser.newPage();
await page.goto("https://google.com/");
console.log("Doing test");
let res = await evaluateFixed(page, async function(x, y, z)
{
debugger;
console.log("Do task", x, y, z);
function sleep(amount)
{
return new Promise((resolve) => setTimeout(resolve, amount));
}
for(let i = 0; i < 10; i++)
{
console.log("on seconds", i);
await sleep(1000);
}
return { "fee": "foo" };
}, 1, "two", { "three": 3});
console.log("Res 1", res);
res = await evaluateFixed(page, () =>
{
debugger;
return 1 + 2;
});
console.log("Res 2", res);
})();
I was experiencing the same problem.
I was thinking it could be the timeout to open external files was not sufficient.
So i have added the parameter 'slowMo: 1000' and it solved for me.
Good Luck.
For some reason my script will return once it reaches the "ticker" setinterval loop instead of doing whats inside of the loop and then returning.
Why is it returning early instead of doing the setinerval?
The javascript code
const puppeteer = require('puppeteer');
async function test(url3) {
let counter = 0;
try {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null
});
const page = await browser.newPage();
const url = 'https://yandex.com/images/';
await page.goto(url);
await page.evaluate(() => {
document.querySelector('.input__cbir-button').firstElementChild.click();
})
await page.focus('input[name="cbir-url"]')
await page.keyboard.type(url3)
await page.keyboard.press('Enter');
page
.on('response', response =>{
//console.log(`${response.url()}`)
if(`${response.url()}` == "https://yandex.com/clck/jclck/"){
counter++;
}
})
SetInterval function where it messes up
var ticker = setInterval(async () => {
if(counter === 2){
clearInterval(ticker)
if(page.$('.cbir-no-results__content' !== null)){
await browser.close();
return('no images found');
} else {
await page.screenshot({path: 'test.png', fullPage: true});
await browser.close();
}
}
})
} catch(err) {
//console.log(err)
}
}
Return
test("animage.jpg").then(ans => {
console.log(ans)
})
setInterval will always return a number: the id of the timer that you can then pass to clearInterval. What you likely want is to wrap the entire thing in a Promise which it then resolved when your condition is met:
// no need for async, we're explicitly returning a Promise
function test() {
return new Promise(resolve => {
const handle = setInterval(() => {
if (conditionMet) {
clearInterval(handle);
resolve();
}
}, 100); // or however often
});
}
This is one of the few times where it is correct to explicitly construct a Promise.
#!/usr/bin/env node
// vim: set noexpandtab tabstop=2:
const puppeteer = require('puppeteer');
const fs = require('fs').promises;
const cookies_json_file = process.argv[2];
const url = process.argv[3];
const timeout = parseInt(process.argv[4], 10);
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const cookiesString = await fs.readFile(cookies_json_file);
const cookies = JSON.parse(cookiesString);
await page.setCookie.apply(page, cookies);
try {
await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
const content = await page.content();
page.on('response', async response => {
if(response.url().startsWith('https://www.genecards.org/gene/api/data/Enhancers?geneSymbol=')) {
response.buffer().then(function(data) {
fs.writeFile('/dev/stdout', data);
});
}
});
const linkHandlers = await page.$x('//div[#data-ga-category = "GeneHancer"]//a[#data-role = "show-all"]');
if (linkHandlers.length > 0) {
await Promise.all([
linkHandlers[0].click()
, page.waitForNavigation({waitUntil: 'networkidle2', timeout: timeout})
]);
} else {
throw new Error("Link not found");
}
} catch (e) {
console.error(e);
process.exit(1);
} finally {
await browser.close();
}
})();
I have the above main.js.
$ ./main.js cookies.json 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=BSCL2' 30000
When I run using the above command, I got this error. Does anybody know how to fix the error? Thanks.
TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at /usr/local/lib/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/usr/local/lib/node_modules/puppeteer/lib/helper.js:110:27)
at Page.waitForNavigation (/usr/local/lib/node_modules/puppeteer/lib/Page.js:649:49)
at Page.<anonymous> (/usr/local/lib/node_modules/puppeteer/lib/helper.js:111:23)
at main.js:33:12
at processTicksAndRejections (internal/process/task_queues.js:89:5) {
name: 'TimeoutError'
}
```
You will need to disable timeout by setting the timeout parameter to 0. The default is 30 seconds (which you are also passing as a command line argument from your example), so this is behaving as expected and throwing an exception because the timeout exceeded the default or user supplied value of 30000 milliseconds.
page.waitForNavigation({waitUntil: 'networkidle2', timeout: 0})
You can also pass the parameter from the command line as an argument, which is preferable so as not to hard-code the value:
$ ./main.js cookies.json 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=BSCL2' 0
Using puppeteer to collect data from 2 different webpages into arrays for later comparison. However the program does not wait for the returned array before carrying forward.
async function go(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++){
const td = tds[i];
const tdcontent = await page.evaluate(td => td.innerText, td);
if (tdcontent.length > 5) {
data[i] = {"content": tdcontent};
}
}
return data;
} catch (e) {
console.log(e);
}
};
(async function main(){
const returnedData = await go();
console.log(returnedData.length);
})();
The return data.length is 0. New to nodejs, and async programming structure. I think it is because the .length is logged before the data is returned?
how do I return the data in a way where can manipulate it and complete my comparisons?
I try to not use page.$$ in such cases. Instead I use document.querySelectorAll and map thru the elements and extract the text.
Here is the modified code:
const getTdData = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("http://example.com");
return page.evaluate(() => {
// get all td elements
const tdList = [...document.querySelectorAll("td")];
return tdList.map(element => ({ content: element.textContent }));
});
} catch (e) {
console.log(e);
}
};
(async function main() {
const returnedData = await getTdData();
console.log(returnedData.length);
})();
First of all, you are missing an apostrophe in your page.$$() function. You should change this to:
const tds = await page.$$('td');
Next, you are trying to pass a non-existent variable to page.evaluate(). You can fix this by passing tds[i] instead of td:
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
Your final result should look something like this:
const go = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++) {
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
if (tdcontent.length > 5) {
data[i] = {
content: tdcontent,
};
}
}
return data;
} catch (error) {
console.log(error);
}
};
(async function main() {
const returnedData = await go();
console.log(returnedData.length);
})();
If you are are still experiencing issues, you may want to wait until the page has loaded completely using page.goto( ... , { waitUntil: 'networkidle0' }), or wait until the element in question has been added to the DOM using page.waitForSelector():
await page.goto('www.webpage.com' , {
waitUntil: 'networkidle0',
});
// ...
await page.waitForSelector('td');
Hi I am trying to make a screenshot service
const puppeteer = require('puppeteer');
var resWidth = 1366;
var resHeight = 1000;
var browser;
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a work I try to do
data.forEach(function(d){
try {
console.log(d["url"]);
(async () => {
var page = await browser.newPage();
await page.setViewport({width: resWidth, height: resHeight});
await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
await page.screenshot({path: './picdata/' + d['id'] + '.png' ,fullPage: true});
await page.close();
})();
} catch(e) {}
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot launching browser takes time and requires more memory?
what should I do?
The problem:
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
This code never gets executed. Why? because it's not a true closure.
More on closures, here.
That being said, that wont have work for your given scenario, as they are async tasks.
My try with your example:
'use strict';
const puppeteer = require('puppeteer');
const resWidth = 1366;
const resHeight = 1000;
let browser;
let page;
async function launchBrowser() {
browser = await puppeteer.launch({ headless: true }); //this "{ headless: true }" will ensure a browser window is not open a single time.
};
launchBrowser().then((x) => { // wait until browser has launched
data.forEach(async (d) => {
try {
page = await browser.newPage(); // the document is not very clear about this method, so I am still unsure if I could move this in the above launchBrowser() method.
await page.setViewport({ width: resWidth, height: resHeight });
await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
}
catch (e) {
console.log(e);
await browser.close(); // close browser if there is an error
}
});
})
.then(() => {
await browser.close(); // close browser finally.
});
Pro Tip: Start using let, const instead of var.
There is a great article on this, here