Console.log page content before exit - node.js

Sometimes my Puppeteer program hangs forever because of an unhandled promise, so I figured I'd start the program with a setTimeout that closes it.
var page;
var content;
const maxExecTime = 1000 * 60 * 5; // 5 min

setTimeout(function () {
  content = page.content();
  console.log('TIMEOUT');
  console.log(content);
  process.exit()
}, maxExecTime);

(async () => {
  const browser = await puppeteer.launch();
  page = await browser.newPage();
  await page.goto('URL');
  ...
  await browser.close();
})();
My goal would be to console.log the last content of the page before the program closes, but the setTimeout is outside the async function so I can't use await, and I got:
Promise { <pending> }
How can I achieve my goal? Thanks.

You forgot to await page.content() in the timeout callback, and each function that uses await needs its own async keyword.
var page;
var content;
const maxExecTime = 1000 * 60 * 5; // 5 min

setTimeout(async function () {
  content = await page.content();
  console.log('TIMEOUT');
  console.log(content);
  process.exit()
}, maxExecTime);

(async () => {
  const browser = await puppeteer.launch();
  page = await browser.newPage();
  await page.goto('URL');
  ...
  await browser.close();
})();
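A slightly more defensive variant of the same idea (just a sketch) guards against page still being undefined when the timer fires, and cancels the timer once the normal run finishes:
const puppeteer = require('puppeteer');

var page;
const maxExecTime = 1000 * 60 * 5; // 5 min

const watchdog = setTimeout(async function () {
  console.log('TIMEOUT');
  if (page) {
    // page.content() resolves with the full HTML of the current page
    console.log(await page.content());
  }
  process.exit();
}, maxExecTime);

(async () => {
  const browser = await puppeteer.launch();
  page = await browser.newPage();
  await page.goto('URL');
  // ...
  await browser.close();
  clearTimeout(watchdog); // normal run finished, the watchdog is no longer needed
})();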

Related

How to get the number of children inside an element using an XPath query in Puppeteer?

I need to get the number of children inside an element with the page.$x(xpath) function in Puppeteer, but I was not successful. In the browser console I can do it using $x().children.length. What would be the best way to do this in Puppeteer with Node?
You can use either the element/JS handle API or the pure Web API in page.evaluate(). These are both ways:
const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto('https://example.org/');

    const [element] = await page.$x('//body/div');
    const children = await element.getProperty('children');
    const length1 = await (await children.getProperty('length')).jsonValue();
    console.log(length1); // 3

    const length2 = await page.evaluate(() => {
      return document.evaluate(
        '//body/div', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE
      ).singleNodeValue.children.length;
    });
    console.log(length2); // 3

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
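A slightly shorter variant of the handle-based approach (just a sketch, reusing the same element handle) is to pass the handle directly into page.evaluate(), since element handles can be used as arguments there:
const [element] = await page.$x('//body/div');
// the element handle is received inside the page context as the DOM element itself
const length3 = await page.evaluate(el => el.children.length, element);
console.log(length3); // 3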

Puppeteer - How to wait for an img after changing innerHTML?

I would like to add an image before a screenshot with puppeteer.
The following code works but instead of waiting like this, I would like to wait until the img is here :
element.innerHTML = "<img id=\"logo_website\" src=\"http://random.com/logo.jpg\">";
await page.waitFor(2000)
I tried with the following "waitFor" but it doesn't work.
await page.waitFor("#logo_website")
You can try page.waitForResponse() in this way:
'use strict';

const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto('https://example.org/');

    await Promise.all([
      page.waitForResponse('https://www.iana.org/_img/2013.1/iana-logo-header.svg'),
      page.evaluate(() => {
        document.body.innerHTML = '<img id="logo_website" src="https://www.iana.org/_img/2013.1/iana-logo-header.svg">';
      }),
    ]);

    await page.screenshot({ path: 'scr.png' });

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
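If you would rather not hard-code the image URL, an alternative (a minimal sketch, assuming the same #logo_website id) is to wait until the inserted img reports that it has fully loaded:
// resolves once the img exists, has finished loading, and decoded to a non-empty bitmap
await page.waitForFunction(() => {
  const img = document.querySelector('#logo_website');
  return img !== null && img.complete && img.naturalWidth > 0;
});
await page.screenshot({ path: 'scr.png' });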

Ensure an element exists on the page, to verify the user is logged in (or any other action)

I'm using Puppeteer with Node.js to write a script that logs in. The login itself works, but I need to verify it succeeded by checking that a certain element is present: if the element is present, focus on it; if it's not present, print "element is not present".
This is the script I've written:
const puppeteer = require('puppeteer');

async function log_in() {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--Window-size=1929,1170', '--Window-position=0,0']
  });
  const page = await browser.newPage();
  await page.setViewport({ 'width': 1366, 'height': 768 });
  await page.goto(URL, { waitUntil: 'networkidle2' });
  await page.click('.fancybox-item');
  await delay(1000);

  // function for waiting
  function delay(time) {
    return new Promise(function (resolve) {
      setTimeout(resolve, time)
    });
  }

  const UserName = 'xxxxxxxxxx';
  const Password = '222222';

  page.click('selector');
  await delay(1000);

  // Focus on user name
  await page.focus('selector');
  await delay(2000);
  await page.type('selector', UserName);
  await delay(2000);

  // Focus on password
  await page.focus('selector');
  await page.type('selector', Password);

  // Clicking the link will indirectly cause a navigation
  page.click('selector');
  await delay(5000);

  if (await page.waitForSelector('selector')) {
    console.log("found")
  } else console.log("not found");

  await delay(5000);
}

log_in();
Note: the code runs successfully, but it doesn't print anything to the console output.
You can use page.$(selector). It returns an element handle, or null if nothing matches the selector. You probably also need to wait for navigation after you click the submit button, using page.waitForNavigation([options]).
...
// Clicking the link will indirectly cause a navigation
await page.click('selector')
await page.waitForNavigation()

const element = await page.$('selector')
if (element) {
  console.log('found')
} else {
  console.log('not found')
}
...
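A small caveat with clicking and then awaiting page.waitForNavigation() as two separate steps is that a fast navigation can finish before the listener is registered. A common pattern (sketched here with the same placeholder selector) is to set both up together:
await Promise.all([
  page.waitForNavigation(),   // start listening for the navigation first
  page.click('selector'),     // then trigger it with the click
]);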
First of all, I'm afraid you lack some experience with asynchronous JavaScript. It's crucial to understand this concept, since most of Puppeteer's functions are async; the same goes for other libraries and tools like Selenium.
Try to learn more about it here: https://developer.mozilla.org/en-US/docs/Learn/JavaScript/Asynchronous
Answering your problem: I think it'd be better to use waitForSelector() instead of document.querySelector (page.$), as @drobnikj suggested. First of all, waitForSelector() waits for the element, so you can handle elements that might load asynchronously. That might not be the case on your page, but in general it's a better approach, I believe. All you need to do is handle a potential exception.
My proposition is:
const puppeteer = require('puppeteer');

let browser;
let page;

async function logIn(loginPageUrl, userName, password) {
  // fill in the selectors for your page
  const fancyBoxItemSelector = '.fancybox-item';
  const loginInputSelector = '';
  const passwordInputSelector = '';
  const loginRedirectionLink = '';
  const expectedSelectorAfterLogin = '';

  await page.goto(loginPageUrl, { waitUntil: 'networkidle2' });
  await page.waitForSelector(fancyBoxItemSelector);
  await page.click(fancyBoxItemSelector);

  await page.waitForSelector(loginInputSelector);
  await page.click(loginInputSelector);
  await page.type(loginInputSelector, userName);

  await page.waitForSelector(passwordInputSelector);
  await page.click(passwordInputSelector);
  await page.type(passwordInputSelector, password);

  await page.waitForSelector(loginRedirectionLink);
  const response = page.waitForNavigation({ waitUntil: 'networkidle2' });
  await page.click(loginRedirectionLink);
  await response;

  try {
    await page.waitForSelector(expectedSelectorAfterLogin);
    console.log('Selector found!');
  } catch (err) {
    console.log('Selector not found!');
  }
}

(async () => {
  browser = await puppeteer.launch({
    headless: false,
    args: ['--window-size=1929,1170', '--window-position=0,0']
  });
  page = await browser.newPage();
  await logIn('', '', '');
})();
I allowed myself to polish your code. I hope it's clear what I have done here.

Is there a way to get puppeteer's waitUntil "networkidle" to only consider XHR (ajax) requests?

I am using puppeteer to evaluate the javascript-based HTML of web pages in my test app.
This is the line I am using to make sure all the data is loaded:
await page.setRequestInterception(true);

page.on("request", (request) => {
  if (request.resourceType() === "image" || request.resourceType() === "font" || request.resourceType() === "media") {
    console.log("Request intercepted! ", request.url(), request.resourceType());
    request.abort();
  } else {
    request.continue();
  }
});

try {
  await page.goto(url, { waitUntil: ['networkidle0', 'load'], timeout: requestCounterMaxWaitMs });
} catch (e) {
}
Is this the best way to wait for ajax requests to be completed?
It feels right but I'm not sure if I should use networkidle0, networkidle1, etc?
You can use pending-xhr-puppeteer, a lib that exposes a promise that resolves once all pending XHR requests have finished.
Use it like this:
const puppeteer = require('puppeteer');
const { PendingXHR } = require('pending-xhr-puppeteer');

(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    args: [], // extra Chromium flags, if any
  });

  const page = await browser.newPage();
  const pendingXHR = new PendingXHR(page);
  await page.goto(`http://page-with-xhr`);
  // Here all xhr requests are not finished
  await pendingXHR.waitForAllXhrFinished();
  // Here all xhr requests are finished
})();
DISCLAIMER: I am the maintainer of pending-xhr-puppeteer
XHRs, by their nature, can fire later in the app's lifetime. networkidle0 will not help you if the app sends an XHR after, for example, 1 second and you want to wait for it. I think if you want to do this "properly" you should know which requests you are waiting for and await them.
Here is an example where the XHRs occur later in the app and the script waits for all of them:
const puppeteer = require('puppeteer');

const html = `
<html>
  <body>
    <script>
      setTimeout(() => {
        fetch('https://swapi.co/api/people/1/');
      }, 1000);

      setTimeout(() => {
        fetch('https://www.metaweather.com/api/location/search/?query=san');
      }, 2000);

      setTimeout(() => {
        fetch('https://api.fda.gov/drug/event.json?limit=1');
      }, 3000);
    </script>
  </body>
</html>`;

// you can listen for only some of the requests;
// in this example I'm waiting for all of them
const requests = [
  'https://swapi.co/api/people/1/',
  'https://www.metaweather.com/api/location/search/?query=san',
  'https://api.fda.gov/drug/event.json?limit=1'
];

const waitForRequests = (page, names) => {
  const requestsList = [...names];
  return new Promise(resolve =>
    page.on('request', request => {
      if (request.resourceType() === 'xhr') {
        // check if the request is in the observed list
        const index = requestsList.indexOf(request.url());
        if (index > -1) {
          requestsList.splice(index, 1);
        }
        // if all requests are fulfilled
        if (!requestsList.length) {
          resolve();
        }
      }
      request.continue();
    })
  );
};

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.setRequestInterception(true);

  // register the page.on('request') observer
  const observedRequests = waitForRequests(page, requests);

  // await is omitted here on purpose, because you only want to wait
  // for the XHRs (ajax), not for the navigation itself
  page.goto(`data:text/html,${html}`);
  console.log('before xhr');

  // wait for all observed requests
  await observedRequests;
  console.log('after all xhr');

  await browser.close();
})();
I agree with the sentiment in this answer that waiting for all network activity to cease ("all the data is loaded") is a rather ambiguous concept that is entirely dependent on the behavior of the website you're scraping.
Options for detecting responses include waiting a fixed duration, a fixed duration after network traffic idles, for a specific response (or set of responses), for an element to appear on the page, for a predicate to return true, etc, all of which Puppeteer supports.
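For reference, the element- and predicate-based options from that list look roughly like this (a minimal sketch; the selector and the predicate are placeholders, not part of the original example):
// wait for an element that only appears once the data has been rendered
await page.waitForSelector('#some-result');

// or wait for an arbitrary predicate evaluated in the page context
await page.waitForFunction(() => document.querySelectorAll('.item').length > 10);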
With this in mind, the most typical scenario is that you're waiting for some particular response or set of responses from known (or partially-known, using some pattern or prefix) resource URL(s) that will deliver a payload you want to read and/or trigger a DOM interaction you need to detect. Puppeteer offers page.waitForResponse for doing just this.
Here's an example, building on an existing answer (and showing how to retrieve the data from the responses while we're at it):
const puppeteer = require("puppeteer");
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/1");
}, 1000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/2");
}, 2000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/3");
}, 3000);
setTimeout(() => {
// fetch something irrelevant to us
fetch("http://jsonplaceholder.typicode.com/users/4");
}, 0);
</script>
</body>
</html>`;
(async () => {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.setContent(html);
const expectedUrls = [
"http://jsonplaceholder.typicode.com/users/1",
"http://jsonplaceholder.typicode.com/users/2",
"http://jsonplaceholder.typicode.com/users/3",
];
try {
const responses = await Promise.all(expectedUrls.map(url =>
page.waitForResponse(
response => response.url() === url,
{timeout: 5000}
)
));
const data = await Promise.all(
responses.map(response => response.json())
);
console.log(data);
}
catch (err) {
console.error(err);
}
await browser.close();
})()

Use the same browser instance?

Hi, I am trying to make a screenshot service:
const puppeteer = require('puppeteer');

var resWidth = 1366;
var resHeight = 1000;
var browser;

(async () => {
  browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a job I try to do:
data.forEach(function(d){
  try {
    console.log(d["url"]);
    (async () => {
      var page = await browser.newPage();
      await page.setViewport({width: resWidth, height: resHeight});
      await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
      await page.screenshot({path: './picdata/' + d['id'] + '.png', fullPage: true});
      await page.close();
    })();
  } catch(e) {}
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot, since launching a browser takes time and requires more memory.
What should I do?
The problem:
(async () => {
  browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
This code never gets executed. Why? Because the async arrow function is only defined, never invoked: the trailing () is missing, so it is not a self-invoking (IIFE) expression.
Even if you did invoke it, that alone would not work for your scenario, because launching the browser is an async task and the forEach below can run before the browser is ready.
My try with your example:
'use strict';

const puppeteer = require('puppeteer');

const resWidth = 1366;
const resHeight = 1000;

let browser;

async function launchBrowser() {
  browser = await puppeteer.launch({ headless: true }); // headless: true ensures no browser window is ever opened
}

launchBrowser().then(async () => { // wait until the browser has launched
  for (const d of data) { // process the jobs one after another
    try {
      const page = await browser.newPage(); // a fresh page (tab) for every screenshot
      await page.setViewport({ width: resWidth, height: resHeight });
      await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
      await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
      await page.close();
    } catch (e) {
      console.log(e);
    }
  }
  await browser.close(); // close the browser once all screenshots are done
});
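If the URLs are independent and you would rather take the screenshots concurrently (still reusing the single browser instance), a rough variation of the same loop, assuming the same data array, could look like this:
launchBrowser().then(async () => {
  // one tab per URL, all sharing the same browser instance
  await Promise.all(data.map(async (d) => {
    const page = await browser.newPage();
    try {
      await page.setViewport({ width: resWidth, height: resHeight });
      await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
      await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
    } catch (e) {
      console.log(e);
    } finally {
      await page.close();
    }
  }));
  await browser.close(); // every tab has finished
});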
Pro Tip: Start using let, const instead of var.
