I am using some simple code to implement e2e tests with jest-cucumber.
/**
 * Registers the `I open "<url>"` step.
 *
 * @param {Function} given - step registrar; it is called with the step
 *   pattern and an async handler that receives the captured URL.
 */
export const givenOpenUrl = (given) => {
  const openUrlPattern = /^I open "(.*)"$/;

  // Navigates the shared `page` to the URL captured from the step text.
  const openUrl = async (targetUrl) => {
    await page.goto(`${targetUrl}`);
  };

  given(openUrlPattern, openUrl);
};
/**
 * Registers the `I see "<text>" in the title` step.
 *
 * @param {Function} then - step registrar; it is called with the step
 *   pattern and an async handler that receives the expected title text.
 */
export const thenMatchPageTitle = (then) => {
  const titleStepPattern = /^I see "(.*)" in the title$/;

  // Asserts that the title promise of the shared `page` resolves to a match.
  const assertTitle = async (expectedTitle) => {
    const pendingTitle = page.title();
    await expect(pendingTitle).resolves.toMatch(expectedTitle);
  };

  then(titleStepPattern, assertTitle);
};
This code fails as well:
// Suite: the Nozzle landing page should expose "Nozzle" in its title.
describe('Nozzle AI', () => {
  const landingPage = 'https://nozzle.ai';
  const navigationOptions = {waitUntil: 'domcontentloaded'};

  // Load the page once before the assertions run.
  beforeAll(async () => {
    await page.goto(landingPage, navigationOptions);
  });

  it('should be titled "Nozzle"', async () => {
    const pendingTitle = page.title();
    await expect(pendingTitle).resolves.toMatch('Nozzle');
  });
});
However, I get different behaviors based on the URL that is passed.
For example, the tests are passing if I use https://www.google.com/ and failing when I use https://www.nozzle.ai/ with the following error:
Navigation failed because the browser has disconnected!
It seems that the expect(page.title()).resolves assertion fails with this error:
Expected an Error, but "" was thrown
I noticed that I am able to run this code with success :
// Standalone check: open the page in a fresh browser and print its title.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://nozzle.ai', {
      waitUntil: 'networkidle2',
    });
    const title = await page.title();
    console.log(title);
  } finally {
    // Release the browser even when navigation or the title lookup throws.
    await browser.close();
  }
})().catch((err) => {
  // Report failures instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
Any suggestions would be highly appreciated!
Related
I have 2 test examples and when I run them both at once it seems that there is no time to execute them both till end. When I run them separately, they go through till the end.
I've read that Puppeteer-cluster can help with running multiple tests at once, but the process stops right after the page.goto() call. I'm not sure if this is the right approach to my issue, so feel free to offer solutions other than Puppeteer-cluster.
test1:
const { Cluster } = require('puppeteer-cluster');
const timeout = 100000
const { toMatchImageSnapshot } = require('jest-image-snapshot')
expect.extend({ toMatchImageSnapshot })

// Logs in through the login form inside a puppeteer-cluster task and compares
// screenshots of the filled form and of the logged-in page against snapshots.
describe('login', () => {
  test('test user login', async () => {
    await page.goto(URL + 'login.jsp', { waitUntil: 'domcontentloaded' });
    const cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 2,
    });
    try {
      await cluster.task(async ({ page, data: url }) => {
        await page.goto(URL + url, { waitUntil: 'domcontentloaded' });
        await page.waitForSelector('input[name=username]')
        await page.type('input[name=username]', username)
        await page.type('input[name=password]', password)
        const loginFormFilled = await page.screenshot();
        expect(loginFormFilled).toMatchImageSnapshot({
          // Thresholds are numeric ratios, not strings.
          failureThreshold: 0.01,
          failureThresholdType: 'percent'
        });
        await page.waitForSelector('.roundButton')
        // Click the button whose label is exactly "Prijavi se", if present.
        await page.evaluate(() => {
          const button = [...document.querySelectorAll('.roundButton')].find(element => element.textContent === 'Prijavi se');
          if (button) {
            button.click();
          }
        });
        await page.waitForSelector(".profilePic")
        const image = await page.screenshot();
        expect(image).toMatchImageSnapshot({
          failureThreshold: 0.10,
          failureThresholdType: 'percent'
        });
      });
      // execute() rejects when the task throws, so a failed expectation or a
      // timeout inside the task fails this test. queue() would only emit a
      // 'taskerror' event and the test would end as passed.
      await cluster.execute('login.jsp');
      await cluster.idle();
    } finally {
      // Always shut the cluster down, even when the task failed.
      await cluster.close();
    }
  }, timeout);
});
The second test is almost the same just instead of login I'm testing registration process.
I've tried the same examples as here https://www.npmjs.com/package/puppeteer-cluster but the test stops right after page.goto and ends as passed test.
In the near future I'll have 30-40+ tests similar to test1, and I need to run them with one command instead of one by one.
Can someone help me understand why doesn't the product data get printed out? I'm currently using puppeteer to scrape a website for product data.
const puppeteer = require("puppeteer");
// Opens the product page, reads the brand name in the page context and
// prints the collected data on the Node.js side.
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    //link to page that i want to scrape
    await page.goto(
      "link link",
      { waitUntil: "networkidle2" }
    );
    const data = await page.evaluate(() => {
      // This callback runs inside the browser, so this console.log goes to
      // the page's console, not to the Node.js terminal.
      const brandNode = document.querySelector("a.designer-name > span");
      console.log("mimo");
      // Report a missing element as null instead of throwing on `.textContent`.
      return { brand: brandNode ? brandNode.textContent : null };
    });
    console.log(data);
  } catch (err) {
    console.log(err);
  } finally {
    // Close the browser on both the success and the failure path.
    await browser.close();
  }
})();
You are using promises and callbacks together. If you instead return a promise from the callback passed to page.evaluate, it should work.
thanks to #tehhowch.
// Read the brand name in the page context; the evaluate callback hands the
// result back to Node.js by resolving a promise with it.
var data = await page
  .evaluate(async () => {
    const brandSelector = "a.designer-name > span";
    return await new Promise(done => {
      const brand = document.querySelector(brandSelector).textContent;
      console.log("mimo");
      // Resolving here is what returns the data from the browser to Node.js.
      done({ brand });
    });
  })
  .catch(error => {
    console.log(error);
  });
console.log(data);
I've written a script in node.js to scrape the links of different titles from a webpage. When I execute my following script, I get undefined printed in the console instead of the links I'm after. My defined selectors are accurate.
I do not wish to put the links in an array and return the results; rather, I wish to print them on the fly. As I'm very new to write scripts using node.js in combination with puppeteer, I can't figure out the mistake I'm making.
This is my script (Link to that site):
const puppeteer = require('puppeteer');
/**
 * Opens the web-scraping tag page and logs every question link from inside
 * the page context.
 *
 * @returns {Promise<undefined>} resolves with the value returned by the
 *   evaluate callback, which returns nothing; rejects when any browser
 *   operation fails.
 */
async function run () {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
    // The callback runs inside the browser: its console.log output does not
    // reach the Node.js console and it has no return value.
    const url = await page.evaluate(() => {
      let items = document.querySelectorAll('a.question-hyperlink');
      items.forEach((item) => {
        //would like to keep the following line intact
        console.log(item.getAttribute('href'));
      });
    });
    return url;
  } finally {
    // Close the browser on both the success and the failure path.
    await browser.close();
  }
}
run().then(console.log).catch(console.error);
The following script works just fine if I declare an empty array results, store the scraped links in it and finally return the results, but I do not wish to do it like this. I would like to stick to the way I tried above, i.e. printing the results on the fly.
const puppeteer = require('puppeteer');
/**
 * Opens the web-scraping tag page and collects the href of every question
 * link.
 *
 * @returns {Promise<Array<{url: string}>>} one `{ url }` entry per matched
 *   link; rejects when any browser operation fails.
 */
async function run () {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
    // Runs inside the browser; the returned array is handed back to Node.js.
    const urls = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      return Array.from(items, (item) => ({
        url: item.getAttribute('href'),
      }));
    });
    return urls;
  } finally {
    // Close the browser on both the success and the failure path.
    await browser.close();
  }
}
run().then(console.log).catch(console.error);
Once again: my question is how can I print the link like console.log(item.getAttribute('href')); on the fly without storing it in an array?
To run console.log() inside evaluate() simply copy the line below where you are defining page
// Forward browser-side console messages to the Node.js console, using the
// public ConsoleMessage.text() accessor instead of the private `_text` field.
page.on('console', msg => console.log(msg.text()));
So the whole snippet will now look like this:
const puppeteer = require('puppeteer');
/**
 * Opens the web-scraping tag page and prints every question link on the fly
 * by forwarding the page's console messages to the Node.js console.
 *
 * @returns {Promise<undefined>} resolves with the value returned by the
 *   evaluate callback, which returns nothing; rejects when any browser
 *   operation fails.
 */
async function run () {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Use the public text() accessor rather than the private `_text` field.
    page.on('console', msg => console.log(msg.text()));
    await page.goto("https://stackoverflow.com/questions/tagged/web-scraping");
    const url = await page.evaluate(() => {
      let items = document.querySelectorAll('a.question-hyperlink');
      items.forEach((item) => {
        //would like to keep the following line intact
        console.log(item.getAttribute('href'));
      });
    });
    return url;
  } finally {
    // Close the browser on both the success and the failure path.
    await browser.close();
  }
}
run().then(console.log).catch(console.error);
Hope this helps.
The library looks a bit awkward to use but found the proper way to get an href from this thread on github- https://github.com/GoogleChrome/puppeteer/issues/628
The working code I have is to use await page.$$eval
/**
 * Collects the absolute href of every question link on the web-scraping tag
 * page using page.$$eval.
 *
 * @returns {Promise<string[]>} the matched hrefs; rejects when any browser
 *   operation fails.
 */
async function getStackoverflowLinks(){
  console.log(`going to launch chromium via puppeteer`)
  const browser = await puppeteer.launch()
  try {
    console.log(`creating page/tab`)
    const page = await browser.newPage()
    await page.goto('https://stackoverflow.com/questions/tagged/web-scraping')
    console.log("fetched SO web-scraping, now parsing link href")
    // $$eval runs the callback in the page with the array of matched elements.
    const matches = await page.$$eval('a.question-hyperlink', anchors => anchors.map(a => a.href))
    console.log("matches = ", matches.length)
    return matches
  } finally {
    // Close the browser on both the success and the failure path.
    await browser.close()
  }
}
// Print the collected links; report failures instead of leaving the promise
// rejection unhandled.
getStackoverflowLinks()
  .then(hrefs=>{
    console.log("hrefs: ", hrefs)
  })
  .catch(err=>{
    console.error(err)
  })
Things to note,
async function will return a promise.
new Promise will also return a promise.
On that note, you can simply use the .console events to print them on fly. Usage,
// Forward every console message emitted by the page to the Node.js console.
page.on("console", msg => console.log(msg.text()));
// The callback takes no arguments; the previous `async => {...}` form only
// declared an unused parameter that happened to be named `async`.
await page.evaluate(() => {
  console.log("I will be printed on node console too")
})
Advanced usage has been discussed on this answer.
I am using puppeteer to evaluate the javascript-based HTML of web pages in my test app.
This is the line I am using to make sure all the data is loaded:
// Resource types that are aborted instead of being loaded.
const blockedResourceTypes = new Set(["image", "font", "media"]);

await page.setRequestInterception(true);
page.on("request", (request) => {
  const resourceType = request.resourceType();
  if (blockedResourceTypes.has(resourceType)) {
    console.log("Request intercepted! ", request.url(), resourceType);
    request.abort();
  } else {
    request.continue();
  }
});
try {
  await page.goto(url, { waitUntil: ['networkidle0', 'load'], timeout: requestCounterMaxWaitMs });
} catch (e) {
  // Navigation stays best-effort, but report the failure (e.g. a timeout)
  // instead of swallowing it silently.
  console.warn("Navigation did not complete: ", url, e);
}
Is this the best way to wait for ajax requests to be completed?
It feels right but I'm not sure if I should use networkidle0, networkidle1, etc?
You can use pending-xhr-puppeteer, a library that exposes a promise which resolves once all pending XHR requests have finished.
Use it like this :
const puppeteer = require('puppeteer');
const { PendingXHR } = require('pending-xhr-puppeteer');
// Launch a headless browser with the caller-provided `args`.
const launchOptions = { headless: true, args };
const browser = await puppeteer.launch(launchOptions);
const page = await browser.newPage();
// Create the tracker before navigating.
const pendingXHR = new PendingXHR(page);
await page.goto('http://page-with-xhr');
// At this point XHR requests may still be in flight.
await pendingXHR.waitForAllXhrFinished();
// From here on all XHR requests have finished.
DISCLAIMER: I am the maintainer of pending-xhr-puppeteer.
XHRs, by their nature, can be issued at any later point in the app's lifetime. networkidle0 will not help you if the app sends an XHR after, for example, 1 second and you want to wait for it. I think that if you want to do this "properly", you should know which requests you are waiting for and await them.
Here is an example where the XHRs occur later in the app and the script waits for all of them:
const puppeteer = require('puppeteer');
// Test page: its inline script issues three fetch() calls via setTimeout,
// with delays of 1000, 2000 and 3000 ms.
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch('https://swapi.co/api/people/1/');
}, 1000);
setTimeout(() => {
fetch('https://www.metaweather.com/api/location/search/?query=san');
}, 2000);
setTimeout(() => {
fetch('https://api.fda.gov/drug/event.json?limit=1');
}, 3000);
</script>
</body>
</html>`;
// URLs to observe. You can listen to only part of the requests;
// in this example all three requests made by the page are observed.
const requests = [
'https://swapi.co/api/people/1/',
'https://www.metaweather.com/api/location/search/?query=san',
'https://api.fda.gov/drug/event.json?limit=1'
];
/**
 * Waits until the page has issued a request for every URL in `names`.
 *
 * The registered listener calls `request.continue()` on every request, so it
 * expects request interception to be enabled on the page. It stays attached
 * after the promise resolves so later requests keep being continued.
 *
 * @param {object} page - Puppeteer page (needs `on('request', ...)`).
 * @param {string[]} names - exact request URLs to wait for.
 * @returns {Promise<void>} resolves once every listed URL has been requested,
 *   or immediately when `names` is empty.
 */
const waitForRequests = (page, names) => {
  // Work on a copy so the caller's array is not mutated.
  const pendingUrls = [...names];
  // XMLHttpRequest is reported as "xhr" and fetch() as "fetch".
  const ajaxTypes = ["xhr", "fetch"];
  return new Promise(resolve => {
    page.on('request', request => {
      if (ajaxTypes.includes(request.resourceType())) {
        // check if request is in observed list
        const index = pendingUrls.indexOf(request.url());
        if (index > -1) {
          pendingUrls.splice(index, 1);
        }
        // if all requests have been seen
        if (!pendingUrls.length) {
          resolve();
        }
      }
      request.continue();
    });
    // Nothing to wait for.
    if (!pendingUrls.length) {
      resolve();
    }
  });
};
// Loads the test page with request interception enabled and waits until all
// observed requests have been issued.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    // register page.on('request') observables
    const observedRequests = waitForRequests(page, requests);
    // Start navigation without blocking on it yet, so that 'before xhr' is
    // still logged first.
    const navigation = page.goto(`data:text/html,${html}`);
    console.log('before xhr');
    // Wait for all observed requests; a failed navigation rejects right away
    // instead of leaving the script waiting forever.
    await Promise.all([navigation, observedRequests]);
    console.log('after all xhr');
  } finally {
    // Close the browser on both the success and the failure path.
    await browser.close();
  }
})().catch(console.error);
I agree with the sentiment in this answer that waiting for all network activity to cease ("all the data is loaded") is a rather ambiguous concept that is entirely dependent on the behavior of the website you're scraping.
Options for detecting responses include waiting a fixed duration, a fixed duration after network traffic idles, for a specific response (or set of responses), for an element to appear on the page, for a predicate to return true, etc, all of which Puppeteer supports.
With this in mind, the most typical scenario is that you're waiting for some particular response or set of responses from known (or partially-known, using some pattern or prefix) resource URL(s) that will deliver a payload you want to read and/or trigger a DOM interaction you need to detect. Puppeteer offers page.waitForResponse for doing just this.
Here's an example, building on an existing answer (and showing how to retrieve the data from the responses while we're at it):
const puppeteer = require("puppeteer");
// Test page: its inline script issues four fetch() calls via setTimeout.
// users/1, users/2 and users/3 are requested after 1000, 2000 and 3000 ms;
// users/4 is requested with a 0 ms delay.
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/1");
}, 1000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/2");
}, 2000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/3");
}, 3000);
setTimeout(() => {
// fetch something irrelevant to us
fetch("http://jsonplaceholder.typicode.com/users/4");
}, 0);
</script>
</body>
</html>`;
// Loads the test page, waits for the three expected responses and prints
// their JSON payloads.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const [page] = await browser.pages();
    await page.setContent(html);
    const expectedUrls = [
      "http://jsonplaceholder.typicode.com/users/1",
      "http://jsonplaceholder.typicode.com/users/2",
      "http://jsonplaceholder.typicode.com/users/3",
    ];
    // One waiter per expected URL; each gives up after 5 seconds.
    const responses = await Promise.all(expectedUrls.map(url =>
      page.waitForResponse(
        response => response.url() === url,
        {timeout: 5000}
      )
    ));
    const data = await Promise.all(
      responses.map(response => response.json())
    );
    console.log(data);
  }
  catch (err) {
    console.error(err);
  }
  finally {
    // Close the browser even when page setup or a waiter fails.
    await browser.close();
  }
})();
Hi I am trying to make a screenshot service
const puppeteer = require('puppeteer');
// Viewport size used for the screenshots.
var resWidth = 1366;
var resHeight = 1000;
// Shared browser instance, meant to be reused for every screenshot.
var browser;
// NOTE: this async function expression is only wrapped in parentheses and is
// never called, so `browser` is not assigned by this snippet.
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a job, I try to do:
// Take one full-page screenshot per entry of `data`, reusing the shared
// `browser`. Each entry is handled by its own async task.
data.forEach(function(d){
  console.log(d["url"]);
  (async () => {
    const page = await browser.newPage();
    try {
      await page.setViewport({width: resWidth, height: resHeight});
      // NOTE(review): 'networkidle' was accepted by early Puppeteer releases;
      // newer ones expect 'networkidle0' / 'networkidle2'. Confirm against the
      // installed version.
      await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
      await page.screenshot({path: './picdata/' + d['id'] + '.png' ,fullPage: true});
    } finally {
      // Always release the tab, even when navigation or the screenshot fails.
      await page.close();
    }
  })().catch(function(e){
    // A synchronous try/catch around the async call cannot see its rejection,
    // so failures are handled (and reported) on the promise itself.
    console.error('Screenshot failed for', d["url"], e);
  });
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot, because launching a browser takes time and requires more memory.
what should I do?
The problem:
// The function expression below is defined but never called: there is no
// trailing `()` after the closing parenthesis.
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
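This code never gets executed. Why? Because the async function expression is only defined and never invoked: the trailing `()` that would make it an immediately invoked function expression is missing, so it is not a true closure call.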
More on closures, here.
That being said, that alone won't work for your scenario, as these are async tasks.
My try with your example:
'use strict';
const puppeteer = require('puppeteer');
const resWidth = 1366;
const resHeight = 1000;
let browser;

/**
 * Launches the single shared browser used for all screenshots.
 * "{ headless: true }" ensures that no browser window is opened.
 */
async function launchBrowser() {
  browser = await puppeteer.launch({ headless: true });
}

/**
 * Takes a full-page screenshot of one entry in its own tab.
 *
 * @param {{url: string, id: *}} d - entry to capture; the image is written to
 *   ./picdata/<id>.png.
 */
async function captureScreenshot(d) {
  // Each screenshot gets its own local page, so nothing is shared between
  // entries.
  const page = await browser.newPage();
  try {
    await page.setViewport({ width: resWidth, height: resHeight });
    // NOTE(review): 'networkidle' was accepted by early Puppeteer releases;
    // newer ones expect 'networkidle0' / 'networkidle2'. Confirm against the
    // installed version.
    await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
    await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
  } finally {
    // Release the tab even when this screenshot fails.
    await page.close();
  }
}

/**
 * Launches the browser, captures every entry of `data` one after another and
 * closes the browser once all of them are done.
 */
async function captureAll() {
  await launchBrowser(); // wait until browser has launched
  try {
    for (const d of data) {
      try {
        await captureScreenshot(d);
      } catch (e) {
        // One failing URL must not abort the remaining screenshots.
        console.log(e);
      }
    }
  } finally {
    await browser.close(); // close browser finally.
  }
}

captureAll().catch(console.error);
Pro Tip: Start using let, const instead of var.
There is a great article on this, here