Puppeteer chrome get active/visible tab - node.js

In a chrome extension you can use below to find the active tab in a window
chrome.tabs.query({
currentWindow: true,
active: true,
}
I have a below code which connects to existing browser and get all the pages. I am not able to make out if there is a way for me to know which tab/page is currently the active one and get its url (page.url(), but which one from the the array to use?)
const puppeteer = require('puppeteer');
debuggerUrl = "http://127.0.0.1:9999/json/version"
const request = require('request');
request(debuggerUrl, function (error, response, body) {
data = JSON.parse(body);
webSocketDebuggerUrl = data["webSocketDebuggerUrl"];
console.log("Connecting to ", webSocketDebuggerUrl);
puppeteer.connect({browserWSEndpoint: webSocketDebuggerUrl}).then(async browser => {
var pages = await browser.pages();
console.log(pages);
console.log(await browser.targets())
await browser.disconnect();
})
});

document.hidden is now deprecated. But we can use document.visibilityState
note that page will always refer to the same tab even if you have change to different tab. So you have to change page to the active tab manually.
const pages = await browser.pages();
// this will return list of active tab (which is pages object in puppeteer)
const visiblePages = pages.filter(async (p) => {
const state = await p.evaluate(() => document.visibilityState);
return state === 'visible';
});
const activeTab = visiblePages[0]; // since there should be only 1 active tab per window

Using document.hidden
const pages = await browser.pages()
let page
for (let i = 0; i < pages.length && !page; i++) {
const isHidden = await pages[i].evaluate(() => document.hidden)
if (!isHidden) {
page = pages[i]
}
}

const pages = await browser.pages();
const vis_results = await Promise.all(pages.map(async (p) => {
const state = await p.evaluate(() => document.webkitHidden);
return !state;
}));
let visiblePage = pages.filter((_v, index) => vis_results[index])[0];

Related

QuerySelector's returning an empty selection in console while on browser side it contains elements

In order to learn web scraping with puppeteer , i have started a little project , which aims to extract the planning of Power Outages from the National Power Supplier's website. In order to do that i have to manually change the region then retrieve the Outage's program list. The QuerySelector request i use browser side looks totally fine as it contains without fault all the outages displayed . But when i use it on the server end i receive an empty list.
Here is my code and the url of the website can be found in it .
Thanks in advance !
const puppeteer = require('puppeteer');
(async() => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto('https://alert.eneo.cm/', { waitUntil: 'networkidle0' });
await page.evaluate(() => {
var region = "Littoral";
var j = $('#regions option:contains(' + region + ')');
$('#regions').val(j.val()).change();
});
const outages = await page.evaluate(() => {
const elements = document.querySelectorAll("#contentdata .outage");
return elements;
});
console.log(outages);
})();
I see there is list of power outages on the page you want scrape. Here is how you can get the power outage data for the first div
(async()=>{
let browser = await puppeteer.launch();
let page = await browser.newPage();
await page.goto('https://alert.eneo.cm/', { waitUntil: 'networkidle0' });
await page.select('select[name="regions"]', '5')
const outageData = await page.evaluate( async () => {
let quartier = document.querySelector('div[class="quartier"]').innerText;
let ville = document.querySelector('div[class="ville"]').innerText;
let observations = document.querySelector('div[class="observations"]').innerText;
let dateAndTime= document.querySelector('div[class="prog_date"]').innerText;
return {quartier, ville, observations, dateAndTime}
});
await browser.close();
console.log(outageData);
})();

How to download pdf file that opens in new tab with puppeteer?

I am trying to download invoice from website using puppeteer, I just started to learn puppeteer. I am using node to create and execute the code. I have managed to login and navigate to the invoice page, but it opens in new tab, so, code is not detecting it since its not the active tab. This is the code I used:
const puppeteer = require('puppeteer')
const SECRET_EMAIL = 'emailid'
const SECRET_PASSWORD = 'password'
const main = async () => {
const browser = await puppeteer.launch({
headless: false,
})
const page = await browser.newPage()
await page.goto('https://my.apify.com/sign-in', { waitUntil: 'networkidle2' })
await page.waitForSelector('div.sign_shared__SignForm-sc-1jf30gt-2.kFKpB')
await page.type('input#email', SECRET_EMAIL)
await page.type('input#password', SECRET_PASSWORD)
await page.click('input[type="submit"]')
await page.waitForSelector('#logged-user')
await page.goto('https://my.apify.com/billing#/invoices', { waitUntil: 'networkidle2' })
await page.waitForSelector('#reactive-table-1')
await page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a')
const newPagePromise = new Promise(x => browser.once('targetcreated', target => x(target.page())))
const page2 = await newPagePromise
await page2.bringToFront()
await page2.screenshot({ path: 'apify1.png' })
//await browser.close()
}
main()
In the above code I am just trying to take screenshot. Can anyone help me?
Here is an example of a work-around for the chromium issue mentioned in the comments above. Adapt to fit your specific needs and use-case. Basically, you need to capture the new page (target) and then do whatever you need to do to download the file, possibly pass it as a buffer to Node as per the example below if no other means work for you (including a direct request to the download location via fetch or ideally some request library on the back-end)
const [PDF_page] = await Promise.all([
browser
.waitForTarget(target => target.url().includes('my.apify.com/account/invoices/' && target).then(target => target.page()),
ATT_page.click('#reactive-table-1 > tbody > tr:nth-child(1) > td.number > a'),
]);
const asyncRes = PDF_page.waitForResponse(response =>
response
.request()
.url()
.includes('my.apify.com/account/invoices'));
await PDF_page.reload();
const res = await asyncRes;
const url = res.url();
const headers = res.headers();
if (!headers['content-type'].includes('application/pdf')) {
await PDF_page.close();
return null;
}
const options = {
// target request options
};
const pdfAb = await PDF_page.evaluate(
async (url, options) => {
function bufferToBase64(buffer) {
return btoa(
new Uint8Array(buffer).reduce((data, byte) => {
return data + String.fromCharCode(byte);
}, ''),
);
}
return await fetch(url, options)
.then(response => response.arrayBuffer())
.then(arrayBuffer => bufferToBase64(arrayBuffer));
},
url,
options,
);
const pdf = Buffer.from(pdfAb, 'base64');
await PDF_page.close();

Why am I not able to navigate through iFrames using Apify/Puppeteer?

I'm trying to manipulate forms of sites w/ iFrames in it using Puppeteer. I tried different ways to reach a specific iFrame, or even to count iFrames in a website, with no success.
Why isn't Puppeteer's object recognizing the iFrames / child frames of the page I'm trying to navigate through?
It's happening with other pages as well, such as https://www.veiculos.itau.com.br/simulacao
const Apify = require('apify');
const sleep = require('sleep-promise');
Apify.main(async () => {
// Launch the web browser.
const browser = await Apify.launchPuppeteer();
// Create and navigate new page
console.log('Open target page');
const page = await browser.newPage();
await page.goto('https://www.credlineitau.com.br/');
await sleep(15 * 1000);
for (const frame in page.mainFrame().childFrames()) {
console.log('test');
}
await browser.close();
});
Perhaps you'll find some helpful inspiration below.
const waitForIframeContent = async (page, frameSelector, contentSelector) => {
await page.waitForFunction((frameSelector, contentSelector) => {
const frame = document.querySelector(frameSelector);
const node = frame.contentDocument.querySelector(contentSelector);
return node && node.innerText;
}, {
timeout: TIMEOUTS.ten,
}, frameSelector, contentSelector);
};
const $frame = await waitForSelector(page, SELECTORS.frame.iframeNode).catch(() => null);
if ($frame) {
const frame = page.frames().find(frame => frame.name() === 'content-iframe');
const $cancelStatus = await waitForSelector(frame, SELECTORS.frame.membership.cancelStatus).catch(() => null);
await waitForIframeContent(page, SELECTORS.frame.iframeNode, SELECTORS.frame.membership.cancelStatus);
}
Give it a shot.

Controlling margins when making a PDF using Playwright

When making a PDF from a headless chrome Playwright session (code below) I get the PDF on the left, whereas if I make it through the save to pdf in chrome, I get the second output.
I have set the margins explicitly and used preferCSSPageSize: true and both give the same (left) outcome.
How would I get Playwright to give me the same output as chrome does from the print dialog?
An example of the files being printed is here. (In real life, they're all slightly different to account for the spine width.)
const fs = require("fs");
const path = require("path");
const { chromium } = require("playwright");
const directoryPath = "outside";
(async () => {
const filesToPrint = getFilesToPrintList(directoryPath);
filesToPrint.forEach((f, i) => console.log(i, f));
const browser = await chromium.launch({ headless: true });
const context = await browser.newContext();
for (let i = 0; i < filesToPrint.length; i++) {
const htmlFilename = path.join(directoryPath, filesToPrint[i]);
const pdfFilename = makePDFfilename(htmlFilename);
console.log("[html file]", htmlFilename);
console.log("[pdf file]", pdfFilename);
const page = await context.newPage();
await page.goto(
"file:///" + path.resolve(htmlFilename),
(waitUntil = "networkidle")
);
const options = {
path: pdfFilename,
pageRanges: "1",
preferCSSPageSize: true,
};
console.log(
`\n\nAbout to print:\n ${pdfFilename}\n`,
`with:\n${JSON.stringify(options, null, 2)}`
);
await page.pdf(options);
}
await browser.close();
console.log("done");
})();
function makePDFfilename(htmlFilename) {
const parts = htmlFilename.split(/[\\\._]/);
const pdfFilename = path.join(
directoryPath,
`Experimental_${parts[1]}_${parts[2]}.pdf`
);
return pdfFilename;
}
function getFilesToPrintList(directoryPath) {
let theFiles = fs
.readdirSync(directoryPath)
.filter((f) => f.includes("fp_") && f.includes(".html"));
return theFiles;
}
I tried to reproduce your issue:
I used a different background image URL (online) with your HTML page, see test.html gist.
Apparently, a number of options are the defaults. See page.pdf().
Also, added browser.newContext() and browser.close().
Here's the minimal working NodeJS code:
generate-pdf.js
const { chromium } = require("playwright");
(async () => {
const htmlFilename = "file:///<file-path>/test.html";
console.log("[webpage]", htmlFilename);
const browser = await chromium.launch();
const context = await browser.newContext();
const page = await context.newPage();
await page.goto(htmlFilename);
const pdfFilename = "./test.pdf";
console.log("[pdf file]", pdfFilename);
const options = {
path: pdfFilename,
pageRanges: "1",
preferCSSPageSize: true
};
console.log(
`\n\nAbout to print:\n ${pdfFilename}\n`,
`with:\n${JSON.stringify(options, null, 2)}`
);
await page.pdf(options);
await browser.close();
console.log("done");
})();
Console Output:
$ node generate-pdf.js
[webpage] file:///<file-path>/test.html
[pdf file] ./test.pdf
About to print:
./test.pdf
with:
{
"path": "./test.pdf",
"pageRanges": "1",
"preferCSSPageSize": true
}
done
Snapshot of PDF (test.pdf):
You might want to test this separately and see if it works for your use-case.

reuse browser instance puppeterr

I would like to know if it is possible to have one .js file that opens a browser instance, creates new page/tab logs in to a website (with username/password) and just stays idle. And in a second .js file use file one browser instance and its page.
1.js
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: true,
args: ['--no-sandbox'],
ignoreDefaultArgs: ["--hide-scrollbars"]
});
const page = await browser.newPage();
const response = await page.goto('https://google.com');
console.log('Browser open in the background (headless)!');
//await browser.close();
})();
2.js
const puppeteer = require('puppeteer');
(async () => {
// instructions on browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance and
wherever you call/pass that instance, it refers to the same chromium
in the "background". If this is an overkill, and you just want to
connect to an already running chromium using puppeteer, you can do it
with puppeteer.connect. take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah I guess its to overkill for me :). But the link you posted was what I wanted but have 2 questions.
This Is a sample what I have.
// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.launch({headless: false});
var page = await browser.newPage();
var response = await page.goto('https://google.com');
var browserWSEndpoint = browser.wsEndpoint();
console.log(browserWSEndpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e
browser.disconnect();
})();
And
// 2.js
// connect to the open browser with the browserWSEndpoint manualy put in, ... , disconect.
const puppeteer = require('puppeteer');
(async () => {
var browser = await puppeteer.connect({browserWSEndpoint: 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e'});
// somehow use the tab that is open from 1.js (google.com)
await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log 1.js.
It works great but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I dont have to always copy paste it to 2.js.
2- If I open a new page/tab on 1.js and go for example to google and disconnect (browser.disconnect()), how can use that page/tab on 2.js.
Working tested code
getEmail.js is where actual page will be exported. ask clarifications in comments.
getBrowser.js
const puppeteer = require("puppeteer");
module.exports = {
browser: {},
pptr_instance_url:"",
getBrow: async function(){ try {
console.log("line6",this.pptr_instance_url);
this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
console.log("end point",this.pptr_instance_url);
this.browser = await puppeteer.launch({timeout: 0});
this.pptr_instance_url = this.browser.wsEndpoint();
console.log("line 11",this.pptr_instance_url);
return this.browser;
});
return this.browser;
}catch (e){
console.log(e)
} }
}
pageRenderer.js
const abc = require("../getBrowsernew")
const pageRenderer = async (request) => {
const {reactProjectUrl} = constants, uuidStorageKey = uuidv4(),
localStorageObject = {[uuidStorageKey]: request.body};
const browser = await abc.getBrow();
let url = "someurl.com"
await setLocalStorage(browser, url, localStorageObject);
const page = await browser.newPage();
const response = await page.goto(
url,
{
waitUntil: "networkidle0"
}, {waitUntil: 'load', timeout: 0}
);
return page;
}
module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");
const getEmail =async (request) =>{
const page = await pageRenderer(request)
const emailbody = await page.content();
page.close();
return emailbody;
}
module.exports = getEmail;
You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.
You can have a class that launches the browser and creates pages plus some extra functionalities.
//1.js
const puppeteer = require('puppeteer');
class Crawler {
constructor() {
//init with whatever values you'll need in your class
//or throw an error if the object wasn't created through build
}
static async build() {
let crawler = new Crawler();
await crawler._init();
return crawler;
}
async _init() {
//launch the browser and keep its state
this._browser = await puppeteer.launch({timeout: 0});
//create a page and keep its state
this._page = await this._browser.newPage();
}
//getter
get browser() {
return this._browser;
}
//getter
get page() {
return this._page;
}
async login(url) {
await this._page.goto(url);
//do whatever is related to the login process
}
}
module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching browser is async, we use something like a build function to initiate the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');
(async() => {
let crawler = await Crawler.build();
await crawler.login("https://example.com");
//access crawler's page
console.log(crawler.page.url());
})();
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

Resources