Chrome DevTools Protocol: switch 'active' frame - node.js

I'm using headless Chrome and the chrome-remote-interface npm package:
const chromeLauncher = require('chrome-launcher')
const CDP = require('chrome-remote-interface')
const fs = require('fs')
/**
 * Start a headless Chrome instance with remote debugging on port 9222.
 *
 * @returns {Promise<object>} Whatever `chromeLauncher.launch` resolves with.
 */
async function launchChrome() {
  const debuggingPort = 9222;
  const headlessFlags = ['--disable-gpu', '--headless'];
  const launchOptions = {port: debuggingPort, chromeFlags: headlessFlags};
  return chromeLauncher.launch(launchOptions);
}
// Launches headless Chrome, loads the page and writes a PNG screenshot to disk.
(async function () {
  const chrome = await launchChrome();
  let client;
  try {
    client = await CDP({port: 9222});
    const {Page} = client;
    await Page.enable();
    await Page.navigate({url: 'http://...'});
    await Page.loadEventFired();
    const {data} = await Page.captureScreenshot();
    // The protocol returns the image as base64 text.
    fs.writeFileSync('screenshot.png', Buffer.from(data, 'base64'));
  } finally {
    // The original called `protocol.close()`, but no `protocol` exists;
    // the connection object is `client`.
    if (client) {
      await client.close();
    }
    // Always stop Chrome, even when navigation or the screenshot failed.
    await chrome.kill();
  }
})();
But the page I'm loading contains an iframe and I want to perform some actions inside it (click some elements or access them via DOM.querySelector) before taking the screenshot.
Is it possible to switch current 'active' frame somehow via Chrome DevTools Protocol? Like it is possible in Nightwatch.js/Selenium: browser.frame(frameIndex)

Related

How to export this node.js function in react.js

I am working on a dashboard where the user should be able to click a button and get data that has been scraped from a site. I used Puppeteer, and it prints the desired data to the console, but how can I add this to my React application?
Here's the puppeteer code i wrote :
const puppeteer = require('puppeteer');
/**
 * Scrape the image URL, title and tags of the first resource card on a page.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<{srcTxt: *, rawTxt: *, tags: *}>} The scraped values
 *   (also logged to the console, as before).
 * @throws {Error} When one of the XPath expressions matches no element.
 */
async function scrapeProduct(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url);

    // Reads one property of the first element matched by an XPath expression.
    const readProperty = async (xpath, propertyName) => {
      const [element] = await page.$x(xpath);
      if (!element) {
        throw new Error(`No element matches XPath: ${xpath}`);
      }
      const handle = await element.getProperty(propertyName);
      return handle.jsonValue();
    };

    const srcTxt = await readProperty(
      '/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[1]/a/img',
      'src',
    );
    const rawTxt = await readProperty(
      '/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[2]/div/a/h3',
      'textContent',
    );
    const tags = await readProperty(
      '/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div/div[2]/div/div',
      'textContent',
    );

    const product = {srcTxt, rawTxt, tags};
    console.log(product);
    // Returning the data lets callers that import this function use it.
    return product;
  } finally {
    // Close the browser on success and on failure so no process is left behind.
    await browser.close();
  }
}
// Report scraping failures instead of leaving the returned promise unhandled.
scrapeProduct('https://2degrees-investing.org/resources/').catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Is there a way to export this function and use it as an import somewhere ? Thanks in advance.

How to remember authorization when parsing a site on nodejs cookies

I go to the site and look for an authorization form. If I find it, I log in; if not, I perform the actions I need.
// Restore cookies saved by a previous run, then check whether the login form
// is still shown; if it is, log in again and persist the fresh cookies.
const cookie_ = fs.readFileSync("cookies.json");//I am looking for a file with saved cookies
// Declared with const: the original assigned to an undeclared (implicit global) name.
const cookie = JSON.parse(cookie_);//Converting to json
nightmare
  .goto('https://site.ru/login')//I go to the site
  .cookies.set(cookie)//I substitute cookies from the file
  // The first load happened before the cookies were set, so load the page
  // again to have them sent with the request.
  // NOTE(review): presumably this is why the form kept appearing — confirm against the site.
  .goto('https://site.ru/login')
  .evaluate(function () {
    // Return a plain boolean rather than a DOM node, which has to be
    // serialized to leave the page context.
    return document.querySelector('input[id="email"]') !== null;//I am looking for a field to enter mail
  })
  .then(function (isLoginFormVisible) {
    if (!isLoginFormVisible) {//I check if there is a field for entering mail
      console.log('You are logged in');
      return undefined;
    }
    // Return the promise so a failed login reaches the catch handler below.
    return f().then(function (cookies) {//We get the result from the function
      fs.writeFileSync(//And write to file
        'cookies.json',
        JSON.stringify(cookies)
      );
    });
  })
  .catch(function (error) {
    console.error(error);
  });
/**
 * Log in through the site's form and resolve with the session cookies.
 *
 * The chain is returned directly instead of being wrapped in `new Promise`:
 * the wrapper never called `reject`, so a failed login left the returned
 * promise pending forever.
 *
 * @returns {Promise<*>} The value produced by `nightmare.cookies.get()`.
 */
async function f() {//I call the function if we are not authorized
  return nightmare
    .goto('https://site.ru/login')
    .type('input[id="email"]', 'login')//Enter mail
    .type('input[id="password"]', 'passord')//Enter your password
    .click('.btn.btn-danger')//Click on the authorization button
    .wait(2000)//We wait 2 seconds
    .cookies.get();//We receive cookies
}
The file is created, cookies are written, but with the next attempts to run the script, the authorization form still appears
I also tried first to go to - goto('about: blank') then set cookies and then go to goto('https://site.ru/login')
Error - UnhandledPromiseRejectionWarning: Error: Setting cookie failed
Unfortunately, it was not possible to solve the problem through nightmare
Solved with puppeteer
Example - saving cookies to file
const puppeteer = require('puppeteer')
const fs = require('fs');
// Logs in to GitHub and saves the resulting session cookies to cookies.json.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://github.com/login');
    await page.type('#login_field', process.env.GITHUB_USER);
    await page.type('#password', process.env.GITHUB_PWD);
    await page.waitForSelector('.js-cookie-consent-reject');
    await page.click('.js-cookie-consent-reject');
    // Start waiting for the navigation before triggering it; waiting only
    // after the click can miss a navigation that has already finished.
    await Promise.all([
      page.waitForNavigation(),
      page.$eval('[name="commit"]', (elem) => elem.click()),
    ]);
    const cookies = await page.cookies();
    fs.writeFileSync('cookies.json', JSON.stringify(cookies));
  } finally {
    // Close the browser even if a step above failed.
    await browser.close();
  }
})();
Reading cookies from a file
const puppeteer = require('puppeteer')
const fs = require('fs');
// Replays the cookies stored in cookies.json, then opens the user's GitHub profile.
(async () => {
  const browser = await puppeteer.launch();
  const tab = await browser.newPage();
  const storedCookies = JSON.parse(fs.readFileSync('cookies.json', 'utf8'));
  await tab.setCookie(...storedCookies);
  const profileUrl = `https://github.com/${process.env.GITHUB_USER}`;
  await tab.goto(profileUrl);
  await browser.close();
})();
Article

Why am I not able to navigate through iFrames using Apify/Puppeteer?

I'm trying to manipulate forms of sites w/ iFrames in it using Puppeteer. I tried different ways to reach a specific iFrame, or even to count iFrames in a website, with no success.
Why isn't Puppeteer's object recognizing the iFrames / child frames of the page I'm trying to navigate through?
It's happening with other pages as well, such as https://www.veiculos.itau.com.br/simulacao
const Apify = require('apify');
const sleep = require('sleep-promise');
// Opens the target page and logs every iframe attached to its main frame.
Apify.main(async () => {
  // Launch the web browser.
  const browser = await Apify.launchPuppeteer();
  try {
    // Create and navigate new page
    console.log('Open target page');
    const page = await browser.newPage();
    await page.goto('https://www.credlineitau.com.br/');
    // Give late-loading iframes time to attach.
    await sleep(15 * 1000);
    // `for...of` yields the Frame objects; the original `for...in` only
    // yielded array index strings, so `frame` was never a frame.
    for (const frame of page.mainFrame().childFrames()) {
      console.log('test', frame.url());
    }
  } finally {
    // Close the browser even when navigation fails.
    await browser.close();
  }
});
Perhaps you'll find some helpful inspiration below.
/**
 * Wait until an element inside an iframe exists and has non-empty text.
 *
 * @param {object} page - Object exposing `waitForFunction(fn, options, ...args)`.
 * @param {string} frameSelector - Selector of the iframe element in the top document.
 * @param {string} contentSelector - Selector evaluated inside the iframe's document.
 * @returns {Promise<void>} Settles when `page.waitForFunction` settles.
 */
const waitForIframeContent = async (page, frameSelector, contentSelector) => {
  await page.waitForFunction((frameSelector, contentSelector) => {
    const frame = document.querySelector(frameSelector);
    // The iframe may not be attached yet, or may expose no document; report
    // "not ready" instead of throwing on a null dereference.
    const frameDocument = frame && frame.contentDocument;
    const node = frameDocument && frameDocument.querySelector(contentSelector);
    return node && node.innerText;
  }, {
    timeout: TIMEOUTS.ten,
  }, frameSelector, contentSelector);
};
// Only look inside the iframe once its element is present; a failed wait yields null.
const $frame = await waitForSelector(page, SELECTORS.frame.iframeNode).catch(() => null);
if ($frame) {
  const isContentFrame = (candidate) => candidate.name() === 'content-iframe';
  const frame = page.frames().find(isContentFrame);
  const cancelStatusSelector = SELECTORS.frame.membership.cancelStatus;
  const $cancelStatus = await waitForSelector(frame, cancelStatusSelector).catch(() => null);
  await waitForIframeContent(page, SELECTORS.frame.iframeNode, cancelStatusSelector);
}
Give it a shot.

How to authenticate using a session cookie in Puppeteer

I want to store my session cookie and authenticate my account using puppeteer.
Right now I'm using my username and password directly to authenticate.
const puppeteer = require('puppeteer');
// Opens the LinkedIn login page in a visible browser and submits the credentials.
(async () => {
  const launchOptions = {headless: false, slowMo: 20};
  const browser = await puppeteer.launch(launchOptions);
  const loginPage = await browser.newPage();
  await loginPage.goto("https://www.linkedin.com/login");
  await loginPage.type('[id=username]', 'username');
  await loginPage.type('[id=password]', 'password');
  // Submit the form with the Enter key, held for two seconds.
  await loginPage.keyboard.press('Enter', {delay: 2000});
  await browser.close();
})();
Here below is an example of how to login on a web app using Puppeteer. You need to install apify (a npm module).
const Apify = require('apify');
// Logs in on one tab, then reuses the session cookies on a second tab.
Apify.main(async () => {
  const input = await Apify.getValue('INPUT');
  const browser = await Apify.launchPuppeteer();
  try {
    const page = await browser.newPage();
    await page.goto('https://facebook.com');
    // Login
    await page.type('#email', input.username);
    await page.type('#pass', input.password);
    // Start waiting for the navigation before clicking; waiting only after
    // the click can miss a navigation that has already finished.
    await Promise.all([
      page.waitForNavigation(),
      page.click('#loginbutton input'),
    ]);
    // Get cookies
    const cookies = await page.cookies();
    // Use cookies in other tab or browser
    const page2 = await browser.newPage();
    await page2.setCookie(...cookies);
    await page2.goto('https://facebook.com'); // Opens page as logged user
  } finally {
    // Close the browser even if the login flow failed.
    await browser.close();
  }
  console.log('Done.');
});
To Save the Session Cookies in puppeteer.
// Read the current page's cookies and persist them as pretty-printed JSON.
const cookiesObject = await page.cookies();
// Write cookies to temp file to be used in other profile pages
jsonfile.writeFile(cookiesFilePath, cookiesObject, { spaces: 2 },
  function (err) {
    if (err) {
      console.log('The file could not be written.', err);
      // Stop here: the original fell through and also reported success.
      return;
    }
    console.log('Session has been successfully saved');
  });
Then, on your next iteration right before using page.goto() you can call page.setCookie() to load the cookies from the file one by one.
// Reload a previously saved session, if one exists, before navigating.
// NOTE(review): fileExistSync is not defined in this snippet — presumably a wrapper around fs.existsSync; confirm.
const previousSession = fileExistSync(cookiesFilePath)
if (previousSession) {
// If file exist load the cookies
// The saved path is prefixed with "." before being handed to require().
const cookiesArr = require(`.${cookiesFilePath}`)
if (cookiesArr.length !== 0) {
// Cookies are applied one at a time, awaiting each call before the next.
for (let cookie of cookiesArr) {
await page.setCookie(cookie)
}
console.log('Session has been loaded in the browser!')
// NOTE(review): this return implies the snippet sits inside a function that is not shown here.
return true
}
}
The CDPSession instances are used to talk raw Chrome Devtools Protocol:
Protocol methods can be called with session.send method.
Protocol events can be subscribed to with session.on method.
Here are the official links for these as follows:
https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#pagecookiesurls
https://github.com/puppeteer/puppeteer/blob/master/docs/api.md#pagesetcookiecookies

Reuse a browser instance in Puppeteer

I would like to know if it is possible to have one .js file that opens a browser instance, creates new page/tab logs in to a website (with username/password) and just stays idle. And in a second .js file use file one browser instance and its page.
1.js
const puppeteer = require('puppeteer');
// Starts a headless browser, opens Google in a tab and leaves both running.
(async () => {
  const launchOptions = {
    headless: true,
    args: ['--no-sandbox'],
    ignoreDefaultArgs: ["--hide-scrollbars"],
  };
  const browser = await puppeteer.launch(launchOptions);
  const tab = await browser.newPage();
  await tab.goto('https://google.com');
  console.log('Browser open in the background (headless)!');
  // The browser is deliberately not closed so it can stay idle.
})();
2.js
const puppeteer = require('puppeteer');
// Placeholder for the second script: the body is intentionally empty and marks
// where code that reuses the browser/page opened by 1.js would go.
(async () => {
// instructions on browser instance/page from 1.js ...
})();
The crawler object keeps the state of the browser instance and
wherever you call/pass that instance, it refers to the same chromium
in the "background". If this is an overkill, and you just want to
connect to an already running chromium using puppeteer, you can do it
with puppeteer.connect. take a look at this:
How to "hook in" puppeteer into a running Chrome instance/tab – mbit
Yeah, I guess it's too much overkill for me :). The link you posted is what I wanted, but I have two questions.
This Is a sample what I have.
// 1.js
// open chromium, new tab, go to google.com, print browserWSEndpoint, disconnect
const puppeteer = require('puppeteer');
// Opens Chromium with a Google tab, prints the WebSocket endpoint other
// scripts can connect to, then detaches without closing the browser.
(async () => {
  // `const` replaces `var`: none of these bindings is reassigned.
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://google.com');
  const browserWSEndpoint = browser.wsEndpoint();
  console.log(browserWSEndpoint); // prints: ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e
  // Awaited for consistency with 2.js, which also awaits disconnect().
  await browser.disconnect();
})();
And
// 2.js
// connect to the open browser with the browserWSEndpoint manualy put in, ... , disconect.
const puppeteer = require('puppeteer');
// Attaches to the Chromium started by 1.js through its WebSocket endpoint, then detaches again.
(async () => {
  const connectOptions = {
    browserWSEndpoint: 'ws://127.0.0.1:51945/devtools/browser/6462daeb-469b-4ae4-bfd1-c3bd2f26aa5e',
  };
  const browser = await puppeteer.connect(connectOptions);
  // somehow use the tab that is open from 1.js (google.com)
  await browser.disconnect();
})();
I get the browserWSEndpoint string from the console.log 1.js.
It works great but I have two difficulties.
1 - How can I use the variable browserWSEndpoint from 1.js so I don't have to copy and paste it into 2.js every time?
2 - If I open a new page/tab in 1.js, go to Google for example, and disconnect (browser.disconnect()), how can I use that page/tab in 2.js?
Working tested code
getEmail.js is where the actual page content is exported. Ask for clarification in the comments.
getBrowser.js
const puppeteer = require("puppeteer");
module.exports = {
browser: {},
pptr_instance_url:"",
getBrow: async function(){ try {
console.log("line6",this.pptr_instance_url);
this.browser = await puppeteer.connect({browserWSEndpoint: this.pptr_instance_url}).catch(async e =>{
console.log("end point",this.pptr_instance_url);
this.browser = await puppeteer.launch({timeout: 0});
this.pptr_instance_url = this.browser.wsEndpoint();
console.log("line 11",this.pptr_instance_url);
return this.browser;
});
return this.browser;
}catch (e){
console.log(e)
} }
}
pageRenderer.js
const abc = require("../getBrowsernew")
/**
 * Seed local storage with the request body, then open the page that renders it.
 *
 * @param {object} request - Incoming request; `request.body` is the stored payload.
 * @returns {Promise<object>} The page returned by `browser.newPage()`, after navigation.
 */
const pageRenderer = async (request) => {
  // The payload is stored under a freshly generated key.
  const uuidStorageKey = uuidv4();
  const localStorageObject = {[uuidStorageKey]: request.body};
  const browser = await abc.getBrow();
  const url = "someurl.com";
  await setLocalStorage(browser, url, localStorageObject);
  const page = await browser.newPage();
  // `page.goto` takes (url, options). The original passed a second options
  // object as a third argument, which was ignored, so it is dropped here.
  // NOTE(review): if an unlimited wait was intended, add `timeout: 0` to this object.
  await page.goto(url, {waitUntil: "networkidle0"});
  return page;
};
module.exports = pageRenderer;
getEmail.js
const pageRenderer = require("./pageRenderer");
/**
 * Render the page for a request and return its HTML.
 *
 * @param {object} request - Passed straight through to `pageRenderer`.
 * @returns {Promise<string>} The value of `page.content()`.
 */
const getEmail = async (request) => {
  const page = await pageRenderer(request);
  try {
    return await page.content();
  } finally {
    // Await the close and run it on failure too, so no tab is left open.
    await page.close();
  }
};
module.exports = getEmail;
You can implement this in many ways like having separate modules with functions, or different classes, and it depends on your particular need.
You can have a class that launches the browser and creates pages plus some extra functionalities.
//1.js
const puppeteer = require('puppeteer');
/**
 * Holds one Puppeteer browser and one page so other modules can share them.
 * Create instances with `Crawler.build()`, because a constructor cannot await.
 */
class Crawler {
  constructor() {
    // Set up any plain fields the class needs here, or throw if the object
    // was not created through build().
  }

  /**
   * Async factory: create a crawler and wait until its browser and page exist.
   * @returns {Promise<Crawler>}
   */
  static async build() {
    const instance = new Crawler();
    await instance._init();
    return instance;
  }

  /** Launch the browser and open the page this crawler keeps hold of. */
  async _init() {
    const browser = await puppeteer.launch({timeout: 0});
    this._browser = browser;
    this._page = await browser.newPage();
  }

  /** The browser launched by `_init`. */
  get browser() {
    return this._browser;
  }

  /** The page opened by `_init`. */
  get page() {
    return this._page;
  }

  /**
   * Navigate the shared page to the login URL.
   * @param {string} url
   */
  async login(url) {
    await this._page.goto(url);
    // Anything else the login process needs goes here.
  }
}
module.exports = {Crawler};
Note that we can't have async functions in the constructor. Since launching browser is async, we use something like a build function to initiate the browser when creating the object. Then we create the crawler object like this:
//2.js
const {Crawler} = require('./1.js');
// Builds a crawler, runs its login step and prints the page's URL.
(async () => {
  const crawler = await Crawler.build();
  await crawler.login("https://example.com");
  // Read the URL from the crawler's page.
  const currentUrl = crawler.page.url();
  console.log(currentUrl);
})();
Keep in mind that this is only an example and by no means representative of the best practices. So first, you need to understand what you want to achieve out of such encapsulation, then adopt the method that suits you best.
Read more on JS classes here

Resources