How to catch a tab drop in puppeteer-extra and refresh the page? - node.js

I have a small application on puppeteer-extra, it works through a proxy server, sometimes the proxy server crashes and I get this error on the page.
if you click the "reload" button, the page will refresh and everything will be fine.
But how can I do it programmatically?
How do I catch such a tab drop?
require('dotenv').config();
const puppeteer = require('puppeteer-extra')
const PuppeteerExtraPluginProxy = require('puppeteer-extra-plugin-proxy2')
const pluginStealth = require('puppeteer-extra-plugin-stealth')
const sleep = require('./src/ToolsSleep');
async function main() {
puppeteer.use(PuppeteerExtraPluginProxy({
proxy: 'socks://username:password#gateproxy.com:6969',
}))
puppeteer.use(pluginStealth());
let file_link = await fetchLinkPage();
let browser = await puppeteer.launch({
headless: false,
userDataDir: './var/prof',
args: [
'--window-size=1200,1400',
'--window-position=000,000',
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-web-security',
'--disable-features=IsolateOrigins',
'--disable-site-isolation-trials'
]
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders({ referer: file_link.referer })
await page.goto(file_link.link);
let pages = await browser.pages();
while (true) {
for await (let tab of pages) {
await sleep(1500);
if (await isDesiredPage(tab)) {
await DesiredPage(tab);
}else{
// we will close the ad if it is in other tabs
await tab.close();
}
}
await sleep(500);
}
}
main().catch((e) => {
throw e
})
I want to make sure that my "relaod" button is pressed automatically when the tab drops. How do I do this?

Related

WebScraping with puppeteer : Invalid link error for Policy_link

I need to open the page of policy of a website (for example https://codedec.com/privacy-policy/) from the html page of the website
with the website url ( https://codedec.com/) scraping the policy link this is my script and when I run it I got this error :
Page.goto
(C:\Users\Wijden\Desktop\testWork\node_modules\puppeteer\lib\cjs\pupp
at scrapePolicyLinks (C:\Users\Wijden\Desktop\testWork\app.js:69:16)
at processTicksAndRejections (node:internal/process/task_queues:96:5)
{ originalMessage: 'Cannot navigate to invalid URL'
{
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://codedec.com/');
let policy_link = await page.evaluate(() => {
let policy_btn = document.querySelector(".#menu-item-70 > a");
return policy_btn.getAttribute("href");
});
console.log(policy_btn.getAttribute("href"));
return policy_link;
}
console.log(getPolicyLink);
getPolicyLink()
async function scrapePolicyLinks()
{
let browser = await puppeteer.launch({ headless: false }); //headless:false so we can watch the browser as it works
let page = await browser.newPage(); //open a new page
await page.goto("policy_link"); //access the policy page
}
scrapePolicyLinks()```

Puppeteer unable to find element using xPath contains(text()) until after element has been selected in chrome dev tools

I am trying to click the "Create File" button on fakebook's download your information page. I am currently able to goto the page, and I wait for the login process to finish. However, when I try to detect the button using
page.$x("//div[contains(text(),'Create File')]")
nothing is found. The same thing occurs when I try to find it in the chrome dev tools console, both in a puppeteer window and in a regular window outside of the instance of chrome puppeteer is controlling:
This is the html info for the element:
I am able to find the element however after I have clicked on it using the chrome dev tools inspector tool:
(the second print statement is from after I have clicked on it with the element inspector tool)
How should I select this element? I am new to puppeteer and to xpath so I apologize if I just missed something obvious.
A small few links I currently remember looking at previously:
Puppeteer can't find selector
puppeteer cannot find element
puppeteer: how to wait until an element is visible?
My Code:
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
(async () => {
let browser;
try {
puppeteer.use(StealthPlugin());
browser = await puppeteer.launch({
headless: false,
// path: "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
args: ["--disable-notifications"],
});
const pages = await browser.pages();
const page = pages[0];
const url = "https://www.facebook.com/dyi?referrer=yfi_settings";
await page.goto(url);
//Handle the login process. Since the login page is different from the url we want, I am going to assume the user
//has logged in if they return to the desired page.
//Wait for the login page to process
await page.waitForFunction(
(args) => {
return window.location.href !== args[0];
},
{ polling: "mutation", timeout: 0 },
[url]
);
//Since multifactor auth can resend the user temporarly to the desired url, use a little debouncing to make sure the user is completely done signing in
// make sure there is no redirect for mfa
await page.waitForFunction(
async (args) => {
// function to make sure there is a debouncing delay between checking the url
// Taken from: https://stackoverflow.com/a/49813472/11072972
function delay(delayInms) {
return new Promise((resolve) => {
setTimeout(() => {
resolve(2);
}, delayInms);
});
}
if (window.location.href === args[0]) {
await delay(2000);
return window.location.href === args[0];
}
return false;
},
{ polling: "mutation", timeout: 0 },
[url]
);
// await page.waitForRequest(url, { timeout: 100000 });
const requestArchiveXpath = "//div[contains(text(),'Create File')]";
await page.waitForXPath(requestArchiveXpath);
const [requestArchiveSelector] = await page.$x(requestArchiveXpath);
await page.click(requestArchiveSelector);
page.waitForTimeout(3000);
} catch (e) {
console.log("End Error: ", e);
} finally {
if (browser) {
await browser.close();
}
}
})();
Resolved using the comment above by #vsemozhebuty and source. Only the last few lines inside the try must change:
const iframeXpath = "//iframe[not(#hidden)]";
const requestArchiveXpath = "//div[contains(text(),'Create File')]";
//Wait for and get iframe
await page.waitForXPath(iframeXpath);
const [iframeHandle] = await page.$x(iframeXpath);
//content frame for iframe => https://devdocs.io/puppeteer/index#elementhandlecontentframe
const frame = await iframeHandle.contentFrame();
//Wait for and get button
await frame.waitForXPath(requestArchiveXpath);
const [requestArchiveSelector] = await frame.$x(requestArchiveXpath);
//click button
await requestArchiveSelector.click();
await page.waitForTimeout(3000);

How to open youtube video in fullscreen mode with puppeteer?

I've tried few things i.e.
await page.click('.ytp-fullscreen-button.ytp-button') // click on fullscreen button
await page.keyboard.press('f') // press f to open fullscreen
await page.keyboard.down('f'); await page.keyboard.up('f'); //similar to previous
await page.evaluate(() => document.getElementsByClassName('ytp-fullscreen-button ytp-button')[0].click()) //injecting js and using it to click on fullscreen button
but nothing worked, is there a way to enter fullscreen mode on youtube using puppeteer?
This seems working for me:
import puppeteer from 'puppeteer';
const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
try {
const [page] = await browser.pages();
// David Lynch's Weather Report 7/22/21
await page.goto('https://www.youtube.com/watch?v=MlyNWpf1N0s');
await page.waitForSelector('.ytp-fullscreen-button.ytp-button');
await page.evaluate(() => {
document.querySelector('.ytp-fullscreen-button.ytp-button').click();
});
} catch (err) { console.error(err); }

Bypass Cloudflare with puppeteer

I am trying to scrape some startups data of a site with puppeteer and when I try to navigate to the next page the cloudflare waiting screen comes in and disrupts the scraper. I tried changing the IP but its still the same. Is there a way to bypass it with puppeteer.
(async () => {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null,
});
const page = await browser.newPage();
page.setDefaultNavigationTimeout(0);
let links = [];
// initial page
await page.goto(`https://www.startupranking.com/top/india`, {
waitUntil: "networkidle0",
});
// looping through the url to different pages
for (let i = 2; i <= 7; i++) {
if (i === 3) {
console.log("waiting");
await page.waitFor(20000);
console.log("waited");
}
const onPageLinks = await page.$$eval("tr .name a", (arr) =>
arr.map((cur) => cur.href)
);
links = links.concat(onPageLinks);
console.log(onPageLinks, "inside loop");
await page.goto(`https://www.startupranking.com/top/india/${i}`, {
waitUntil: "networkidle0",
});
}
console.log(links, links.length, "outside loop");
})();
As it is only checking for the first loop i put in a waitFor to bypass the time it takes to check, it works fine on some IP's but on others it gives challenges to solve, I have to run this on a server so I am thinking of bypassing it completely.

Screenshots location while running the puppeteer script

I have created a Puppeteer script to run in offline, I have got the below code to take the screenshot. While running the offline-login-check.js script from the command prompt, could some one please advise where the screen shots are added ?
const puppeteer = require("puppeteer");
(async() => {
const browser = await puppeteer.launch({
headless: true,
chromeWebSecurity: false,
args: ['--no-sandbox']
});
try {
// Create a new page
const page = await browser.newPage()
// Connect to Chrome DevTools
const client = await page.target().createCDPSession()
// Navigate and take a screenshot
await page.waitFor(3000);
await page.goto('https://sometestsite.net/home',{waitUntil: 'networkidle0'})
//await page.goto(url, {waitUntil: 'networkidle0'});
await page.evaluate('navigator.serviceWorker.ready');
console.log('Going offline');
await page.setOfflineMode(true);
// Does === true for the main page but the fallback content isn't being served.
page.on('response', r => console.log(r.fromServiceWorker()));
await page.reload({waitUntil: 'networkidle0'});
await page.waitFor(5000);
await page.screenshot({path: 'screenshot.png',fullPage: true})
await page.waitForSelector('mat-card[id="route-tile-card]');
await page.click('mat-card[id="route-tile-card]');
await page.waitFor(3000);
} catch(e) {
// handle initialization error
console.log ("Timeout or other error: ", e)
}
await browser.close();
})();
const puppeteer = require('puppeteer');
(async() => {
const browser = await puppeteer.launch({
headless: false,
chromeWebSecurity: false,
args: ['--no-sandbox']
});
try {
// Create a new page
const page = await browser.newPage();
// Connect to Chrome DevTools
const client = await page.target().createCDPSession();
// Navigate and take a screenshot
await page.goto('https://example.com', {waitUntil: 'networkidle0'});
// await page.evaluate('navigator.serviceWorker.ready');
console.log('Going offline');
await page.setOfflineMode(true);
// Does === true for the main page but the fallback content isn't being served.
page.on('response', r => console.log(r.fromServiceWorker()));
await page.reload({waitUntil: 'networkidle0'});
await page.screenshot({path: 'screenshot2.png',fullPage: true})
// await page.waitForSelector('mat-card[id="route-tile-card]');
// await page.click('mat-card[id="route-tile-card]');
} catch(e) {
// handle initialization error
console.log ("Timeout or other error: ", e)
}
await browser.close();
})();
then in command line run ls | GREP .png and you should see screenshot there. Be aware i take rid of await page.evaluate('navigator.serviceWorker.ready'); which might be specified to your website
Your script is perfect. There is no problem with it!
The screenshot.png should be on the directory that you run the node offline-login-check.js command.
If its not there, maybe you are getting some error/timeout before the page.screenshot command runs. Since your script is ok, this can be caused by network issues or issues with the page. For example, if your page has a never ending connection (like WebSocket), change the "networkidle0" to "networkidle2" or "load", otherwise the first page.goto will get stuck.
Again, your script is perfect. You don't have to change it.

Resources