I have a small application built on puppeteer-extra that works through a proxy server. Sometimes the proxy server crashes and I get this error on the page.
If you click the "Reload" button, the page refreshes and everything is fine.
But how can I do it programmatically?
How do I catch such a tab drop?
require('dotenv').config();
const puppeteer = require('puppeteer-extra')
const PuppeteerExtraPluginProxy = require('puppeteer-extra-plugin-proxy2')
const pluginStealth = require('puppeteer-extra-plugin-stealth')
const sleep = require('./src/ToolsSleep');
/**
 * Launches a stealth browser behind a SOCKS proxy, opens the fetched link,
 * then continuously monitors every open tab, closing anything that is not
 * the desired page. Relies on helpers defined elsewhere in this file:
 * fetchLinkPage, isDesiredPage, DesiredPage, sleep.
 */
async function main() {
  puppeteer.use(PuppeteerExtraPluginProxy({
    // Credentials are separated from the host with "@", not "#":
    // socks://user:pass@host:port
    proxy: 'socks://username:password@gateproxy.com:6969',
  }));
  puppeteer.use(pluginStealth());
  let file_link = await fetchLinkPage();
  let browser = await puppeteer.launch({
    headless: false,
    userDataDir: './var/prof',
    args: [
      '--window-size=1200,1400',
      '--window-position=000,000',
      '--no-sandbox',
      '--disable-dev-shm-usage',
      '--disable-web-security',
      '--disable-features=IsolateOrigins',
      '--disable-site-isolation-trials'
    ]
  });
  let page = await browser.newPage();
  await page.setExtraHTTPHeaders({ referer: file_link.referer });
  await page.goto(file_link.link);
  while (true) {
    // Re-read the tab list on every pass so tabs opened after start-up
    // (ad popups etc.) are picked up; reading it once before the loop
    // would only ever see the initial set of tabs.
    const pages = await browser.pages();
    // Plain for...of: browser.pages() resolves to an ordinary array, so
    // `for await` was misleading (it is meant for async iterables).
    for (const tab of pages) {
      await sleep(1500);
      if (await isDesiredPage(tab)) {
        await DesiredPage(tab);
      } else {
        // We will close the ad if it is in other tabs.
        await tab.close();
      }
    }
    await sleep(500);
  }
}
// Surface a fatal error instead of rethrowing inside .catch(): a rethrow
// there only produces a second, unhandled promise rejection.
main().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});
I want to make sure that my "Reload" button is pressed automatically when the tab drops. How do I do this?
Related
I need to open the privacy-policy page of a website (for example https://codedec.com/privacy-policy/) starting from the website's home page
(https://codedec.com/), scraping the policy link from the HTML. This is my script, and when I run it I get this error:
Page.goto
(C:\Users\Wijden\Desktop\testWork\node_modules\puppeteer\lib\cjs\pupp
at scrapePolicyLinks (C:\Users\Wijden\Desktop\testWork\app.js:69:16)
at processTicksAndRejections (node:internal/process/task_queues:96:5)
{ originalMessage: 'Cannot navigate to invalid URL'
// NOTE(review): the paste lost the function header; reconstructed from the
// call sites below as `async function getPolicyLink()` — confirm against
// the original source.
async function getPolicyLink() {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://codedec.com/');
  let policy_link = await page.evaluate(() => {
    // ".#menu-item-70" mixed class and id syntax and is an invalid CSS
    // selector; the element is addressed by its id alone.
    let policy_btn = document.querySelector("#menu-item-70 > a");
    return policy_btn.getAttribute("href");
  });
  // `policy_btn` only exists inside the evaluate() callback, so the old
  // `console.log(policy_btn.getAttribute("href"))` threw a ReferenceError;
  // log the value returned by evaluate() instead.
  console.log(policy_link);
  await browser.close(); // don't leak the browser process
  return policy_link;
}
getPolicyLink();
/**
 * Opens the policy page in a visible browser.
 * @param {string} policyLink - URL to navigate to. The original code passed
 * the literal string "policy_link", which is what triggers
 * "Cannot navigate to invalid URL" — the actual URL value must be supplied.
 */
async function scrapePolicyLinks(policyLink = 'https://codedec.com/privacy-policy/')
{
  let browser = await puppeteer.launch({ headless: false }); //headless:false so we can watch the browser as it works
  let page = await browser.newPage(); //open a new page
  await page.goto(policyLink); //access the policy page
}
scrapePolicyLinks();
I am trying to click the "Create File" button on fakebook's download your information page. I am currently able to goto the page, and I wait for the login process to finish. However, when I try to detect the button using
page.$x("//div[contains(text(),'Create File')]")
nothing is found. The same thing occurs when I try to find it in the chrome dev tools console, both in a puppeteer window and in a regular window outside of the instance of chrome puppeteer is controlling:
This is the html info for the element:
I am able to find the element however after I have clicked on it using the chrome dev tools inspector tool:
(the second print statement is from after I have clicked on it with the element inspector tool)
How should I select this element? I am new to puppeteer and to xpath so I apologize if I just missed something obvious.
A small few links I currently remember looking at previously:
Puppeteer can't find selector
puppeteer cannot find element
puppeteer: how to wait until an element is visible?
My Code:
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Opens Facebook's "Download Your Information" page, waits for the user to
// finish logging in (including MFA round-trips), then clicks "Create File".
(async () => {
  let browser;
  try {
    puppeteer.use(StealthPlugin());
    browser = await puppeteer.launch({
      headless: false,
      // path: "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
      args: ["--disable-notifications"],
    });
    const pages = await browser.pages();
    const page = pages[0];
    const url = "https://www.facebook.com/dyi?referrer=yfi_settings";
    await page.goto(url);
    //Handle the login process. Since the login page is different from the url we want, I am going to assume the user
    //has logged in if they return to the desired page.
    //Wait for the login page to process
    await page.waitForFunction(
      (args) => {
        return window.location.href !== args[0];
      },
      { polling: "mutation", timeout: 0 },
      [url]
    );
    //Since multifactor auth can resend the user temporarly to the desired url, use a little debouncing to make sure the user is completely done signing in
    // make sure there is no redirect for mfa
    await page.waitForFunction(
      async (args) => {
        // function to make sure there is a debouncing delay between checking the url
        // Taken from: https://stackoverflow.com/a/49813472/11072972
        function delay(delayInms) {
          return new Promise((resolve) => {
            setTimeout(() => {
              resolve(2);
            }, delayInms);
          });
        }
        if (window.location.href === args[0]) {
          await delay(2000);
          return window.location.href === args[0];
        }
        return false;
      },
      { polling: "mutation", timeout: 0 },
      [url]
    );
    // await page.waitForRequest(url, { timeout: 100000 });
    const requestArchiveXpath = "//div[contains(text(),'Create File')]";
    await page.waitForXPath(requestArchiveXpath);
    const [requestArchiveSelector] = await page.$x(requestArchiveXpath);
    // page.click() expects a CSS selector string; for an ElementHandle the
    // click method lives on the handle itself.
    await requestArchiveSelector.click();
    // waitForTimeout returns a promise — without await the script fell
    // straight through to browser.close() before the click took effect.
    await page.waitForTimeout(3000);
  } catch (e) {
    console.log("End Error: ", e);
  } finally {
    if (browser) {
      await browser.close();
    }
  }
})();
Resolved using the comment above by @vsemozhebuty and source. Only the last few lines inside the `try` must change:
// "//iframe[not(#hidden)]" is not valid XPath — attribute tests use the
// "@" axis; the "@" was mangled into "#" in the original post.
const iframeXpath = "//iframe[not(@hidden)]";
const requestArchiveXpath = "//div[contains(text(),'Create File')]";
//Wait for and get iframe
await page.waitForXPath(iframeXpath);
const [iframeHandle] = await page.$x(iframeXpath);
//content frame for iframe => https://devdocs.io/puppeteer/index#elementhandlecontentframe
const frame = await iframeHandle.contentFrame();
//Wait for and get button inside the frame (the button lives in the iframe,
//which is why page-level $x never found it)
await frame.waitForXPath(requestArchiveXpath);
const [requestArchiveSelector] = await frame.$x(requestArchiveXpath);
//click button via its handle
await requestArchiveSelector.click();
await page.waitForTimeout(3000);
I've tried a few things, i.e.:
await page.click('.ytp-fullscreen-button.ytp-button') // click on fullscreen button
await page.keyboard.press('f') // press f to open fullscreen
await page.keyboard.down('f'); await page.keyboard.up('f'); //similar to previous
await page.evaluate(() => document.getElementsByClassName('ytp-fullscreen-button ytp-button')[0].click()) //injecting js and using it to click on fullscreen button
but nothing worked, is there a way to enter fullscreen mode on youtube using puppeteer?
This seems working for me:
import puppeteer from 'puppeteer';

// Open a YouTube video in a visible browser and toggle fullscreen by
// clicking the player's fullscreen control from inside the page context.
const browser = await puppeteer.launch({ headless: false, defaultViewport: null });
try {
  const [page] = await browser.pages();
  // David Lynch's Weather Report 7/22/21
  await page.goto('https://www.youtube.com/watch?v=MlyNWpf1N0s');
  const fullscreenButton = '.ytp-fullscreen-button.ytp-button';
  // Wait until the player controls exist before trying to click.
  await page.waitForSelector(fullscreenButton);
  await page.evaluate((selector) => {
    document.querySelector(selector).click();
  }, fullscreenButton);
} catch (err) {
  console.error(err);
}
I am trying to scrape some startups data of a site with puppeteer and when I try to navigate to the next page the cloudflare waiting screen comes in and disrupts the scraper. I tried changing the IP but its still the same. Is there a way to bypass it with puppeteer.
// Collects startup profile links from startupranking.com pages 1-7,
// pausing on page 3 to let Cloudflare's challenge screen pass.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null,
  });
  const page = await browser.newPage();
  page.setDefaultNavigationTimeout(0);
  let links = [];
  // initial page
  await page.goto(`https://www.startupranking.com/top/india`, {
    waitUntil: "networkidle0",
  });
  // looping through the url to different pages
  for (let i = 2; i <= 7; i++) {
    if (i === 3) {
      console.log("waiting");
      // page.waitFor() was deprecated and removed in Puppeteer v10+;
      // waitForTimeout is the supported replacement.
      await page.waitForTimeout(20000);
      console.log("waited");
    }
    const onPageLinks = await page.$$eval("tr .name a", (arr) =>
      arr.map((cur) => cur.href)
    );
    links = links.concat(onPageLinks);
    console.log(onPageLinks, "inside loop");
    await page.goto(`https://www.startupranking.com/top/india/${i}`, {
      waitUntil: "networkidle0",
    });
  }
  console.log(links, links.length, "outside loop");
  // Release the browser once scraping is done (it leaked before).
  await browser.close();
})();
As it only checks on the first loop, I put in a waitFor to allow for the time the check takes. It works fine from some IPs, but on others it presents challenges to solve. I have to run this on a server, so I am thinking of bypassing it completely.
I have created a Puppeteer script to run offline, and I have the below code to take the screenshot. While running the offline-login-check.js script from the command prompt, could someone please advise where the screenshots are saved?
const puppeteer = require("puppeteer");
// Loads the site, flips the page into offline mode, and screenshots the
// service-worker-served fallback content.
(async() => {
  const browser = await puppeteer.launch({
    headless: true,
    // "chromeWebSecurity" is a Cypress option, not a Puppeteer one — it was
    // silently ignored here, so it has been dropped.
    args: ['--no-sandbox']
  });
  try {
    // Create a new page
    const page = await browser.newPage();
    // Connect to Chrome DevTools
    const client = await page.target().createCDPSession();
    // page.waitFor() was deprecated and removed in Puppeteer v10+;
    // waitForTimeout is the supported replacement.
    await page.waitForTimeout(3000);
    await page.goto('https://sometestsite.net/home', {waitUntil: 'networkidle0'});
    //await page.goto(url, {waitUntil: 'networkidle0'});
    // Make sure the service worker is active before going offline.
    await page.evaluate('navigator.serviceWorker.ready');
    console.log('Going offline');
    await page.setOfflineMode(true);
    // Does === true for the main page but the fallback content isn't being served.
    page.on('response', r => console.log(r.fromServiceWorker()));
    await page.reload({waitUntil: 'networkidle0'});
    await page.waitForTimeout(5000);
    // Saved relative to the working directory the script is launched from.
    await page.screenshot({path: 'screenshot.png', fullPage: true});
    // The attribute selector was missing its closing quote
    // ('[id="route-tile-card]'), which makes querySelector throw.
    await page.waitForSelector('mat-card[id="route-tile-card"]');
    await page.click('mat-card[id="route-tile-card"]');
    await page.waitForTimeout(3000);
  } catch(e) {
    // handle initialization error
    console.log ("Timeout or other error: ", e);
  } finally {
    // Close in finally so the browser is released even on error.
    await browser.close();
  }
})();
const puppeteer = require('puppeteer');
// Answer variant: same offline-screenshot flow against example.com, with a
// visible browser so the process can be watched.
(async() => {
  const browser = await puppeteer.launch({
    headless: false,
    // "chromeWebSecurity" is a Cypress option, not a Puppeteer one — it was
    // silently ignored here, so it has been dropped.
    args: ['--no-sandbox']
  });
  try {
    // Create a new page
    const page = await browser.newPage();
    // Connect to Chrome DevTools
    const client = await page.target().createCDPSession();
    // Navigate and take a screenshot
    await page.goto('https://example.com', {waitUntil: 'networkidle0'});
    // await page.evaluate('navigator.serviceWorker.ready');
    console.log('Going offline');
    await page.setOfflineMode(true);
    // Does === true for the main page but the fallback content isn't being served.
    page.on('response', r => console.log(r.fromServiceWorker()));
    await page.reload({waitUntil: 'networkidle0'});
    await page.screenshot({path: 'screenshot2.png', fullPage: true});
    // Note the closing quote inside the attribute selector, which the
    // original commented-out lines were missing:
    // await page.waitForSelector('mat-card[id="route-tile-card"]');
    // await page.click('mat-card[id="route-tile-card"]');
  } catch(e) {
    // handle initialization error
    console.log ("Timeout or other error: ", e);
  } finally {
    // Close in finally so the browser is released even on error.
    await browser.close();
  }
})();
Then, on the command line, run `ls | grep .png` and you should see the screenshot there. Be aware that I got rid of `await page.evaluate('navigator.serviceWorker.ready');`, which might be specific to your website.
Your script is perfect. There is no problem with it!
The screenshot.png should be on the directory that you run the node offline-login-check.js command.
If it's not there, maybe you are getting some error/timeout before the page.screenshot command runs. Since your script is OK, this can be caused by network issues or issues with the page. For example, if your page has a never-ending connection (like a WebSocket), change "networkidle0" to "networkidle2" or "load"; otherwise the first page.goto will get stuck.
Again, your script is perfect. You don't have to change it.