Nodejs/Puppeteer - Navigation timeout - node.js

I need help to understand how timeouts work, especially with Node/Puppeteer.
I have read all the Stack Overflow questions and GitHub issues about this, but I can't figure out what is wrong.
Probably it is my code...
When I run this file, I receive the error shown in the image. You can see the ways I tried to fix it; nothing works.
Can someone explain why this happens and the best approach to avoid it? Is there a better way to get these projects?
//vou até os seeds em x tempo
var https = require('https');
var Q = require('q');
var fs = require('fs');
var puppeteer = require('puppeteer');
var Projeto = require('./Projeto.js');
const url = 'https://www.99freelas.com.br/projects?categoria=web-e-desenvolvimento'
/*const idToScrape;
deverá receber qual a url e os parametros específicos de cada seed */
/**
 * Opens the project listing at `url`, extracts every `.result-list li` row and
 * logs the resulting `Projeto` instances.
 *
 * Notes on the implementation:
 *  - The timeout and load condition are passed to `page.goto` itself. A
 *    separate, un-awaited `page.waitForNavigation()` issued before `goto`
 *    starts an independent wait whose timeout rejection nobody handles.
 *  - The callback given to `page.evaluate` runs inside the browser page, where
 *    Node-side classes such as `Projeto` do not exist. It therefore returns
 *    plain objects and the `Projeto` instances are built afterwards in Node.
 *  - Missing DOM nodes yield an empty string instead of throwing.
 *  - The browser is closed even when scraping fails.
 *
 * @returns {Promise<void>}
 */
async function genScraper() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { timeout: 60000, waitUntil: 'domcontentloaded' });

    // Executed in the page context: only serializable data may be returned.
    const linhas = await page.evaluate(() => {
      const textOf = (node) => (node ? node.textContent : '');
      const childOf = (node, index) => (node ? node.children[index] : undefined);
      return Array.from(document.querySelectorAll('.result-list li'), (item) => {
        const cabecalho = childOf(item, 1);
        const datas = childOf(cabecalho, 1);
        return {
          titulo: textOf(childOf(cabecalho, 0)),
          descricao: textOf(childOf(item, 2)),
          habilidades: textOf(childOf(item, 3)),
          publicado: textOf(childOf(datas, 0)),
          tempoRestante: textOf(childOf(datas, 1)),
        };
      });
    });

    const projetos = linhas.map(
      ({ titulo, descricao, habilidades, publicado, tempoRestante }) =>
        new Projeto(titulo, descricao, habilidades, publicado, tempoRestante)
    );
    console.log(projetos);
  } finally {
    await browser.close();
  }
}
genScraper();

I recommend that you avoid calling waitForNavigation before the goto call.
Basically, it would be better to use goto with its default timeout, which is 30000 ms. In my opinion, if the website takes more than 30 seconds to respond, something is wrong.
Instead, I would do something like this:
await page.goto(url, {
waitUntil: 'networkidle0'
});
Depending on the version of puppeteer that you're using, you will have different behaviours. I am using version 1.4.0 and it is working good so far.
Inside the documentation states the following:
The page.goto will throw an error if:
there's an SSL error (e.g. in case of self-signed certificates).
target URL is invalid.
the timeout is exceeded during navigation.
the main resource failed to load.
So, check that none of the previous scenarios is happening.
Also, you can curl the URL from your terminal to see if the URL respond to outside calls, cross origin problems are common too.
Sincerely, there is no way to say what can be triggering your timeout, but that checklist should help. I had a problem with timeout recently and the problem was my server configuration, so I suggest you to see also if the machine in which you are running this code, has the necessary memory to execute.

In your for loop,
for( var i=0; i<=listaDeProjs; i++ ) {
...
}
listaDeProjs should be listaDeProjs.length
Your evaluation script will fail in several places, if anywhere along this path is undefined: (E.g., if children[1] is undefined or children[0] is undefined.)
listaDeProjs[i].children[1].children[0].textContent;
You can do the following with lodash:
_.get(listaDeProjs[i],"children[1].children[0].textContent","")
That will default to "" if there is no such value.
Additionally, the following works perfectly fine with your code in 1.7 via https://try-puppeteer.appspot.com/
// Navigate and consider the page loaded once at most two network connections
// remain open; give up after 5 seconds.
await page.goto(url, {
  waitUntil: 'networkidle2',
  timeout: 5000, // milliseconds, passed as a number rather than a string
});

Related

Puppeteer pdf image not rendering correctly [duplicate]

I am working on creating PDF from web page.
The application on which I am working is single page application.
I tried many options and suggestion on https://github.com/GoogleChrome/puppeteer/issues/1412
But it is not working
// Launch an explicitly located Chrome executable in headless mode, ignoring
// HTTPS errors and with the sandbox disabled.
const browser = await puppeteer.launch({
executablePath: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
ignoreHTTPSErrors: true,
headless: true,
devtools: false,
args: ['--no-sandbox', '--disable-setuid-sandbox']
});
const page = await browser.newPage();
// Open the target URL and wait for the 'networkidle2' condition.
await page.goto(fullUrl, {
waitUntil: 'networkidle2'
});
// Fill in the login form and submit it.
await page.type('#username', 'scott');
await page.type('#password', 'tiger');
await page.click('#Login_Button');
// Fixed pause after the click; this is the delay the question wants to remove.
await page.waitFor(2000);
// Write the current page to `outputFileName` as an A4 PDF with backgrounds,
// using empty header and footer templates.
await page.pdf({
path: outputFileName,
displayHeaderFooter: true,
headerTemplate: '',
footerTemplate: '',
printBackground: true,
format: 'A4'
});
What I want is to generate PDF report as soon as Page is loaded completely.
I don't want to write any type of delays i.e. await page.waitFor(2000);
I can not do waitForSelector because the page has charts and graphs which are rendered after calculations.
Help will be appreciated.
You can use page.waitForNavigation() to wait for the new page to load completely before generating a PDF:
// Open the login page and wait until the network has gone quiet.
await page.goto(fullUrl, {
  waitUntil: 'networkidle0',
});
await page.type('#username', 'scott');
await page.type('#password', 'tiger');
// Start waiting for the navigation *before* triggering it. If the click is
// awaited first, the navigation may already have finished by the time
// waitForNavigation() is called, and the wait then runs into its timeout.
await Promise.all([
  page.waitForNavigation({
    waitUntil: 'networkidle0',
  }),
  page.click('#Login_Button'),
]);
// The post-login page has settled; export it as an A4 PDF.
await page.pdf({
  path: outputFileName,
  displayHeaderFooter: true,
  headerTemplate: '',
  footerTemplate: '',
  printBackground: true,
  format: 'A4',
});
If there is a certain element that is generated dynamically that you would like included in your PDF, consider using page.waitForSelector() to ensure that the content is visible:
await page.waitForSelector('#example', {
visible: true,
});
Sometimes the networkidle events do not always give an indication that the page has completely loaded. There could still be a few JS scripts modifying the content on the page. So watching for the completion of HTML source code modifications by the browser seems to be yielding better results. Here's a function you could use -
/**
 * Polls the page until the length of its serialized HTML has stayed the same
 * for three consecutive polls, or until the poll budget is used up.
 *
 * One poll is made per second, so at most `timeout / 1000` polls happen.
 * Reaching the budget is not an error: the function simply returns.
 *
 * @param {object} page - Object exposing `content()`, `evaluate()` and
 *   `waitForTimeout()` (a Puppeteer page).
 * @param {number} [timeout=30000] - Overall budget in milliseconds.
 * @returns {Promise<void>}
 */
const waitTillHTMLRendered = async (page, timeout = 30000) => {
  const pollIntervalMs = 1000;
  const requiredStableRounds = 3;
  const pollLimit = timeout / pollIntervalMs;

  let previousSize = 0;
  let stableRounds = 0;

  for (let round = 1; round <= pollLimit; round++) {
    const markup = await page.content();
    const currentSize = markup.length;
    // Only used for the progress log below.
    const bodySize = await page.evaluate(() => document.body.innerHTML.length);
    console.log('last: ', previousSize, ' <> curr: ', currentSize, " body html size: ", bodySize);

    // The very first poll has nothing to compare against (previousSize is 0).
    const unchanged = previousSize !== 0 && currentSize === previousSize;
    stableRounds = unchanged ? stableRounds + 1 : 0;

    if (stableRounds >= requiredStableRounds) {
      console.log("Page rendered fully..");
      break;
    }

    previousSize = currentSize;
    await page.waitForTimeout(pollIntervalMs);
  }
};
You could use this after the page load / click function call and before you process the page content. e.g.
await page.goto(url, {'timeout': 10000, 'waitUntil':'load'});
await waitTillHTMLRendered(page)
const data = await page.content()
In some cases, the best solution for me was:
await page.goto(url, { waitUntil: 'domcontentloaded' });
Some other options you could try are:
await page.goto(url, { waitUntil: 'load' });
await page.goto(url, { waitUntil: 'domcontentloaded' });
await page.goto(url, { waitUntil: 'networkidle0' });
await page.goto(url, { waitUntil: 'networkidle2' });
You can check this at puppeteer documentation:
https://pptr.dev/#?product=Puppeteer&version=v11.0.0&show=api-pagewaitfornavigationoptions
I always like to wait for selectors, as many of them are a great indicator that the page has fully loaded:
await page.waitForSelector('#blue-button');
In the latest Puppeteer version, networkidle2 worked for me:
await page.goto(url, { waitUntil: 'networkidle2' });
Wrap the page.click and page.waitForNavigation in a Promise.all
await Promise.all([
page.click('#submit_button'),
page.waitForNavigation({ waitUntil: 'networkidle0' })
]);
I encountered the same issue with networkidle when I was working on an offscreen renderer. I needed a WebGL-based engine to finish rendering and only then make a screenshot. What worked for me was a page.waitForFunction() method. In my case the usage was as follows:
await page.goto(url);
await page.waitForFunction("renderingCompleted === true")
const imageBuffer = await page.screenshot({});
In the rendering code, I was simply setting the renderingCompleted variable to true, when done. If you don't have access to the page code you can use some other existing identifier.
You can also use to ensure all elements have rendered
await page.waitFor('*')
Reference: https://github.com/puppeteer/puppeteer/issues/1875
As for December 2020, waitFor function is deprecated, as the warning inside the code tell:
waitFor is deprecated and will be removed in a future release. See
https://github.com/puppeteer/puppeteer/issues/6214 for details and how
to migrate your code.
You can use:
/**
 * Returns a promise that resolves after the given delay.
 *
 * Declared as a plain function so that it can be called directly, as in
 * `await sleep(1000)`. A missing or zero delay resolves immediately; a promise
 * is returned in every case so callers can always `await` or chain the result.
 *
 * @param {number} millisecondsCount - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(millisecondsCount) {
  if (!millisecondsCount) {
    return Promise.resolve();
  }
  return new Promise((resolve) => setTimeout(resolve, millisecondsCount));
}
And use it:
(async () => {
await sleep(1000);
})();
Keeping in mind the caveat that there's no silver bullet to handle all page loads, one strategy is to monitor the DOM until it's been stable (i.e. has not seen a mutation) for more than n milliseconds. This is similar to the network idle solution but geared towards the DOM rather than requests and therefore covers a different subset of loading behaviors.
Generally, this code would follow a page.waitForNavigation({waitUntil: "domcontentloaded"}) or page.goto(url, {waitUntil: "domcontentloaded"}), but you could also wait for it alongside, say, waitForNetworkIdle() using Promise.all() or Promise.race().
Here's a simple example:
const puppeteer = require("puppeteer"); // ^14.3.0
/**
 * Resolves once `document.body` has seen no DOM mutation for `idleTime`
 * milliseconds, and rejects if that has not happened within `timeout`
 * milliseconds.
 *
 * Each option has its own default, so passing only one of them (for example
 * `{timeout: 5000}`) keeps the default for the other.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)` (a Puppeteer page).
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - Overall limit in milliseconds.
 * @param {number} [options.idleTime=2000] - Required quiet period in milliseconds.
 * @returns {Promise<void>} Whatever `page.evaluate` settles with.
 */
const waitForDOMStable = (
  page,
  {timeout = 30000, idleTime = 2000} = {}
) =>
  page.evaluate(({timeout, idleTime}) =>
    new Promise((resolve, reject) => {
      let idleTimer;

      // Every mutation restarts the quiet-period countdown.
      const observer = new MutationObserver(() => {
        clearTimeout(idleTimer);
        idleTimer = setTimeout(finish, idleTime);
      });

      const deadline = setTimeout(() => {
        // Stop the idle countdown too so nothing fires after the rejection.
        clearTimeout(idleTimer);
        observer.disconnect();
        const msg = `timeout of ${timeout} ms ` +
          "exceeded waiting for DOM to stabilize";
        reject(Error(msg));
      }, timeout);

      const finish = () => {
        clearTimeout(deadline);
        observer.disconnect();
        resolve();
      };

      const config = {
        attributes: true,
        childList: true,
        subtree: true
      };
      observer.observe(document.body, config);
      idleTimer = setTimeout(finish, idleTime);
    }),
    {timeout, idleTime}
  )
;
const html = `<!DOCTYPE html><html lang="en"><head>
<title>test</title></head><body><h1></h1><script>
(async () => {
for (let i = 0; i < 10; i++) {
document.querySelector("h1").textContent += i + " ";
await new Promise(r => setTimeout(r, 1000));
}
})();
</script></body></html>`;
let browser;
(async () => {
browser = await puppeteer.launch({headless: true});
const [page] = await browser.pages();
await page.setContent(html);
await waitForDOMStable(page);
console.log(await page.$eval("h1", el => el.textContent));
})()
.catch(err => console.error(err))
.finally(() => browser?.close())
;
For pages that continually mutate the DOM more often than the idle value, the timeout will eventually trigger and reject the promise, following the typical Puppeteer fallback. You can set a more aggressive overall timeout to fit your needs or tailor the logic to ignore (or only monitor) a particular subtree.
Answers so far haven't mentioned a critical fact: it's impossible to write a one-size-fits-all waitUntilPageLoaded function that works on every page. If it were possible, Puppeteer would surely provide it.
Such a function can't rely on a timeout, because there's always some page that takes longer to load than that timeout. As you extend the timeout to reduce the failure rate, you introduce unnecessary delays when working with fast pages. Timeouts are generally a poor solution, opting out of Puppeteer's event-driven model.
Waiting for idle network requests might not always work if the responses involve long-running DOM updates that take longer than 500ms to trigger a render.
Waiting for the DOM to stop changing might miss slow network requests, long-delayed JS triggers, or ongoing DOM manipulation that might cause the listener never to settle, unless specially handled.
And, of course, there's user interaction: captchas, prompts and cookie/subscription modals that need to be clicked through and dismissed before the page is in a sensible state for a full-page screenshot (for example).
Since every page has different, arbitrary JS behavior, the typical approach is to write event-driven logic that works for a specific page. Making precise, directed assumptions is much better than cobbling together a boatload of hacks that tries to solve every edge case.
If your use case is to write a load event that works on every page, my suggestion is to use some combination of the tools described here that is most balanced to meet your needs (speed vs. accuracy, development time/code complexity vs. accuracy, etc). Use fail-safes for everything rather than blindly assuming all pages will cooperate with your assumptions. Think hard about to what extent you really need to try to handle every web page. Prepare to compromise and accept some degree of failures you can live with.
Here's a quick rundown of the strategies you can mix and match to wait for loads to fit your needs:
page.goto() and page.waitForNavigation() default to the load event, which "is fired when the whole page has loaded, including all dependent resources such as stylesheets and images" (MDN), but this is often too pessimistic; there's no need to wait for a ton of data you don't care about. Often the data is available without waiting for all external resources, so domcontentloaded should be faster. See my post Avoiding Puppeteer Antipatterns for further discussion.
On the other hand, if there are JS-triggered networks requests after load, you'll miss that data. Hence networkidle2 and networkidle0, which wait 500 ms after the number of active network requests are 2 or 0. The motivation for the 2 version is that some sites keep ongoing requests open, which would cause networkidle0 to time out.
If you're waiting for a specific network response that might have a payload (or, for the general case, implementing your own network idle monitor), use page.waitForResponse(). page.waitForRequest(), page.waitForNetworkIdle() and page.on("request", ...) are also useful here.
If you're waiting for a particular selector to be visible, use page.waitForSelector(). If you're waiting for a load on a specific page, identify a selector that indicates the state you want to wait for. Generally speaking, for scripts specific to one page, this is the main tool to wait for the state you want, whether you're extracting data or clicking something. Frames and shadow roots thwart this function.
page.waitForFunction() lets you wait for an arbitrary predicate, for example, checking that the page's HTML or a specific list is a certain length. It's also useful for quickly dipping into frames and shadow roots to wait for predicates that depend on nested state. This function is also handy for detecting DOM mutations.
The most general tool is page.evaluate(), which plugs code into the browser. You can put just about any conditions you want here; most other Puppeteer functions are convenience wrappers for common cases you could implement by hand with evaluate.
I can't leave comments, but I made a python version of Anand's answer for anyone who finds it useful (i.e. if they use pyppeteer).
async def waitTillHTMLRendered(page: "Page", timeout: int = 30000):
    """Wait until the length of the page's HTML stops changing.

    The HTML is fetched once per second. The function returns as soon as its
    length has been identical for three consecutive checks, or when
    ``timeout / 1000`` checks have been made, whichever comes first. Running
    out of checks is not an error.

    :param page: object with awaitable ``content()`` and ``waitFor(ms)``
        methods (a pyppeteer page).
    :param timeout: overall budget in milliseconds.
    """
    check_duration_m_secs = 1000
    max_checks = timeout / check_duration_m_secs
    last_html_size = 0
    check_counts = 1
    count_stable_size_iterations = 0
    min_stable_size_iterations = 3

    while check_counts <= max_checks:
        check_counts += 1
        html = await page.content()
        current_html_size = len(html)

        # The first check has no previous size to compare against.
        if last_html_size != 0 and current_html_size == last_html_size:
            count_stable_size_iterations += 1
        else:
            count_stable_size_iterations = 0  # reset the counter

        if count_stable_size_iterations >= min_stable_size_iterations:
            break

        last_html_size = current_html_size
        await page.waitFor(check_duration_m_secs)
For me the { waitUntil: 'domcontentloaded' } is always my go to.
I found that networkidle doesnt work well...

waitForSelector suddenly no longer working in puppeteer

I have a working puppeteer script that I'd like to make into an API but I'm having problems with waitForSelector.
Background:
I wrote a puppeteer script that successfully searches for and scrapes the result of a query I specify in the code e.g. let address = xyz;. Now I'd like to make it into an API so that a user can query something. I managed to code everything necessary for the local API (working with express) and everything works as well. By that I mean: I coded all the server side stuff: I can make a request, the scraper function is called, puppeteer starts up, carries out my search (I need to type in an address, choose from a dropdown and press enter).
The status:
The result of my query is a form (basically 3 columns and some rows) in an iFrame and I want to scrape all the rows (I modify them into a specific json later on). The way it works is I use waitForSelector on the form's selector and then I use frame.evaluate.
Problem:
When I run my normal scraper everything works well, but when I run the (slightly modified but essentially same) code within the API framework, waitForSelector suddenly always times out. I have tried all the usual workarounds: waitForNavigation, taking a screenshot and inspecting etc but nothing helped. I've been reading quite a bit and could it be that I'm screwing something up in terms of async/await when I call my scraper from within the context of the API? I'm still quite new to this so please bear with me. This is the code of the working script - I indicated the important part
const puppeteer = require("puppeteer");
const chalk = require("chalk");
const fs = require('fs');
const error = chalk.bold.red;
const success = chalk.keyword("green");
// Address to look up; declared with const so it is not an implicit global.
const address = 'Gumpendorfer Straße 12, 1060 Wien';

/**
 * Looks up `address` in the form embedded in the page's iframe, scrapes the
 * result rows and logs them as `{ [address]: [ {header: value, ...}, ... ] }`.
 *
 * Every interaction waits for the element it needs rather than sleeping:
 * typing and clicking the autocomplete entry before it exists makes the search
 * run with a wrong value, which only shows up later as a timeout on the
 * result selector.
 */
(async () => {
  let browser;
  try {
    // open the headless browser
    browser = await puppeteer.launch();
    // open a new page
    const page = await browser.newPage();
    // enter url in page
    await page.goto(`https://mein.wien.gv.at/Meine-Amtswege/richtwert?subpage=/lagezuschlag/`, {waitUntil: 'networkidle2'});
    // continue without newsletter
    await page.click('#dss-modal-firstvisit-form > button.btn.btn-block.btn-light');

    console.log('waiting for iframe with form to be ready.');
    // wait for the relevant iframe itself, not just for any iframe
    const elementHandle = await page.waitForSelector(
      'iframe[src="/richtwertfrontend/lagezuschlag/"]',
    );
    console.log('iframe is ready. Loading iframe content');
    // go into frame in order to input info
    const frame = await elementHandle.contentFrame();

    // enter address
    console.log('filling form in iframe');
    await frame.waitForSelector('#input_adresse');
    await frame.type('#input_adresse', address, { delay: 100});

    // choose first option from dropdown, once the suggestion has been rendered
    console.log('Choosing from dropdown');
    await frame.waitForSelector('#react-autowhatever-1--item-0', { visible: true });
    await frame.click('#react-autowhatever-1--item-0');

    // press button to search
    console.log('pressing button');
    await frame.click('#next-button');

    // scraping data
    console.log('scraping');
    await frame.waitForSelector('#summary > div > div > br ~ div');
    const res = await frame.evaluate(() => {
      const rows = [...document.querySelectorAll('#summary > div > div > br ~ div')];
      return rows.map(
        row => [...row.querySelectorAll('div')]
          .map(cell => cell.innerText)
      );
    });

    await browser.close();
    browser = undefined; // already closed; the catch block must not close it again
    console.log(success("Browser Closed"));

    // Turns each row of arr2 into an object keyed by the header names in arr1;
    // cells without a matching header are dropped.
    const mapFields = (arr1, arr2) =>
      arr2.map((el) => {
        const mappedArrayEl = {};
        el.forEach((value, i) => {
          if (arr1.length < (i + 1)) return;
          mappedArrayEl[arr1[i]] = value;
        });
        return mappedArrayEl;
      });

    const Arr1 = res[0];          // header row
    const Arr2 = res.slice(1, 3); // the two data rows that follow it
    const dataObj = { [address]: mapFields(Arr1, Arr2) };
    console.log(dataObj);
  } catch (err) {
    // Catch and display errors
    console.log(error(err));
    // The launch itself may have failed, in which case there is nothing to close.
    if (browser) {
      await browser.close();
      console.log(error("Browser Closed"));
    }
  }
})();
I just can't understand why it would work in the one case and not in the other, even though I barely changed something. For the API I basically changed the name of the async function to const search = async (address) => { such that I can call it with the query in my server side script.
Thanks in advance - I'm not attaching the API code cause I don't want to clutter the question. I can update it if it's necessary
I solved this myself. Turns out the problem wasn't as complicated as I thought and it was annoyingly simple to solve. The problem wasn't with the selector that was timing out but with the previous selectors, specifically the typing and choosing from dropdown selectors. Essentially, things were going too fast. Before the search query was typed in, the dropdown was already pressed and nonsense came out. How I solved it: I included a waitFor(1000) call before the dropdown is selected and everything went perfectly. An interesting realisation was that even though that one selector timed out, it wasn't actually the source of the problem. But like I said, annoyingly simple and I feel dumb for asking this :) but maybe someone will see this and learn from my mistake

How to get puppeteer to simply load a web page?

I can't get puppeteer to do anything. I'm simply trying to get it to show google.com and I can't even get it to do that. Here's my code:
console.log('Loading puppeteer...');
const puppeteer = require('puppeteer');
/**
 * Launches a visible browser, opens google.com and closes the browser again.
 *
 * The browser is closed in a `finally` block so that a failing `newPage()` or
 * `goto()` does not leave a browser process running.
 *
 * @returns {Promise<void>}
 */
async function test() {
  console.log('Launching browser...');
  const browser = await puppeteer.launch({headless: false});
  try {
    console.log('Creating new page...');
    const page = await browser.newPage();
    console.log('Requesting url...');
    await page.goto('https://www.google.com');
  } finally {
    console.log('Closing browser...');
    await browser.close();
  }
}
test().catch(e=>{console.log(e)});
Chromium crashes every single time I try to do anything...
Then I get a timeout error:
Loading puppeteer...
Launching browser...
TimeoutError: waiting for target failed: timeout 30000ms exceeded
...
...
I've been searching for a solution for literally weeks. Does this thing just not work anymore?
After looking at this thread, which identifies this as a well-known issue with Puppeteer, here is some more information on Puppeteer timeout problems.
Puppeteer.launch() has two parts that can cause timeout problems. One is goto timing out, and the other is waitfor timing out. Since I don't know what could be causing your specific issue, I'll give you potential solutions for both.
Possible issue #1: Goto is timing out.
I'll directly quote the person who posted this solution, rudiedirkx:
In my case the goto timeout happens because of a forever-loading blocking resource (js or css). That'll never trigger the page's load or domcontentloaded. A bug in Puppeteer IMO, but whatever.
My fix (FINALLY!) is to do what Lighthouse does in its Driver: a Promise.race() for a custom 'timeout'-ish. The shorter version I used:
const LOAD_FAIL = Math.random();
/**
 * Returns a promise that resolves after `options.ms` milliseconds.
 *
 * The promise resolves with `options.result`, or with `true` when no result
 * was given. The timer handle is stored on `options.timer` so the caller can
 * cancel it with `clearTimeout`.
 */
const sleep = (options) => {
  const { ms, result } = options;
  const resolution = result === undefined ? true : result;
  return new Promise((resolve) => {
    options.timer = setTimeout(resolve, ms, resolution);
  });
};
const sleepOptions = {ms: TIMEOUT - 1000, result: LOAD_FAIL};
const response = await Promise.race([
sleep(sleepOptions),
page.goto(url, {timeout: TIMEOUT + 1000}),
]);
clearTimeout(sleepOptions.timer);
const success = response !== LOAD_FAIL;
Possible issue #2: Waitfor is timing out.
Alternatively you can try the solution to a waitfor timeout given by dealeros, adding --enable-blink-features=HTMLImports in args:
browser = await puppeteer.launch({
//headless: false,
'args': [
'--enable-blink-features=HTMLImports'
]
});
If neither of those worked
If neither of these solutions work, I recommend browsing that thread to find more solutions people have suggested and see if you can narrow down the problem. Use this code to generate some console logs and see if you can find what's going wrong:
page
.on('console', message =>
console.log(`${message.type().substr(0, 3).toUpperCase()} ${message.text()}`))
.on('pageerror', ({ message }) => console.log(message))
.on('response', response =>
console.log(`${response.status()} ${response.url()}`))
.on('requestfailed', request =>
console.log(`${request.failure().errorText} ${request.url()}`));
These options both resolved the issue for me:
Kill all Chromium processes
pkill -o chromium
Reinstall node packages (if step 1 doesn't help)
rm -rf node_modules
npm install

Google Cloud Platform - Optimise Cloud Function using puppeteer (Node.js)

I have written a function in node.js that works well when I run it locally (~10s to run).
As I want to run it every hour, I have deployed it on Google Cloud Platform. But, there, I always have a TimeOut error.
Therefore, do you have any advice on:
what I should change in my function to make it more efficient?
a alternate way to automate my function so it runs every hour?
FYI my cloud function has the following characteristics:
Node js 8
Memory: 2Go
Timeout: 540 seconds
and the following form:
exports.launchSearch = (req, res) => {
const puppeteer = require('puppeteer');
const url = require('./pageInformation').url;
const pageLocation = require('./pageInformation').location;
const userInformation = require('./userInformation').information;
(async () => {
const browser = await puppeteer.launch({args: ['--no-sandbox']});
const page = await browser.newPage();
await page.goto(url);
// Part 1
await page.click(pageLocation['...']);
await page.type(pageLocation['...'], userInformation['...']);
await page.waitFor(pageLocation['...']);
await page.click(pageLocation['...']);
... ~20 other "page.click" or "page.select"
// Part 2
var continueLoop = true;
while (continueLoop) {
var list = await page.$x(pageLocation['...']);
if (list.length > 0) {
await list[0].click();
var found = true;
var continueLoop = false;
} else {
var afficher = await page.$x(pageLocation['...']);
if (afficher.length > 0) {
await afficher[0].click();
} else {
var continueLoop = false;
var found = false;
};
};
};
// Part 3
if (found) {
await page.waitForXPath(pageLocation['...']);
const xxx = await page.$x(pageLocation['...']);
await xxx[0].click();
... 5 other blocks with exact same 3 lines, but with other elements to click
};
await browser.close();
})();
};
I have tried to run it part by part; sometimes it times out at the end of Part 1, sometimes at the end of Part 2. But the whole script never entirely completed.
Without much context about what your code does, it is hard to point out the root cause. What I can tell you is to continue debugging your code as Horatio suggested, or to use a more sophisticated tool like Stackdriver to monitor the performance of your Cloud Functions. Evaluate its pricing if you are interested.
If Stackdriver is an overkill, simply make use of inline function wrapping to find out the exact place of your routine that consuming all that time. Here is an example:
// Measure how long yourfunction() takes, in milliseconds.
// process.hrtime(start) returns [seconds, nanoseconds]; the nanosecond part is
// only the remainder below one second, so both parts must be combined.
// If yourfunction() is asynchronous, await it before taking the second reading.
const start = process.hrtime();
yourfunction();
const [elapsedSeconds, elapsedNanoseconds] = process.hrtime(start);
const elapsed = elapsedSeconds * 1000 + elapsedNanoseconds / 1000000;
console.log("Elapsed:" + elapsed.toFixed(3));
Once you have the exact piece of code that is affecting the execution, then you probably may have to optimize it. Additionally, as I understand that locally it worked perfectly, consider that sometimes processes running in Cloud environment are affected by latency due the 'proximity' of the other resources they consume.
Regarding your second question, about automating your function to be executed every hour. You can take advantage of Cloud Scheduler. It has the capability to make scheduled calls to HTTP/HTTPS endpoints, which Cloud Functions classify as one of those. Make sure to check its pricing also.

How to avoid being detected as bot on Puppeteer and Phantomjs?

Puppeteer and PhantomJS are similar. The issue I'm having is happening for both, and the code is also similar.
I'd like to catch some informations from a website, which needs authentication for viewing those informations. I can't even access home page because it's detected like a "suspicious activity", like the SS: https://i.imgur.com/p69OIjO.png
I discovered that the problem doesn't happen when I tested on Postman using a header named Cookie and the value of it's cookie caught on browser, but this cookie expires after some time. So I guess Puppeteer/PhantomJS both are not catching cookies, because this site is denying the headless browser access.
What could I do for bypass this?
// Simple Javascript example
// Opens the site and renders it to home.png.
// phantom.exit() is called on every path: a PhantomJS script keeps running
// until it is told to exit, so exiting only on success leaves the process
// hanging whenever the page fails to load.
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';
page.open(url, function (status) {
  var loaded = status === "success";
  if (loaded) {
    page.render("home.png");
  } else {
    console.log("Failed to load " + url + " (status: " + status + ")");
  }
  phantom.exit(loaded ? 0 : 1);
});
If anyone need in future for the same problem.
Using puppeteer-extra
I have tested the code on a server. On the 2nd run there is a Google Captcha. You can solve it yourself and restart the bot, or use a Captcha solving service.
I ran the code more than 10 times and there was no IP ban. I did not get the captcha again on my continuous runs.
But you can get the captcha again!
//sudo npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker readline
var headless_mode = process.argv[2]
const readline = require('readline');
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
async function run () {
const browser = await puppeteer.launch({
headless:(headless_mode !== 'true')? false : true,
ignoreHTTPSErrors: true,
slowMo: 0,
args: ['--window-size=1400,900',
'--remote-debugging-port=9222',
"--remote-debugging-address=0.0.0.0", // You know what your doing?
'--disable-gpu', "--disable-features=IsolateOrigins,site-per-process", '--blink-settings=imagesEnabled=true'
]})
const page = await browser.newPage();
console.log(`Testing expertflyer.com`)
//await page.goto('https://www.expertflyer.com')
await goto_Page('https://www.expertflyer.com')
await waitForNetworkIdle(page, 3000, 0)
//await page.waitFor(7000)
await checking_error(do_2nd_part)
async function do_2nd_part(){
try{await page.click('#yui-gen2 > a')}catch{}
await page.waitFor(5000)
var seat = '#headerTitleContainer > h1'
try{console.log(await page.$eval(seat, e => e.innerText))}catch{}
await page.screenshot({ path: 'expertflyer1.png'})
await checking_error(do_3nd_part)
}
async function do_3nd_part(){
try{await page.click('#yui-gen1 > a')}catch{}
await page.waitFor(5000)
var pro = '#headerTitleContainer > h1'
try{console.log(await page.$eval(pro, e => e.innerText))}catch{}
await page.screenshot({ path: 'expertflyer2.png'})
console.log(`All done, check the screenshots?`)
}
/**
 * Checks whether the site's menu links are present and, if so, runs
 * `callback`. When they are missing, the page is treated as an error page:
 *  - captcha message: ask the user to solve it, then run `callback`;
 *  - IP-block message: log it and stop;
 *  - anything else: wait 10 seconds and check again with the same `callback`.
 *
 * Failures are logged, not silently discarded, so a broken step is visible.
 *
 * @param {Function} callback - Async step to run once the page is usable.
 * @returns {Promise<void>}
 */
async function checking_error(callback){
  try {
    let menu_link_count;
    try {
      menu_link_count = await page.evaluate(() => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length);
    } catch (error) {
      console.log(`catch error ${error}`);
    }

    // Only an explicit count of 0 means "error page"; a failed evaluate leaves
    // the count undefined and falls through to the success path, as before.
    if (menu_link_count !== 0) {
      console.log(`Page loaded successfully! You can do things here.`);
      await callback();
      return;
    }

    console.log(`Error found`);
    const captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected.";
    const ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below.";

    let error_msg_details;
    try {
      error_msg_details = await page.$eval('body > p:nth-child(2)', e => e.innerText);
    } catch (err) {
      // The paragraph is not there (yet); handled by the final branch below.
    }

    if (error_msg_details === captcha_msg) {
      console.log(`Google Captcha found, You have to solve the captch here manually or some automation recaptcha service`);
      await verify_User_answer();
      await callback();
    } else if (error_msg_details === ip_blocked) {
      console.log(`The current ip address is blocked. The only way is change the ip address.`);
    } else {
      console.log(`Waiting for error page load... Waiting for 10 sec before rechecking...`);
      await page.waitFor(10000);
      // Pass the callback along; otherwise the retry has nothing to run.
      await checking_error(callback);
    }
  } catch (err) {
    console.log(`checking_error failed: ${err}`);
  }
}
/**
 * Navigates `page` to `page_URL` (waitUntil 'networkidle2', 30 s timeout per
 * attempt) and tries again when the navigation fails.
 *
 * Retries run in a loop instead of recursing, and the reason for each failed
 * attempt is logged rather than discarded.
 *
 * @param {string} page_URL - Address to open.
 * @param {number} [max_Attempts=Infinity] - Upper bound on navigation attempts;
 *   at least one attempt is always made. The default keeps retrying forever.
 * @returns {Promise<void>} Resolves after the first successful navigation.
 * @throws The last navigation error once `max_Attempts` attempts have failed.
 */
async function goto_Page(page_URL, max_Attempts = Infinity){
  for (let attempt = 1; ; attempt += 1) {
    try {
      await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 });
      return;
    } catch (error) {
      if (attempt >= max_Attempts) {
        throw error;
      }
      console.log(`Error in loading page, re-trying...`);
      console.log(`Attempt ${attempt} failed: ${error}`);
    }
  }
}
/**
 * Keeps prompting (via readLine) until the user answers "yes", then runs
 * `call_back` if one was supplied.
 *
 * Every answer is checked exactly once; previously a rejected answer triggered
 * an extra prompt whose input was printed and then thrown away. The comparison
 * ignores surrounding whitespace and letter case.
 *
 * @param {Function} [call_back] - Optional async step to run after confirmation.
 * @returns {Promise<void>} Settles after `call_back` (if any) has settled.
 */
async function verify_User_answer(call_back){
  for (;;) {
    const user_Answer = await readLine();
    if (String(user_Answer).trim().toLowerCase() === 'yes') {
      console.log(`user_Answer is ${user_Answer}, Processing...`);
      // Callers may confirm without a follow-up step, so the callback is optional.
      if (typeof call_back === 'function') {
        await call_back();
      }
      return;
    }
    console.log(`answer not match. try again...`);
  }
}
/**
 * Asks the captcha confirmation question on the terminal and resolves with the
 * line the user typed. The readline interface is closed before the answer is
 * handed back.
 *
 * @returns {Promise<string>} The user's answer.
 */
async function readLine() {
  const terminal = readline.createInterface({ input: process.stdin, output: process.stdout });
  const prompt = 'Solve the captcha and type yes to continue: ';
  const typed = await new Promise((deliver) => {
    terminal.question(prompt, (line) => {
      terminal.close();
      deliver(line);
    });
  });
  return typed;
}
/**
 * Resolves once the number of in-flight requests on `page` has stayed at
 * `maxInflightRequests` or below for `timeout` milliseconds.
 *
 * Requests are counted through the page's 'request', 'requestfinished' and
 * 'requestfailed' events; the listeners are removed again before resolving.
 *
 * @param {object} page - Emitter offering on/removeListener for the events above.
 * @param {number} timeout - Quiet period in milliseconds.
 * @param {number} [maxInflightRequests=0] - Requests tolerated while still idle.
 * @returns {Promise<void>}
 */
async function waitForNetworkIdle(page, timeout, maxInflightRequests = 0) {
  console.log('waitForNetworkIdle called')
  let pending = 0;
  let idleTimer;
  let settle;
  const idle = new Promise((resolve) => { settle = resolve; });

  const finish = () => {
    for (const [eventName, handler] of subscriptions) {
      page.removeListener(eventName, handler);
    }
    settle();
  };
  const armTimer = () => {
    idleTimer = setTimeout(finish, timeout);
  };
  const handleStart = () => {
    pending += 1;
    // Too busy to count as idle: cancel the running quiet-period timer.
    if (pending > maxInflightRequests) {
      clearTimeout(idleTimer);
    }
  };
  const handleEnd = () => {
    // A request that started before the listeners were attached is not counted.
    if (pending === 0) {
      return;
    }
    pending -= 1;
    // Just dropped back to the tolerated level: start a new quiet period.
    if (pending === maxInflightRequests) {
      armTimer();
    }
  };
  const subscriptions = [
    ['request', handleStart],
    ['requestfinished', handleEnd],
    ['requestfailed', handleEnd],
  ];

  for (const [eventName, handler] of subscriptions) {
    page.on(eventName, handler);
  }
  armTimer();
  return idle;
}
await browser.close()
}
run();
Please note: the "Solve the captcha and type yes to continue: " prompt does not work as expected yet and needs some fixing.
Edit: I re-ran the bot after 10 minutes and got the captcha again. I solved the captcha via chrome://inspect/#devices and restarted the bot, and everything worked again. No IP ban.
Things that can help in general :
Headers should be similar to common browsers, including :
User-Agent : use a recent one (see https://developers.whatismybrowser.com/useragents/explore/), or better, use a random recent one if you make multiple requests (see https://github.com/skratchdot/random-useragent)
Accept-Language : something like "en,en-US;q=0.5" (adapt for your language)
Accept: a standard one would be like "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
If you make multiple request, put a random timeout between them
If you open links found in a page, set the Referer header accordingly
Images should be enabled
Javascript should be enabled
Check that "navigator.plugins" and "navigator.language" are set in the client javascript page context
Use proxies
If you look at it from the website's perspective, you are indeed doing something suspicious. So whenever you want to bypass something like this, make sure to think the way they think.
Set cookie properly
Puppeteer and PhantomJS etc will use real browsers and the cookies used there are better than when using via postman or such. You just need to use cookie properly.
You can use page.setCookie(...cookies) to set the cookies. Cookies are serialized, so if cookies is an array of objects, you can simply do this:
const cookies = [{name: 'test', value: 'foo'}, {name: 'test2', value: 'foo'}]; // just as example, use real cookies here;
await page.setCookie(...cookies);
Try to tweak the behaviors
Turn off the headless mode and see the behavior of the website.
await puppeteer.launch({headless: false})
Try proxies
Some websites monitor traffic by IP address; if multiple hits come from the same IP, they block the requests. It's best to use rotating proxies in that case.
The website you are trying to visit uses Distil Networks to prevent web scraping.
People have had success in the past bypassing Distil Networks by substituting the $cdc_ variable found in Chromium's call_function.js (which is used in Puppeteer).
For example:
/**
 * Returns the cache object stored on the document under `key`, creating it on
 * first use: a CacheWithUUID when `opt_w3c` is truthy, a Cache otherwise.
 *
 * @param {Document} [opt_doc] - Document to attach the cache to; defaults to `document`.
 * @param {boolean} [opt_w3c] - Selects the CacheWithUUID variant when truthy.
 * @returns {Cache|CacheWithUUID} The cache stored on the document.
 */
function getPageCache(opt_doc, opt_w3c) {
  const targetDoc = opt_doc || document;
  const useW3c = opt_w3c || false;
  // var key = '$cdc_asdjflasutopfhvcZLmcfl_'; <-- This is the line that is changed.
  const key = '$something_different_';
  if (!(key in targetDoc)) {
    targetDoc[key] = useW3c ? new CacheWithUUID() : new Cache();
  }
  return targetDoc[key];
}
Note: According to this comment, if you have been blacklisted before you make this change, you face another set of challenges, so you must "implement fake canvas fingerprinting, disable flash, change IP, and change request header order (swap language and Accept headers)."

Resources