How to avoid being detected as a bot with Puppeteer and PhantomJS? - node.js

Puppeteer and PhantomJS are similar; the issue I'm having happens with both, and the code is also similar.
I'd like to scrape some information from a website that requires authentication to view it. I can't even reach the home page because it's detected as "suspicious activity", as in this screenshot: https://i.imgur.com/p69OIjO.png
I found that the problem doesn't happen when I test in Postman with a Cookie header whose value I copied from the browser, but that cookie expires after a while. So I guess Puppeteer/PhantomJS aren't picking up cookies, and that's why the site is denying the headless browser access.
What can I do to bypass this?
// Simple JavaScript (PhantomJS) example
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';

page.open(url, function (status) {
    if (status === "success") {
        page.render("home.png");
        phantom.exit();
    }
});
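For reference, here is roughly the same access test written for Puppeteer (a sketch; like the PhantomJS snippet above, it only loads the home page and takes a screenshot):

// Roughly equivalent Puppeteer version of the PhantomJS snippet above
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto('https://www.expertflyer.com', { waitUntil: 'networkidle2' });
    await page.screenshot({ path: 'home.png' });
    await browser.close();
})();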

In case anyone needs this in the future for the same problem:
Using puppeteer-extra
I have tested the code on a server. On the second run there is a Google CAPTCHA. You can solve it yourself and restart the bot, or use a CAPTCHA-solving service.
I ran the code more than 10 times and there was no IP ban, and I did not get the CAPTCHA again on subsequent runs. But you can still get the CAPTCHA again!
// sudo npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker readline
var headless_mode = process.argv[2]
const readline = require('readline');
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))

async function run () {
    const browser = await puppeteer.launch({
        headless: headless_mode === 'true',
        ignoreHTTPSErrors: true,
        slowMo: 0,
        args: [
            '--window-size=1400,900',
            '--remote-debugging-port=9222',
            '--remote-debugging-address=0.0.0.0', // only if you know what you're doing
            '--disable-gpu',
            '--disable-features=IsolateOrigins,site-per-process',
            '--blink-settings=imagesEnabled=true'
        ]
    })
    const page = await browser.newPage();
    console.log(`Testing expertflyer.com`)

    // await page.goto('https://www.expertflyer.com')
    await goto_Page('https://www.expertflyer.com')
    await waitForNetworkIdle(page, 3000, 0)
    // await page.waitFor(7000)
    await checking_error(do_2nd_part)

    async function do_2nd_part() {
        try { await page.click('#yui-gen2 > a') } catch {}
        await page.waitFor(5000)
        var seat = '#headerTitleContainer > h1'
        try { console.log(await page.$eval(seat, e => e.innerText)) } catch {}
        await page.screenshot({ path: 'expertflyer1.png' })
        await checking_error(do_3rd_part)
    }

    async function do_3rd_part() {
        try { await page.click('#yui-gen1 > a') } catch {}
        await page.waitFor(5000)
        var pro = '#headerTitleContainer > h1'
        try { console.log(await page.$eval(pro, e => e.innerText)) } catch {}
        await page.screenshot({ path: 'expertflyer2.png' })
        console.log(`All done, check the screenshots?`)
    }
    async function checking_error(callback) {
        try {
            try {
                var error_found = await page.evaluate(() => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length)
            } catch (error) {
                console.log(`catch error ${error}`)
            }
            if (error_found === 0) {
                console.log(`Error found`)
                var captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected."
                var ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below."
                try { var error_msg = await page.$eval('h2', e => e.innerText) } catch {}
                try { var error_msg_details = await page.$eval('body > p:nth-child(2)', e => e.innerText) } catch {}
                if (error_msg_details == captcha_msg) {
                    console.log(`Google CAPTCHA found. You have to solve the CAPTCHA manually here or use an automated CAPTCHA-solving service`)
                    await verify_User_answer()
                    await callback()
                } else if (error_msg_details == ip_blocked) {
                    console.log(`The current IP address is blocked. The only way out is to change the IP address.`)
                } else {
                    console.log(`Waiting for the error page to load... Waiting 10 sec before rechecking...`)
                    await page.waitFor(10000)
                    await checking_error(callback) // keep passing the callback on retry
                }
            } else {
                console.log(`Page loaded successfully! You can do things here.`)
                await callback()
            }
        } catch {}
    }

    async function goto_Page(page_URL) {
        try {
            await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 });
        } catch {
            console.log(`Error loading page, retrying...`)
            await goto_Page(page_URL)
        }
    }
    async function verify_User_answer(call_back) {
        var user_Answer = await readLine();
        if (user_Answer == 'yes') {
            console.log(`user_Answer is ${user_Answer}, processing...`)
            // Not working the way I want yet. Will fix later.
            // Have to restart the bot after solving.
            if (call_back) await call_back() // guard: the caller may not pass a callback
        } else {
            console.log(`answer does not match, try again...`)
            user_Answer = await readLine();
            console.log(`user_Answer is ${user_Answer}`)
            await verify_User_answer(call_back)
        }
    }

    async function readLine() {
        const rl = readline.createInterface({
            input: process.stdin,
            output: process.stdout
        });
        return new Promise(resolve => {
            rl.question('Solve the captcha and type yes to continue: ', (answer) => {
                rl.close();
                resolve(answer)
            });
        })
    }
    async function waitForNetworkIdle(page, timeout, maxInflightRequests = 0) {
        console.log('waitForNetworkIdle called')
        page.on('request', onRequestStarted);
        page.on('requestfinished', onRequestFinished);
        page.on('requestfailed', onRequestFinished);
        let inflight = 0;
        let fulfill;
        let promise = new Promise(x => fulfill = x);
        let timeoutId = setTimeout(onTimeoutDone, timeout);
        return promise;

        function onTimeoutDone() {
            page.removeListener('request', onRequestStarted);
            page.removeListener('requestfinished', onRequestFinished);
            page.removeListener('requestfailed', onRequestFinished);
            fulfill();
        }
        function onRequestStarted() {
            ++inflight;
            if (inflight > maxInflightRequests)
                clearTimeout(timeoutId);
        }
        function onRequestFinished() {
            if (inflight === 0)
                return;
            --inflight;
            if (inflight === maxInflightRequests)
                timeoutId = setTimeout(onTimeoutDone, timeout);
        }
    }

    await browser.close()
}

run();
Please note that the "Solve the captcha and type yes to continue:" prompt is not working as expected and needs some fixing.
Edit: Re-ran the bot after 10 minutes and got the CAPTCHA again. Solved the CAPTCHA via chrome://inspect/#devices, restarted the bot, and everything worked again. No IP ban.

Things that can help in general (a short Puppeteer sketch of these follows the list):
Headers should be similar to those of common browsers, including:
User-Agent: use a recent one (see https://developers.whatismybrowser.com/useragents/explore/), or better, use a random recent one if you make multiple requests (see https://github.com/skratchdot/random-useragent)
Accept-Language: something like "en,en-US;q=0.5" (adapt for your language)
Accept: a standard one would be "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
If you make multiple requests, put a random delay between them
If you open links found in a page, set the Referer header accordingly
Images should be enabled
JavaScript should be enabled
Check that "navigator.plugins" and "navigator.language" are set in the client-side JavaScript page context
Use proxies
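A minimal Puppeteer sketch of the header and fingerprint points above (the user-agent string, header values and the https://example.com URL are illustrative placeholders, not values from the question):

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // Pretend to be a recent, common desktop browser
    await page.setUserAgent(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.setExtraHTTPHeaders({
        'Accept-Language': 'en,en-US;q=0.5',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    });

    // Quick check of what the page-side fingerprint looks like
    await page.goto('https://example.com', { waitUntil: 'networkidle2' });
    const fingerprint = await page.evaluate(() => ({
        webdriver: navigator.webdriver,
        language: navigator.language,
        plugins: navigator.plugins.length,
    }));
    console.log(fingerprint);

    await browser.close();
})();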

If you think from the website's perspective, you are indeed doing something suspicious. So whenever you want to bypass something like this, think about how they are thinking.
Set cookies properly
Puppeteer, PhantomJS, etc. drive real browsers, and the cookies used there work better than the ones you pass via Postman or similar tools. You just need to use the cookies properly.
You can use page.setCookie(...cookies) to set the cookies. Cookies are serialized, so if cookies is an array of objects, you can simply do this:
const cookies = [{name: 'test', value: 'foo'}, {name: 'test2', value: 'foo'}]; // just as example, use real cookies here;
await page.setCookie(...cookies);
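For example, one common pattern is to log in once with a visible browser, save that session's cookies, and restore them on later runs. A sketch (the cookies.json file name and the login step are placeholders):

const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();

    // Restore cookies saved from a previous, successfully logged-in session
    if (fs.existsSync('cookies.json')) {
        const saved = JSON.parse(fs.readFileSync('cookies.json', 'utf8'));
        await page.setCookie(...saved);
    }

    await page.goto('https://www.expertflyer.com', { waitUntil: 'networkidle2' });

    // ...log in here (manually or via the script), then persist the cookies for next time
    fs.writeFileSync('cookies.json', JSON.stringify(await page.cookies(), null, 2));

    await browser.close();
})();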
Try to tweak the behavior
Turn off headless mode and watch how the website behaves.
await puppeteer.launch({headless: false})
Try proxies
Some websites monitor by IP address: if multiple hits come from the same IP, they block the requests. In that case it's best to use rotating proxies, for example:
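A sketch of wiring a proxy into Puppeteer (the proxy address and credentials below are placeholders):

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch({
        // Route all browser traffic through the proxy
        args: ['--proxy-server=http://proxy.example.com:8000'],
    });
    const page = await browser.newPage();
    // Only needed if the proxy requires authentication
    await page.authenticate({ username: 'user', password: 'pass' });
    await page.goto('https://www.expertflyer.com');
    await browser.close();
})();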

The website you are trying to visit uses Distil Networks to prevent web scraping.
People have had success in the past bypassing Distil Networks by substituting the $cdc_ variable found in ChromeDriver's call_function.js (an automation marker that ChromeDriver injects into pages).
For example:
function getPageCache(opt_doc, opt_w3c) {
    var doc = opt_doc || document;
    var w3c = opt_w3c || false;
    // var key = '$cdc_asdjflasutopfhvcZLmcfl_'; <-- This is the line that is changed.
    var key = '$something_different_';
    if (w3c) {
        if (!(key in doc))
            doc[key] = new CacheWithUUID();
        return doc[key];
    } else {
        if (!(key in doc))
            doc[key] = new Cache();
        return doc[key];
    }
}
Note: According to this comment, if you have been blacklisted before you make this change, you face another set of challenges, so you must "implement fake canvas fingerprinting, disable flash, change IP, and change request header order (swap language and Accept headers)."
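On the Puppeteer side, the equivalent markers live in the page's JavaScript environment rather than in a patched driver file. A sketch of hiding one well-known marker, navigator.webdriver (the stealth plugin in the first answer already does this, and much more, automatically):

const puppeteer = require('puppeteer');

(async () => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    // Hide the navigator.webdriver flag before any page script runs.
    // This only illustrates the idea; it is not a complete bypass on its own.
    await page.evaluateOnNewDocument(() => {
        Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });

    await page.goto('https://www.expertflyer.com');
    await browser.close();
})();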

Related

How to access page.getby* functions inside crawlee

I'm using Crawlee with PlaywrightCrawler. I get a new URL to crawl after clicking a few elements on the starting page. The way I'm clicking those elements is with page.getByRole().click(), which Playwright codegen generated:
import { chromium } from "@playwright/test";

const browser = await chromium.launch({
    headless: true,
});
const context = await browser.newContext();
const page = await context.newPage();

for (let i = 0; i < brandSection.length; i++) {
    let [brandName, brandCount] = brandSection[i].split("\n");
    await page
        .getByRole("button", { name: `${brandName} ${brandCount}` })
        .click();
}
So this works without Crawlee, but when I try to use it inside a PlaywrightCrawler, it fails, saying that the page instance doesn't have a method called .getByRole().
import { createPlaywrightRouter, enqueueLinks, Dataset } from "crawlee";
import { PlaywrightCrawler } from "crawlee";
....
....
router.addDefaultHandler(async ({ page, request, enqueueLinks }) => {
    const prodGridSel = ".catalog-grid a";
    //** here goes the code that uses .getByRole() **//
    await enqueueLinks({
        ...
    },
});
...
...
const crawler = new PlaywrightCrawler({
    requestHandler: router,
});
I haven't used Playwright for testing, only for crawling with Crawlee, so I'm guessing the getBy*() functions are only available when using "@playwright/test". I didn't find any information except this, which is related to Cypress and probably a faulty import.
So, can I have a page instance inside crawlee that has these functions?
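For what it's worth, a sketch of one workaround, assuming the Playwright version Crawlee resolves predates 1.27 (where the getBy* locators were added to Page): express the same click with page.locator() and a text filter, which the page instance inside the Crawlee handler does expose. If your installed Playwright is 1.27 or newer, page.getByRole() should already work on that same page object. The brandSection data below is a placeholder standing in for the question's elided context:

import { PlaywrightCrawler, createPlaywrightRouter } from "crawlee";

const brandSection = ["BrandA\n12", "BrandB\n7"]; // placeholder for the question's data

const router = createPlaywrightRouter();
router.addDefaultHandler(async ({ page }) => {
    for (const entry of brandSection) {
        const [brandName, brandCount] = entry.split("\n");
        // Rough equivalent of getByRole("button", { name: `${brandName} ${brandCount}` })
        await page
            .locator("button", { hasText: `${brandName} ${brandCount}` })
            .first()
            .click();
    }
});

const crawler = new PlaywrightCrawler({ requestHandler: router });
await crawler.run(["https://example.com"]); // placeholder start URL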

Web3 BatchRequest always returning undefined, what am I doing wrong?

I'm trying to use the web3 Batch in order to call token balances all together. When I call batch.execute() it returns undefined instead of the resolved requests that have been added to the batch.
Can someone enlighten me where I am messing things up?
Here is my code.
async generateContractFunctionList(
    address: Address,
    tokens: Token[],
    blockNumber: number
) {
    const batch = new this.web3.BatchRequest();
    for (let i = 0; i < tokens.length; i++) {
        const contract = new this.web3.eth.Contract(balanceABI as AbiItem[]);
        contract.options.address = tokens[i].address;
        batch.add(
            contract.methods
                .balanceOf(address.address)
                .call.request({}, blockNumber)
        );
    }
    return batch;
}

async updateBalances() {
    try {
        const addresses = await this.addressService.find();
        const tokens = await this.tokenService.find();
        const blockNumber = await this.web3.eth.getBlockNumber();
        for (let i = 0; i < addresses.length; i++) {
            const address = addresses[i];
            const batch = this.generateContractFunctionList(address, tokens, blockNumber);
            const response = await (await batch).execute();
            console.log(response); // returns undefined
        }
    } catch (error: unknown) {
        if (error instanceof Error) {
            console.log(`UpdateBalanceService updateBalances`, error.message);
        }
    }
}
Why does batch.execute() not return anything (it's void)? I went by the example from this article and modified it to my needs, without changing much of the stuff that could be messing it up:
https://chainstack.com/the-ultimate-guide-to-getting-multiple-token-balances-on-ethereum/
When I add a callback function to batch.add and console.log there, the balance does get logged to the console. But I am trying to use async/await on .execute(), so how can I get a result from the method calls with await batch.execute() and have all the callback results in there, like it's written in the blog post?
A quick and dirty solution is to use an outdated version of the package.
package.json:
....
"dependencies": {
    ...
    "web3": "^2.0.0-alpha.1",
    ...
}
....
I'm a developer advocate at Chainstack.
batch.add() requires a callback function as the last parameter. You can either change your code to pass a callback or, as mentioned above, use version 2.0.0-alpha as used in the article.
We'll update the article soon to use the latest version of web3.js.
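A sketch of the callback approach with web3.js 1.x, wrapping each batched request in a Promise so the balances can still be awaited even though execute() itself doesn't return them (the ABI fragment and parameter shapes mirror the question; treat the exact request signature as an assumption to verify against your web3 version):

// Minimal ERC-20 balanceOf ABI fragment
const balanceABI = [{
    constant: true,
    inputs: [{ name: '_owner', type: 'address' }],
    name: 'balanceOf',
    outputs: [{ name: 'balance', type: 'uint256' }],
    type: 'function',
}];

async function getBalances(web3, tokens, ownerAddress, blockNumber) {
    const batch = new web3.BatchRequest();

    // Each request resolves its own Promise through the batch callback
    const promises = tokens.map((token) => new Promise((resolve, reject) => {
        const contract = new web3.eth.Contract(balanceABI, token.address);
        batch.add(
            contract.methods.balanceOf(ownerAddress).call.request(
                {},
                blockNumber,
                (err, balance) => (err ? reject(err) : resolve(balance))
            )
        );
    }));

    batch.execute(); // fires the batch; results come back via the callbacks
    return Promise.all(promises);
}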

waitForSelector suddenly no longer working in puppeteer

I have a working puppeteer script that I'd like to make into an API but I'm having problems with waitForSelector.
Background:
I wrote a Puppeteer script that successfully searches for and scrapes the result of a query I specify in the code, e.g. let address = xyz;. Now I'd like to make it into an API so that a user can query something. I managed to code everything necessary for the local API (working with Express), and it all works: I can make a request, the scraper function is called, Puppeteer starts up and carries out my search (I need to type in an address, choose from a dropdown and press enter).
The status:
The result of my query is a form (basically 3 columns and some rows) in an iframe, and I want to scrape all the rows (I turn them into a specific JSON later on). The way it works is that I use waitForSelector on the form's selector and then frame.evaluate.
Problem:
When I run my normal scraper everything works well, but when I run the (slightly modified but essentially the same) code within the API framework, waitForSelector always times out. I have tried all the usual workarounds: waitForNavigation, taking a screenshot and inspecting it, etc., but nothing helped. I've been reading quite a bit; could it be that I'm screwing something up in terms of async/await when I call my scraper from within the context of the API? I'm still quite new to this, so please bear with me. This is the code of the working script (I've indicated the important part):
const puppeteer = require("puppeteer");
const chalk = require("chalk");
const fs = require('fs');

const error = chalk.bold.red;
const success = chalk.keyword("green");

address = 'Gumpendorfer Straße 12, 1060 Wien';

(async () => {
    try {
        // open the headless browser
        var browser = await puppeteer.launch();
        // open a new page
        var page = await browser.newPage();
        // enter url in page
        await page.goto(`https://mein.wien.gv.at/Meine-Amtswege/richtwert?subpage=/lagezuschlag/`, {waitUntil: 'networkidle2'});
        // continue without newsletter
        await page.click('#dss-modal-firstvisit-form > button.btn.btn-block.btn-light');
        // let everything load
        await page.waitFor(1000)
        console.log('waiting for iframe with form to be ready.');
        // wait until selector is available
        await page.waitForSelector('iframe');
        console.log('iframe is ready. Loading iframe content');
        // choose the relevant iframe
        const elementHandle = await page.$(
            'iframe[src="/richtwertfrontend/lagezuschlag/"]',
        );
        // go into frame in order to input info
        const frame = await elementHandle.contentFrame();
        // enter address
        console.log('filling form in iframe');
        await frame.type('#input_adresse', address, { delay: 100});
        // choose first option from dropdown
        console.log('Choosing from dropdown');
        await frame.click('#react-autowhatever-1--item-0');
        console.log('pressing button');
        // press button to search
        await frame.click('#next-button');
        // scraping data
        console.log('scraping')
        await frame.waitForSelector('#summary > div > div > br ~ div'); // This keeps failing in the API
        const res = await frame.evaluate(() => {
            const rows = [...document.querySelectorAll('#summary > div > div > br ~ div')];
            const cells = rows.map(
                row => [...row.querySelectorAll('div')]
                    .map(cell => cell.innerText)
            );
            return cells;
        });
        await browser.close();
        console.log(success("Browser Closed"));

        const mapFields = (arr1, arr2) => {
            const mappedArray = arr2.map((el) => {
                const mappedArrayEl = {};
                el.forEach((value, i) => {
                    if (arr1.length < (i+1)) return;
                    mappedArrayEl[arr1[i]] = value;
                });
                return mappedArrayEl;
            });
            return mappedArray;
        }

        const Arr1 = res[0];
        const Arr2 = res.slice(1,3);
        let dataObj = {};
        dataObj[address] = [];
        // dataObj['lagezuschlag'] = mapFields(Arr1, Arr2);
        // dataObj['adresse'] = address;
        dataObj[address] = mapFields(Arr1, Arr2);
        console.log(dataObj);
    } catch (err) {
        // Catch and display errors
        console.log(error(err));
        await browser.close();
        console.log(error("Browser Closed"));
    }
})();
I just can't understand why it works in one case and not in the other, even though I barely changed anything. For the API I basically changed the name of the async function to const search = async (address) => { so that I can call it with the query from my server-side script.
Thanks in advance. I'm not attaching the API code because I don't want to clutter the question; I can add it if necessary.
I solved this myself. It turns out the problem wasn't as complicated as I thought, and it was annoyingly simple to solve. The problem wasn't with the selector that was timing out but with the previous selectors, specifically the typing and dropdown steps. Essentially, things were going too fast: before the search query was typed in, the dropdown was already pressed and nonsense came out. How I solved it: I included a waitFor(1000) call before the dropdown is selected, and everything worked perfectly. An interesting realisation was that even though that one selector timed out, it wasn't actually the source of the problem. But like I said, annoyingly simple, and I feel dumb for asking this :) but maybe someone will see this and learn from my mistake.
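A slightly more robust variant of that fix, as a sketch using the selectors already in the script above (a drop-in replacement for the type/click sequence): instead of a fixed delay, wait for the autocomplete suggestion to actually appear before clicking it.

// enter address, then wait until the autocomplete suggestion exists
await frame.type('#input_adresse', address, { delay: 100 });
await frame.waitForSelector('#react-autowhatever-1--item-0', { visible: true });
await frame.click('#react-autowhatever-1--item-0');
// search, then wait for the results table before evaluating
await frame.click('#next-button');
await frame.waitForSelector('#summary > div > div > br ~ div');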

Why cant Puppeteer find this link element on page?

^^UPDATE^^
Willing to pay someone to walk me through this, issue posted on codeMentor.io: https://www.codementor.io/u/dashboard/my-requests/9j42b83f0p
I've been looking to click on the element:
<a id="isc_LinkItem_1$20j" href="javascript:void" target="javascript" tabindex="2"
onclick="if(window.isc_LinkItem_1) return isc_LinkItem_1.$30i(event);"
$9a="$9d">Reporting</a>
In: https://stackblitz.com/edit/js-nzhhbk
(I haven't included the actual page because it's behind a username & password)
seems easy enough
----------------------------------------------------------------------
solution1:
page.click('[id=isc_LinkItem_1$20j]') //not a valid selector
solution2:
const linkHandlers = await frame.$x("//a[contains(text(), 'Reporting')]");
if (linkHandlers.length > 0) {
    await linkHandlers[0].click();
} else {
    throw new Error('Link not found');
} // link not found
----------------------------------------------------------------------
I have looked at every which way to select and click it, and it says it isn't in the document even though it clearly is (verified by inspecting the HTML in Chrome dev tools and calling page.evaluate(() => document.body.innerHTML)).
**tried to see if it was in an iframe
**tried to select by id
**tried to select by inner text
**tried to console.log the body in the browser (console logging not working, verified on the inspected element) // nothing happens
**tried to create an alert with the body text by using: page.evaluate(() => alert(document)) // nothing happens
**tried to create an alert to test whether JavaScript can be injected by: page.evaluate(() => alert('works')) // nothing happens
**also tried this: How to select elements within an iframe element in Puppeteer // doesn't work
Here is the code I have built so far
const page = await browser.newPage();
const login1url =
    'https://np3.nextiva.com/NextOSPortal/ncp/landing/landing-platform';
await page.goto(login1url);
await page.waitFor(1000);
await page.type('[name=loginUserName]', 'itsaSecretLol');
await page.type('[name=loginPassword]', 'nopeHaha');
await page.click('[type=submit]');
await page.waitForNavigation();
const login3url = 'https://np3.nextiva.com/NextOSPortal/ncp/admin/dashboard';
await page.goto(login3url);
await page.click('[id=hdr_users]');
await page.goto('https://np3.nextiva.com/NextOSPortal/ncp/user/manageUsers');
await page.goto('https://np3.nextiva.com/NextOSPortal/ncp/user/garrettmrg');
await page.waitFor(2000);
await page.click('[id=loginAsUser]');
await page.waitFor(2000);
await page.click('[id=react-select-5--value]');
await page.waitFor(1000);
await page.click('[id=react-select-5--option-0]');
await page.waitFor(20000);

const elementHandle = await page.$('iframe[id=callcenter]');
const frame = await elementHandle.contentFrame();
const linkHandlers = await frame.$x("//a[contains(text(), 'Reporting')]");

if (linkHandlers.length > 0) {
    await linkHandlers[0].click();
} else {
    throw new Error('Link not found');
}
Since isc_LinkItem_1$20j is not a valid selector, maybe you can try finding elements starting with isc_LinkItem_1, like this:
await page.waitForSelector("[id^=isc_LinkItem_1]", {visible: true, timeout: 30000});
await page.click("[id^=isc_LinkItem_1]");
On your solution1:
await page.click('a[id=isc_LinkItem_1\\$20j]');
Or try to:
await page.click('#isc_LinkItem_1\\$20j');
I have the slight impression that you must specify what kind of element you're trying to select before the brackets, in this case an <a> element.
On the second solution, the # character means we're selecting an element by its id.
It turns out that the previous click triggered a new tab. Puppeteer doesn't switch to the new tab, so all the previous code was being executed on the old tab. To fix it, all we had to do was find the new tab, select it, and execute the code there. Here is the function we wrote to select the tab:
async function getTab(regex, browser, targets) {
    let pages = await browser.pages();
    if (targets) pages = await browser.targets();
    let newPage;
    for (let i = 0; i < pages.length; i++) {
        const url = await pages[i].url();
        console.log(url);
        if (url.search(regex) !== -1) {
            newPage = pages[i];
            console.log('***');
            console.log(url);
            console.log('***');
            break;
        }
    }
    console.log('finished');
    return newPage;
}
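Usage then looks roughly like this (a sketch; the /callcenter/ pattern is an example matching the URL of the tab you expect, and the selector reuses the [id^=isc_LinkItem_1] idea from the earlier answer). Newer Puppeteer versions also provide browser.waitForTarget(), which can replace the manual scan.

// ...after the click in the flow above that opens the new tab
await page.waitFor(2000); // give the new tab a moment to open
const newPage = await getTab(/callcenter/, browser);
if (!newPage) throw new Error('New tab not found');
await newPage.waitForSelector('[id^=isc_LinkItem_1]', { visible: true });
await newPage.click('[id^=isc_LinkItem_1]');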

Nodejs/Puppeteer - Navigation timeout

I need help understanding how timeouts work, especially with Node/Puppeteer.
I've read all the Stack Overflow questions and GitHub issues about this, but I can't figure out what is wrong. It's probably my code...
When I run this file, I receive the error from the image. You can see the ways I tried to fix it; nothing works.
Can someone explain why this happens and the best approach to avoid it? Is there a better way to get these projects?
// go to the seeds every x amount of time
var https = require('https');
var Q = require('q');
var fs = require('fs');
var puppeteer = require('puppeteer');
var Projeto = require('./Projeto.js');

const url = 'https://www.99freelas.com.br/projects?categoria=web-e-desenvolvimento'
/* const idToScrape;
   should receive the URL and the parameters specific to each seed */

async function genScraper() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    //page.setDefaultNavigationTimeout(60000);
    page.waitForNavigation( { timeout: 60000, waitUntil: 'domcontentloaded' });
    await page.goto(url);

    var projetos = await page.evaluate(() => {
        let qtProjs = document.querySelectorAll('.result-list li').length;
        let listaDeProjs = Array.from(document.querySelectorAll('.result-list li'));
        let tempProjetos = [];
        for( var i=0; i<=listaDeProjs.length; i++ ) {
            let titulo = listaDeProjs[i].children[1].children[0].textContent;
            let descricao = listaDeProjs[i].children[2].textContent;
            let habilidades = listaDeProjs[i].children[3].textContent;
            let publicado = listaDeProjs[i].children[1].children[1].children[0].textContent;
            let tempoRestante = listaDeProjs[i].children[1].children[1].children[1].textContent;
            //let infoCliente;
            proj = new Projeto(titulo, descricao, habilidades, publicado, tempoRestante);
            tempProjetos.push(proj);
        }
        return tempProjetos;
    });
    console.log(projetos);
    browser.close();
}

genScraper();
I recommend avoiding the waitForNavigation call before the goto call.
Basically, it would be better to use goto with the default timeout, which is 30000 ms. In my opinion, if the website takes more than 30 seconds to load or respond, something is probably wrong.
Instead, I would do something like this:
await page.goto(url, {
    waitUntil: 'networkidle0'
});
Depending on the version of Puppeteer you're using, you will see different behaviours. I am using version 1.4.0 and it is working well so far.
The documentation states that page.goto will throw an error if:
there's an SSL error (e.g. in case of self-signed certificates)
the target URL is invalid
the timeout is exceeded during navigation
the main resource failed to load
So, check that none of these scenarios is happening.
Also, you can curl the URL from your terminal to see whether it responds to outside calls; cross-origin problems are common too.
Honestly, there is no way to say exactly what is triggering your timeout, but that checklist should help. I had a timeout problem recently and it turned out to be my server configuration, so I also suggest checking whether the machine running this code has enough memory.
In your for loop,
for( var i=0; i<=listaDeProjs; i++ ) {
...
}
listaDeProjs should be listaDeProjs.length
Your evaluation script will fail in several places, if anywhere along this path is undefined: (E.g., if children[1] is undefined or children[0] is undefined.)
listaDeProjs[i].children[1].children[0].textContent;
You can do the following with lodash:
_.get(listaDeProjs[i],"children[1].children[0].textContent","")
That will default to "" if there is no such value.
Additionally, the following works perfectly fine with your code in 1.7 via https://try-puppeteer.appspot.com/
await page.goto(url, {
    waitUntil: 'networkidle2',
    timeout: '5000'
});
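Putting those two points together, a sketch of the evaluate block with the loop bound corrected and the nested children accesses guarded (plain guards rather than lodash, since the code inside page.evaluate runs in the browser, where lodash isn't loaded by default; note also that new Projeto(...) from the Node side isn't available inside evaluate, so this sketch pushes plain objects instead):

var projetos = await page.evaluate(() => {
    // helper: safe textContent access, defaults to ''
    const text = (el) => (el && el.textContent) || '';

    const listaDeProjs = Array.from(document.querySelectorAll('.result-list li'));
    const tempProjetos = [];
    for (var i = 0; i < listaDeProjs.length; i++) {  // < instead of <=
        const li = listaDeProjs[i];
        const info = li.children[1] || { children: [] };
        tempProjetos.push({
            titulo: text(info.children[0]),
            descricao: text(li.children[2]),
            habilidades: text(li.children[3]),
            publicado: text(info.children[1] && info.children[1].children[0]),
            tempoRestante: text(info.children[1] && info.children[1].children[1]),
        });
    }
    return tempProjetos;
});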
