selenium: bypass access denied - python-3.x

I'm trying to navigate a website with Selenium, but I'm getting an error: Access Denied. You do not have permission to access "http://tokopedia.com/" on this server.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
CHROMEDRIVER_PATH = r'C:/chromedriver.exe'
tokopedia = "https://tokopedia.com/"
options = Options()
options.add_argument("--headless")
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=options)
driver.get(tokopedia)
print(driver.page_source)
How do I solve it? Thank you for the help.

Try the code below. It is working for me:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
tokopedia = "https://tokopedia.com/"
options = Options()
options.add_argument("--headless")
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
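# Override the headless default User-Agent (which advertises "HeadlessChrome")
# with a regular desktop Chrome UA string so the site serves the page normally.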
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))
driver = webdriver.Chrome(options=options)
driver.get(tokopedia)
print(driver.page_source)
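The only functional difference from the code in the question is the user-agent argument: with it, the site sees a regular desktop Chrome instead of the headless default.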

Related

Why can't I get the page source with a headless browser using Selenium?

I can get the page source when Chrome runs with its head on (non-headless).
vim get_with_head.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver",options=chrome_options)
browser.maximize_window()
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
It works fine.
python3 get_with_head.py
Chrome opens the webpage and all of its content is shown. Now I add three lines to make it a headless browser:
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
The whole code:
vim get_without_head.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver",options=chrome_options)
browser.maximize_window()
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
It can't get the content on the webpage:
python3 get_without_head.py
<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
You don't have permission to access "http://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index" on this server.<p>
Reference #18.4660dc17.1631258672.2c70b7e3
</p></body></html>
Why can I get all the content with the browser's head on, but not in headless mode?
Why?
Headless mode uses its own default User-Agent if one is not given as an argument. However, some webpages block the headless User-Agent to avoid unwanted traffic, which can result in an Access Denied error when you try to open the page.
An example default User-Agent for headless mode:
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/60.0.3112.50 Safari/537.36
As you can see, it explicitly reveals that the browser is running in headless mode.
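If you want to confirm which User-Agent the driver is actually sending, you can ask the browser itself. A minimal check, assuming a session is already running as driver:
# Ask the browser for the User-Agent it reports to websites.
reported_ua = driver.execute_script("return navigator.userAgent")
print(reported_ua)  # contains "HeadlessChrome" in headless mode with no override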
Solution:
Change the User-Agent option.
windows_useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
linux_useragent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options.add_argument(f'user-agent={user_agent}')
browser = webdriver.Chrome(options=chrome_options)
browser.maximize_window()
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
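An alternative sketch, assuming a Chromium-based driver with CDP support (an assumption, not something the question requires): override the User-Agent of an already-running session through the DevTools protocol instead of a command-line switch.
# Network.setUserAgentOverride is a Chrome DevTools Protocol command;
# it changes the UA for subsequent requests in this session.
browser.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})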

Not able to set multiple Chrome options at the same time (blocking notifications and cookies) in Selenium and Python

The code currently only blocks notifications:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from time import sleep
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver=webdriver.Chrome(executable_path="C:\\Users\\Desktop\\chromedriver.exe",chrome_options=chrome_options)
driver.maximize_window()
driver.get("https://www.hurriyet.com.tr/")
sleep(5)
Hello friends, I am not able to set multiple Chrome options (blocking notifications and cookies) at the same time. How can I block notifications and cookies at the same time? Is there any solution? I think I could somehow use these together, but I couldn't get it to work:
"prefs", {"profile.default_content_settings.cookies": 2} "prefs", {"profile.default_content_setting_values.notifications" : 2 }
Why not something like this:
executable_path = r"C:\Users\Selenium+Python\chromedriver.exe"
options = webdriver.ChromeOptions()
# Both preferences must go into a single "prefs" dict: a second
# add_experimental_option("prefs", ...) call with the same key overwrites the first.
options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2,
    "profile.default_content_settings.cookies": 2,
})
options.add_argument("start-maximized")
driver = webdriver.Chrome(executable_path, options=options)

Selenium not able to create login account on target.com

When I try to create an account on target.com using Selenium WebDriver, it shows me the error "Sorry, something went wrong. Please try again.", whereas if I create an account in the same browser in a different tab, the account gets created. How do I create an account using Selenium WebDriver?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome('C:/Users/priya/Desktop/project 14/chromedriver.exe',chrome_options=options)
driver.get('https://www.target.com/')
account = driver.find_element_by_id('account')
account.click()
time.sleep(1)
create = driver.find_element_by_id('accountNav-createAccount')
create.click()
time.sleep(4)
username = driver.find_element_by_id('username')
username.send_keys('abcdef@example.com')
fname = driver.find_element_by_id('firstname')
fname.send_keys('John')
lname = driver.find_element_by_id('lastname')
lname.send_keys('Kenny')
password = driver.find_element_by_id('password')
password.send_keys('Icecram12345')
submit = driver.find_element_by_id('createAccount')
submit.click()
driver.close()
Link to the error screenshot: https://i.stack.imgur.com/7ksIv.png
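One additional hardening step that is often suggested for sites with bot detection (a hedged sketch, not a confirmed fix for target.com): hide the navigator.webdriver flag before any page script runs, via the Chrome DevTools Protocol.
# Hedged suggestion: mask navigator.webdriver before page scripts run.
# Not guaranteed to defeat target.com's detection.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})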

Setting up Tor with Selenium WebDriver (Windows)

I have tried to set up Tor with Selenium, but it continuously throws exceptions. I have tried setting up the binary as well as profiles, but no luck.
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
import os
torexe = os.popen(r'C:\Users\Jawad Ahmad Khan\Desktop\Tor Browser\Browser\firefox.exe')
profile = FirefoxProfile(r'C:\Users\Jawad Ahmad Khan\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default')
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9050)
profile.set_preference("network.proxy.socks_remote_dns", False)
profile.update_preferences()
driver = webdriver.Firefox(firefox_profile=profile,
                           executable_path=r'D:\geckodriver\geckodriver.exe')
driver.get("http://check.torproject.org")
This is the error message:
selenium.common.exceptions.WebDriverException: Message: Reached error page: about:neterror?e=proxyConnectFailure&u=https%3A//check.torproject.org/&c=UTF-8&f=regular&d=Firefox%20is%20configured%20to%20use%20a%20proxy%20server%20that%20is%20refusing%20connections.
This works on my Mac with Chrome with Tor.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_chrome_webdriver():
    # Tor Browser exposes its SOCKS proxy on port 9150
    # (the standalone tor daemon uses 9050).
    tor_proxy = "127.0.0.1:9150"
    chrome_options = Options()
    chrome_options.add_argument("--test-type")
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('disable-infobars')
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--proxy-server=socks5://%s' % tor_proxy)
    driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)
    return driver

def get_chrome_browser(url):
    browser = get_chrome_webdriver()
    browser.get(url)
    return browser

get_chrome_browser('https://check.torproject.org/')
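For a quick sanity check that traffic really goes through Tor, you can look for the confirmation text on the check page (the exact wording is an assumption about that page, not part of the original answer):
browser = get_chrome_browser('https://check.torproject.org/')
# check.torproject.org congratulates you when the request is routed through Tor.
print("Congratulations" in browser.page_source)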

Python: Is it possible to download ENTIRE web page in PhantomJS

I have used PhantomJS for scraping purposes. I would like to know whether it is possible to download all the contents of a URL (including images, CSS, and JS) and save them locally for browsing.
# -*- coding: utf-8 -*-
from selenium import webdriver #for cookies collections after all AJAX/JS being executed
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36")
driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', '--web-security=false'])
driver.set_window_size(1366,768)
driver.get('http://stackoverflow.com')
print(driver.page_source)
This is the complete code using Python Selenium + PhantomJS; at the end you have the complete page source.
We can use the evaluate() function to get the content. I use this in Node.js:
var webPage = require('webpage');
var page = webPage.create();
page.open('http://google.com', function(status) {
  var title = page.evaluate(function() {
    return document.title;
  });
  console.log(title);
  phantom.exit();
});
If wget is installed, this task is rather easy:
domain = "www.google.de"
from subprocess import call
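# -m mirrors the site recursively; -k converts links so the local copy is browsable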
call(["wget", "-mk", domain])
