selenium: bypass access denied - python-3.x

I'm trying to navigate a website with Selenium, but I'm getting an error: Access Denied. You do not have permission to access "http://tokopedia.com/" on this server.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
CHROMEDRIVER_PATH = r'C:/chromedriver.exe'
tokopedia = "https://tokopedia.com/"
options = Options()
options.add_argument("--headless")
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=options)
driver.get(tokopedia)
print(driver.page_source)
How do I solve it? Thank you for the help.

Try the code below. It is working for me:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
tokopedia = "https://tokopedia.com/"
options = Options()
options.add_argument("--headless")
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
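# Override the headless default User-Agent (which advertises "HeadlessChrome")
# with a regular desktop Chrome UA string so the site serves the page normally.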
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))
driver = webdriver.Chrome(options=options)
driver.get(tokopedia)
print(driver.page_source)
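The only functional difference from the code in the question is the user-agent argument: with it, the site sees a regular desktop Chrome instead of the headless default.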

Related

Why can't I get the page source with a headless browser using Selenium?

I can get the page source when Chrome runs with its head on (non-headless).
vim get_with_head.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver",options=chrome_options)
browser.maximize_window()
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
It works fine.
python3 get_with_head.py
Chrome opens the webpage and all of its content is shown. Now I add three lines to make it a headless browser:
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
The whole code:
vim get_without_head.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver",options=chrome_options)
browser.maximize_window()
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
It can't get the content on the webpage:
python3 get_without_head.py
<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
You don't have permission to access "http://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index" on this server.<p>
Reference #18.4660dc17.1631258672.2c70b7e3
</p></body></html>
Why can I get all the content with the browser's head on, but not in headless mode?
Why?
Headless mode uses its own default User-Agent if one is not given as an argument. However, some webpages block the headless User-Agent to avoid unwanted traffic, which can result in an Access Denied error when you try to open the page.
An example default User-Agent for headless mode:
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/60.0.3112.50 Safari/537.36
As you can see, it explicitly reveals that the browser is running in headless mode.
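If you want to confirm which User-Agent the driver is actually sending, you can ask the browser itself. A minimal check, assuming a session is already running as driver:
# Ask the browser for the User-Agent it reports to websites.
reported_ua = driver.execute_script("return navigator.userAgent")
print(reported_ua)  # contains "HeadlessChrome" in headless mode with no override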
Solution:
Change the User-Agent option.
windows_useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
linux_useragent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options.add_argument(f'user-agent={user_agent}')
browser = webdriver.Chrome(options=chrome_options)
browser.maximize_window()
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
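An alternative sketch, assuming a Chromium-based driver with CDP support (an assumption, not something the question requires): override the User-Agent of an already-running session through the DevTools protocol instead of a command-line switch.
# Network.setUserAgentOverride is a Chrome DevTools Protocol command;
# it changes the UA for subsequent requests in this session.
browser.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})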

Not able to set multiple Chrome options at the same time (blocking notifications and cookies) in Selenium and Python

The code currently only blocks notifications:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from time import sleep
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver=webdriver.Chrome(executable_path="C:\\Users\\Desktop\\chromedriver.exe",chrome_options=chrome_options)
driver.maximize_window()
driver.get("https://www.hurriyet.com.tr/")
sleep(5)
Hello friends, I am not able to set multiple Chrome options (blocking notifications and cookies) at the same time. How can I block notifications and cookies at the same time? Is there any solution? I think I could somehow use these together, but I couldn't get it to work:
"prefs", {"profile.default_content_settings.cookies": 2} "prefs", {"profile.default_content_setting_values.notifications" : 2 }
Why not something like this:
executable_path = r"C:\Users\Selenium+Python\chromedriver.exe"
options = webdriver.ChromeOptions()
# Both preferences must go into a single "prefs" dict: a second
# add_experimental_option("prefs", ...) call with the same key overwrites the first.
options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 2,
    "profile.default_content_settings.cookies": 2,
})
options.add_argument("start-maximized")
driver = webdriver.Chrome(executable_path, options=options)

Selenium not able to create login account on target.com

When I try to create an account on target.com using Selenium WebDriver, it shows me the error "Sorry, something went wrong. Please try again.", whereas if I create an account in the same browser in a different tab, the account gets created. How do I create an account using Selenium WebDriver?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome('C:/Users/priya/Desktop/project 14/chromedriver.exe',chrome_options=options)
driver.get('https://www.target.com/')
account = driver.find_element_by_id('account')
account.click()
time.sleep(1)
create = driver.find_element_by_id('accountNav-createAccount')
create.click()
time.sleep(4)
username = driver.find_element_by_id('username')
username.send_keys('abcdef@example.com')
fname = driver.find_element_by_id('firstname')
fname.send_keys('John')
lname = driver.find_element_by_id('lastname')
lname.send_keys('Kenny')
password = driver.find_element_by_id('password')
password.send_keys('Icecram12345')
submit = driver.find_element_by_id('createAccount')
submit.click()
driver.close()
Link to the error screenshot: https://i.stack.imgur.com/7ksIv.png
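One additional hardening step that is often suggested for sites with bot detection (a hedged sketch, not a confirmed fix for target.com): hide the navigator.webdriver flag before any page script runs, via the Chrome DevTools Protocol.
# Hedged suggestion: mask navigator.webdriver before page scripts run.
# Not guaranteed to defeat target.com's detection.
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
})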

Setting up Tor with Selenium WebDriver (Windows)

I have tried to set up Tor with Selenium, but it continuously throws exceptions. I have tried setting up the binary as well as profiles, but no luck.
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
import os
torexe = os.popen(r'C:\Users\Jawad Ahmad Khan\Desktop\Tor Browser\Browser\firefox.exe')
profile = FirefoxProfile(r'C:\Users\Jawad Ahmad Khan\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default')
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9050)
profile.set_preference("network.proxy.socks_remote_dns", False)
profile.update_preferences()
driver = webdriver.Firefox(firefox_profile=profile,
                           executable_path=r'D:\geckodriver\geckodriver.exe')
driver.get("http://check.torproject.org")
This is the error message:
selenium.common.exceptions.WebDriverException: Message: Reached error page: about:neterror?e=proxyConnectFailure&u=https%3A//check.torproject.org/&c=UTF-8&f=regular&d=Firefox%20is%20configured%20to%20use%20a%20proxy%20server%20that%20is%20refusing%20connections.
This works on my Mac with Chrome with Tor.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_chrome_webdriver():
    # Tor Browser exposes its SOCKS proxy on port 9150
    # (the standalone tor daemon uses 9050).
    tor_proxy = "127.0.0.1:9150"
    chrome_options = Options()
    chrome_options.add_argument("--test-type")
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('disable-infobars')
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--proxy-server=socks5://%s' % tor_proxy)
    driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)
    return driver

def get_chrome_browser(url):
    browser = get_chrome_webdriver()
    browser.get(url)
    return browser

get_chrome_browser('https://check.torproject.org/')
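For a quick sanity check that traffic really goes through Tor, you can look for the confirmation text on the check page (the exact wording is an assumption about that page, not part of the original answer):
browser = get_chrome_browser('https://check.torproject.org/')
# check.torproject.org congratulates you when the request is routed through Tor.
print("Congratulations" in browser.page_source)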

Python: Is it possible to download ENTIRE web page in PhantomJS

I have used PhantomJS for scraping purposes. I would like to know whether it is possible to download all the contents of a URL (including images, CSS, and JS) and save them locally for browsing.
# -*- coding: utf-8 -*-
from selenium import webdriver #for cookies collections after all AJAX/JS being executed
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36")
driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', '--web-security=false'])
driver.set_window_size(1366,768)
driver.get('http://stackoverflow.com')
print(driver.page_source)
This is the complete code using Python Selenium + PhantomJS; at the end you have the complete page source.
We can use the evaluate() function to get the content. I use this in Node.js:
var webPage = require('webpage');
var page = webPage.create();
page.open('http://google.com', function(status) {
  var title = page.evaluate(function() {
    return document.title;
  });
  console.log(title);
  phantom.exit();
});
If wget is installed, this task is rather easy:
domain = "www.google.de"
from subprocess import call
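# -m mirrors the site recursively; -k converts links so the local copy is browsable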
call(["wget", "-mk", domain])
