How to detect if selenium initializer driver is headless - python-3.x

Let's say I have this code
# Build Chrome options that request headless mode and a 1920x1080 window.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument("window-size=1920,1080")
# Start Chrome through the local chromedriver.exe using those options.
browser=webdriver.Chrome(options=options,executable_path=r"chromedriver.exe")
# Replace the browser's user-agent string via the Network.setUserAgentOverride CDP command.
browser.execute_cdp_cmd('Network.setUserAgentOverride',
{"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
How can I check if the initialized browser is headless or not, programmatically? I mean, if I type
browser.get_window_size() I get {'width': 1920, 'height': 1080}, if I write browser.execute_script('return navigator.languages') it returns ['en-US', 'en']
What I'm looking for is something like browser.is_headless() where I can get if a given browser is headless or not.

options = webdriver.ChromeOptions()
options.headless
This returns True if the --headless argument has been set on ChromeOptions(); otherwise it returns False.

Based on the official Selenium documentation
options.headless
should return whether headless is set or not

If you're using Firefox (tested on Firefox 106):
# Look up the "moz:headless" entry in the session capabilities; a missing key
# is treated as "not headless" via the False default.
if driver.caps.get("moz:headless", False):
    print("Firefox is headless")

Related

Aiohttp+Asyncio Seems To Be Inconsistent in Tripadvisor Travel Site

I was trying to asynchronously request page data from the Tripadvisor travel site using aiohttp+asyncio, but on multiple occasions the get() method gets stuck for almost a minute and then results in a TimeoutError.
I created a similar script using the requests library and confirmed that there are times that the code with requests library works while the code with aiohttp+asyncio does not.
Here are the codes:
Using aiohttp + asyncio
from aiohttp import ClientSession
import asyncio
home_url = 'https://www.tripadvisor.com'
# Browser-like User-Agent sent with every request made through the session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/93.0.4577.63 Safari/537.36'
}
async def main():
    """Request the Philippines attractions page and print its status and body."""
    async with ClientSession(headers=headers) as session:
        tourist_sites_url = home_url + '/Attractions-g294245-Activities-a_allAttractions.true-Philippines.html'
        async with session.get(tourist_sites_url) as response:
            print(f'{response.status=}\n')
            print(await response.text())
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
Using requests
from requests import Session
home_url = 'https://www.tripadvisor.com'
# Browser-like User-Agent passed explicitly on the GET request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/93.0.4577.63 Safari/537.36'
}
def main():
    """Request the Philippines attractions page and print its status and body."""
    with Session() as session:
        tourist_sites_url = home_url + '/Attractions-g294245-Activities-a_allAttractions.true-Philippines.html'
        response = session.get(tourist_sites_url, headers=headers)
        print(f'{response.status_code=}\n')
        print(response.text)
if __name__ == '__main__':
    main()
What shall I do in order for the code with aiohttp+asyncio to work on tripadvisor website?
Thank you very much!

Why can't get the page source with headless browser using selenium?

I can get the page source when Chrome runs with its head on (i.e. not in headless mode).
vim get_with_head.py
# Headed run: start Chrome with default options, load the page and print its source.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
# No arguments are added, so no --headless flag is passed to Chrome.
chrome_options = Options()
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver",options=chrome_options)
browser.maximize_window()
# Explicit wait object with a 40-second timeout.
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
# Continue once document.readyState is anything other than "loading".
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
It works fine.
python3 get_with_head.py
Chrome opens the webpage and all of its content is shown. Now I add three lines to make it a headless browser:
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
The whole codes:
vim get_without_head.py
# Headless run: same steps as the headed script, plus three extra Chrome arguments.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
# The three added arguments; --headless is the one that removes the browser window.
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
browser = webdriver.Chrome(executable_path="/usr/bin/chromedriver",options=chrome_options)
browser.maximize_window()
# Explicit wait object with a 40-second timeout.
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
# Continue once document.readyState is anything other than "loading".
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)
It can't get the content on the webpage:
python3 get_without_head.py
<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
You don't have permission to access "http://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index" on this server.<p>
Reference #18.4660dc17.1631258672.2c70b7e3
</p></body></html>
Why can I get all the content when the browser runs with its head on, but not when it runs in headless mode?
Why?
Headless mode uses its own default User-Agent if it is not given as an argument. However some webpages may block Headless mode User-Agent to avoid unwanted traffic. It may result in Access denied error while trying to open a webpage.
An exemplary default User-Agent for headless mode:
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/60.0.3112.50 Safari/537.36
As you see, it explicitly shows that browser is running on Headless mode.
Solution:
Change the User-Agent option.
# Example desktop Chrome user-agent strings for Windows and Linux (not referenced by the script below).
windows_useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
linux_useragent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
# User agent that says "Chrome" rather than "HeadlessChrome"; passed to Chrome as a user-agent argument.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options.add_argument(f'user-agent={user_agent}')
browser = webdriver.Chrome(options=chrome_options)
browser.maximize_window()
# Explicit wait object with a 40-second timeout.
wait = WebDriverWait(browser, 40)
url="https://www.nasdaq.com/market-activity/quotes/nasdaq-ndx-index"
browser.get(url)
# Continue once document.readyState is anything other than "loading".
wait.until(lambda e: e.execute_script('return document.readyState') != "loading")
print(browser.page_source)

BeautifulSoup Python web scraping Missing html Main Body

I am using Beautiful Soup to scrape this web page: https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03
Result: i get the javaScript, Css
Desired output: i need the main html
i used this code
import requests
from bs4 import BeautifulSoup
# NOTE(review): everything after '#' in this URL is a fragment; HTTP clients
# normally do not send fragments to the server -- confirm what is actually requested.
url = 'https://greyhoundbet.racingpost.com//#results-dog/race_id=1765914&dog_id=527442&r_date=2020-03-19&track_id=61&r_time=11:03'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
# Fetch the page once with the browser-like User-Agent and parse the raw response bytes.
page = requests.get(url,headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
I’m afraid you won’t be able to get it directly using BeautifulSoup because the page loads then a javascript loads data.
It’s one of the component’s limitations, you may need to use selenium.
please check the answers on this question
I think what you looking for is this:
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
It will contain the text from the page including html tags

web.Whatsapp headlessly using phantomjs

I am using PhantomJS to start a web session on web.whatsapp.com, with Chrome's user agent because WhatsApp does not support PhantomJS's own user agent.
Code as Follows :
// Create a PhantomJS page object.
var page = require('webpage').create();
// Present a desktop Chrome user agent instead of the PhantomJS default.
page.settings.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36';
// Viewport used when rendering the screenshot.
page.viewportSize = {
width: 1200,
height: 800
};
// Open WhatsApp Web; the callback renders home.png straight away and exits,
// with no additional wait after the open callback fires.
page.open('https://web.whatsapp.com/', function() {
page.render('home.png');
phantom.exit();
});
But the output is blank white screen with dot on center
script output screenshot
Is there a bug in my code, or is this a compatibility issue?
PhantomJS is not waiting for the page to load completely; you can see the loading icon in the screenshot.
Try this code, which waits explicitly for the page to load.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Present a desktop Chrome user agent instead of the PhantomJS default.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
)
# Copy the default PhantomJS capabilities and override only the user agent.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = user_agent
driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path=r'/bin/phantomjs')
driver.get('http://web.whatsapp.com')
timeout = 30
# Condition: an element with class "qrcode" is present in the DOM.
element_present = EC.presence_of_element_located((By.CLASS_NAME, 'qrcode'))
try:
    # Wait up to `timeout` seconds for the condition instead of proceeding immediately.
    WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
    print("Timed out waiting for page to load")
Note: WhatsApp needs a browser that supports cryptoSha256 and cryptoAesCbc for proper cryptography management; PhantomJS supports neither cryptoSha256 nor cryptoAesCbc.

Node horseman not working on AngularJS select options

I am trying to change AngularJS based select options using horseman.
Unfortunately, it is not working out for me.
The website is: https://www.cars.com/
I can't seem to change the make, model, price drop downs.
horseman
.userAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36')
.open('https://www.cars.com/').catch(function(error){console.log(error)})
.select('div.sw-input-group-make > select', car.make)
.wait(5000)
.select('.sw-input-group-model > select', car.model)
.select('.sw-input-group-price > select', car.price)
.type('.zip-field',car.zipcode)
.screenshot("C:/Users/Himanshu/Desktop/upwork/car/big1.png").log()
.click('.sw-input-group-submit >input').catch(function(error){console.log(error)})
.waitForNextPage().catch(function(error){console.log(error)})
.screenshot("C:/Users/Himanshu/Desktop/upwork/car/big.png").log()
.close();
});

Resources