I'm trying to scrape data from a website, and trying to run multiple chrome browsers to simultaneously download these files to speed up the process. If I use a single window, this script runs fine. However, there are two issues I'm running into -
a) Many of the browser windows do not close.
b) While the program does run and downloads files for a while, it stops after some time. Error message - 'ERROR:shader_disk_cache.cc(238)] Failed to create shader cache entry -2'
My chromedriver is in 'D:\401\401k'
Script -
'''Downloading 5500 forms from ERISA'''
#Import Library
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time, os, timeit, shutil
import pandas as pd
from multiprocessing import Pool
#Clean up download folder
if os.path.exists('D:/Form5500_Downloads'):
shutil.rmtree('D:/Form5500_Downloads')
os.makedirs('D:/Form5500_Downloads')
else:
os.makedirs('D:/Form5500_Downloads')
'''Function to download a single form using ACK ID'''
def download_form(ackid):
#Setting Chrome preferences
chromeOptions = webdriver.ChromeOptions()
prefs = {"download.default_directory" : "D:/Form5500_Downloads"} #Download folder
chromeOptions.add_experimental_option("prefs",prefs)
path_to_chromedriver = 'D:/401/401k/chromedriver_2.35'
browser = webdriver.Chrome(executable_path = r'D:\401\401k\chromedriver_2.35.exe', chrome_options=chromeOptions)
browser.implicitly_wait(3) #Implicit wait untile the element appears
# Open ERISA website
url = 'https://www.efast.dol.gov/portal/app/disseminate?execution=e1s4#'
browser.get(url)
# Search for a form using ACK ID
browser.find_element_by_css_selector('#ackId').send_keys(ackid)
browser.find_element_by_css_selector('.ui-icon-search').click()
#Check if the form exists - if NOT, exit the function
try:
browser.find_element_by_css_selector('#form\:filingTreeTable\:0\:einLnk').click()
except:
browser.find_element_by_css_selector('#ackId').clear()# delete the ackid value
browser.quit()
#Wait until downloaded and rename using ackid
print(ackid)
while not os.path.exists("D:/Form5500_Downloads/filing.pdf"):
time.sleep(5)
os.rename("D:/Form5500_Downloads/filing.pdf","D:/Form5500_Downloads/"+ackid+".pdf")
browser.quit()
def main():
'''Download'''
# Get list of ackids from csv file
df = pd.read_csv('D:/401/401k/F_SCH_H_2015_latest.csv',usecols=[0], nrows=10000)
ackid_list = df['ACK_ID'].tolist()
if __name__ == '__main__':
with Pool(10) as p:
records = p.map(download_form, ackid_list)
main()
This issue was resolved using the following -
browser.stop_client()
browser.close()
instead of
browser.quit()
I still do not understand why this hack works though.
Related
I understand this question has been asked but I need some solution for this error:
Traceback (most recent call last):
File "goeventz_automation.py", line 405, in <module>
if login(driver) is not None:
File "goeventz_automation.py", line 149, in login
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/support/wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
This is the code where its getting error:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException
import urllib.request as request
import urllib.error as error
from PIL import Image
from selenium.webdriver.chrome.options import Options
import datetime as dt
import time
from common_file import *
from login_credentials import *
def login(driver):
global _email, _password
if waiter(driver, "//a[#track-element='header-login']") is not None:
#login = driver.find_element_by_xpath("//a[#track-element='header-login']")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
#login.click()
if waiter(driver,"//input[#id='user_email']") is not None:
email = driver.find_element_by_xpath("//input[#id='user_email']")
password = driver.find_element_by_xpath("//input[#id='password']")
email.send_keys(_email)
password.send_keys(_password)
driver.find_element_by_xpath("//button[#track-element='click-for-login']").click()
return driver
else:
print("There was an error in selecting the email input field. It may be the page has not loaded properly.")
return None
else:
print("There was an error in selecting the header-login attribute on the page.")
return None
if __name__ == '__main__':
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('/usr/bin/chromium/chromedriver',chrome_options=chrome_options)
#d.get('https://www.google.nl/')
#driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.goeventz.com/')
if login(driver) is not None:
print(create_event(driver))
I think there is some problem with Keys.ENTER, but I don't know how to solve this. I have tried every possible solution.............
This error message...
selenium.common.exceptions.ElementNotInteractableException: Message: element not interactable
...implies that the desired element was not interactable when you tried to invoke click() on it.
A couple of facts:
When you initialize the Chrome browser always in maximized mode.
You can disable-extensions.
You need to disable-infobars as well.
I have used the same xpath which you have constructed and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.add_argument("start-maximized");
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get("https://www.goeventz.com/")
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
Browser Snapshot:
copy full xpath instead of copying only xpath. It will work
Instead of using login.send_keys(Keys.ENTER) you should use selenium click() method which would work fine for you.
You can check first if the element is clickable first and then you can click on it.
Like:
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
Overview
It seems like you're having an XPATH problem finding the "Submit" button or your Submit button is not clickable, or your Submit button has some client side events attached to it (javascript/etc) that are required in order to effectively submit the page.
Calling the pw.submit() method in most cases should get rid of the need to wait for the submit button to become clickable and avoid any issues in locating the button in most cases. On many other websites, some of the necessary back-end processes are primed by client-side activities that are performed after the "submit" button is actually clicked (although on a side-note this is not considered best-practice because it makes the site less accessible, etc, I digress). Above all, it's important to watch your script execute and make sure that you're not getting any noticeable errors displayed on the webpage about the credentials that you're submitting.
Also, however, some websites require that you add a certain minimum amount of time between the entry of the username, password, and submitting the page in order for it to be considered a valid submitting process. I've even run in to websites that require you to use send_keys 1 at a time for usernames and passwords to avoid some anti-scraping technologies they employ. In these cases, I usually use the following between the calls:
from random import random, randint
def sleepyTime(first=5, second=10):
# returns the value of the time slept (as float)
# sleeps a random amount of time between the number variable in first
# and the number variable second (in seconds)
sleepy_time = round(random() * randint(first, second), 2)
sleepy_time = sleepy_time if sleepy_time > first else (first + random())
sleep(sleepy_time)
return sleepy_time
I don't see what use you have for making the _email and _password variables global, unless they are being changed somewhere in the login function and you want that change to be precipitated out to the other scopes.
How I would try to solve it
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, TimeoutException
TIME_TIMEOUT = 20 # Twenty-second timeout default
def eprint(*args, **kwargs):
""" Prints an error message to the user in the console (prints to sys.stderr), passes
all provided args and kwargs along to the function as usual. Be aware that the 'file' argument
to print can be overridden if supplied again in kwargs.
"""
print(*args, file=sys.stderr, **kwargs)
def login(driver):
global _email, _password
try:
email = WebDriverWait(driver, TIME_TIMEOUT).until(EC.presence_of_element_located((By.XPATH, "//input[#id='user_email']")))
pw = WebDriverWait(driver, TIME_TIMEOUT).until(EC.presence_of_element_located((By.XPATH, "//input[#id='password']"))
pw.submit()
# if this doesn't work try the following:
# btn_submit = WebDriverWait(driver, TIME_TIMEOUT).until(EC.element_to_be_clickable((By.XPATH, "//button[#track-element='click-for-login']"))
# btn_submit.click()
# if that doesn't work, try to add some random wait times using the
# sleepyTime() example from above to add some artificial waiting to your email entry, your password entry, and the attempt to submit the form.
except NoSuchElementException as ex:
eprint(ex.msg())
except TimeoutException as toex:
eprint(toex.msg)
if __name__ == '__main__':
driver = webdriver.Chrome('/usr/bin/chromium/chromedriver',chrome_options=chrome_options)
#d.get('https://www.google.nl/')
#driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.goeventz.com/')
if login(driver) is not None:
print(create_event(driver))
For headless chrome browser you need to provide window size as well in chrome options.For headless browser selenium unable to know what your window size.Try that and let me know.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('window-size=1920x1480')
I faced this error as well. Now check your browser if the element is inside the iframe. If so, use driver.find_element(By.CSS_SELECTOR, "#payment > div > div > iframe") and driver.switch_to.frame(iframe) Then you will be able to work out.
I understand this question has been asked but I need some solution for this error:
Traceback (most recent call last):
File "goeventz_automation.py", line 405, in <module>
if login(driver) is not None:
File "goeventz_automation.py", line 149, in login
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
File "/usr/local/lib/python3.6/dist-packages/selenium/webdriver/support/wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
This is the code where its getting error:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException
import urllib.request as request
import urllib.error as error
from PIL import Image
from selenium.webdriver.chrome.options import Options
import datetime as dt
import time
from common_file import *
from login_credentials import *
def login(driver):
global _email, _password
if waiter(driver, "//a[#track-element='header-login']") is not None:
#login = driver.find_element_by_xpath("//a[#track-element='header-login']")
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
#login.click()
if waiter(driver,"//input[#id='user_email']") is not None:
email = driver.find_element_by_xpath("//input[#id='user_email']")
password = driver.find_element_by_xpath("//input[#id='password']")
email.send_keys(_email)
password.send_keys(_password)
driver.find_element_by_xpath("//button[#track-element='click-for-login']").click()
return driver
else:
print("There was an error in selecting the email input field. It may be the page has not loaded properly.")
return None
else:
print("There was an error in selecting the header-login attribute on the page.")
return None
if __name__ == '__main__':
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('/usr/bin/chromium/chromedriver',chrome_options=chrome_options)
#d.get('https://www.google.nl/')
#driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.goeventz.com/')
if login(driver) is not None:
print(create_event(driver))
I think there is some problem with Keys.ENTER, but I don't know how to solve this. I have tried every possible solution.............
This error message...
selenium.common.exceptions.ElementNotInteractableException: Message: element not interactable
...implies that the desired element was not interactable when you tried to invoke click() on it.
A couple of facts:
When you initialize the Chrome browser always in maximized mode.
You can disable-extensions.
You need to disable-infobars as well.
I have used the same xpath which you have constructed and you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
options = webdriver.ChromeOptions()
options.add_argument("start-maximized");
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get("https://www.goeventz.com/")
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
Browser Snapshot:
copy full xpath instead of copying only xpath. It will work
Instead of using login.send_keys(Keys.ENTER) you should use selenium click() method which would work fine for you.
You can check first if the element is clickable first and then you can click on it.
Like:
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//a[#track-element='header-login']"))).click()
Overview
It seems like you're having an XPATH problem finding the "Submit" button or your Submit button is not clickable, or your Submit button has some client side events attached to it (javascript/etc) that are required in order to effectively submit the page.
Calling the pw.submit() method in most cases should get rid of the need to wait for the submit button to become clickable and avoid any issues in locating the button in most cases. On many other websites, some of the necessary back-end processes are primed by client-side activities that are performed after the "submit" button is actually clicked (although on a side-note this is not considered best-practice because it makes the site less accessible, etc, I digress). Above all, it's important to watch your script execute and make sure that you're not getting any noticeable errors displayed on the webpage about the credentials that you're submitting.
Also, however, some websites require that you add a certain minimum amount of time between the entry of the username, password, and submitting the page in order for it to be considered a valid submitting process. I've even run in to websites that require you to use send_keys 1 at a time for usernames and passwords to avoid some anti-scraping technologies they employ. In these cases, I usually use the following between the calls:
from random import random, randint
def sleepyTime(first=5, second=10):
# returns the value of the time slept (as float)
# sleeps a random amount of time between the number variable in first
# and the number variable second (in seconds)
sleepy_time = round(random() * randint(first, second), 2)
sleepy_time = sleepy_time if sleepy_time > first else (first + random())
sleep(sleepy_time)
return sleepy_time
I don't see what use you have for making the _email and _password variables global, unless they are being changed somewhere in the login function and you want that change to be precipitated out to the other scopes.
How I would try to solve it
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException, TimeoutException
TIME_TIMEOUT = 20 # Twenty-second timeout default
def eprint(*args, **kwargs):
""" Prints an error message to the user in the console (prints to sys.stderr), passes
all provided args and kwargs along to the function as usual. Be aware that the 'file' argument
to print can be overridden if supplied again in kwargs.
"""
print(*args, file=sys.stderr, **kwargs)
def login(driver):
global _email, _password
try:
email = WebDriverWait(driver, TIME_TIMEOUT).until(EC.presence_of_element_located((By.XPATH, "//input[#id='user_email']")))
pw = WebDriverWait(driver, TIME_TIMEOUT).until(EC.presence_of_element_located((By.XPATH, "//input[#id='password']"))
pw.submit()
# if this doesn't work try the following:
# btn_submit = WebDriverWait(driver, TIME_TIMEOUT).until(EC.element_to_be_clickable((By.XPATH, "//button[#track-element='click-for-login']"))
# btn_submit.click()
# if that doesn't work, try to add some random wait times using the
# sleepyTime() example from above to add some artificial waiting to your email entry, your password entry, and the attempt to submit the form.
except NoSuchElementException as ex:
eprint(ex.msg())
except TimeoutException as toex:
eprint(toex.msg)
if __name__ == '__main__':
driver = webdriver.Chrome('/usr/bin/chromium/chromedriver',chrome_options=chrome_options)
#d.get('https://www.google.nl/')
#driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.goeventz.com/')
if login(driver) is not None:
print(create_event(driver))
For headless chrome browser you need to provide window size as well in chrome options.For headless browser selenium unable to know what your window size.Try that and let me know.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('window-size=1920x1480')
I faced this error as well. Now check your browser if the element is inside the iframe. If so, use driver.find_element(By.CSS_SELECTOR, "#payment > div > div > iframe") and driver.switch_to.frame(iframe) Then you will be able to work out.
I am able to successfully open a URL and save the resultant page as a .html file. However, I am unable to determine how to download and save a .mhtml (Web Page, Single File).
My code is:
import urllib.parse, time
from urllib.parse import urlparse
import urllib.request
url = ('https://www.example.com')
encoded_url = urllib.parse.quote(url, safe='')
print(encoded_url)
base_url = ("https://translate.google.co.uk/translate?sl=auto&tl=en&u=")
translation_url = base_url+encoded_url
print(translation_url)
req = urllib.request.Request(translation_url, headers={'User-Agent': 'Mozilla/6.0'})
print(req)
response = urllib.request.urlopen(req)
time.sleep(15)
print(response)
webContent = response.read()
print(webContent)
f = open('GoogleTranslated.html', 'wb')
f.write(webContent)
print(f)
f.close
I have tried to use wget using the details captured in this question:
How to download a webpage (mhtml format) using wget in python but the details are incomplete (or I am simply unabl eto understand).
Any suggestions would be helpful at this stage.
Did you try using Selenium with a Chrome Webdriver to save page?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_element_located
from selenium.webdriver.support.ui import WebDriverWait
import pyautogui
URL = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
FILE_NAME = ''
# open page with selenium
# (first need to download Chrome webdriver, or a firefox webdriver, etc)
driver = webdriver.Chrome()
driver.get(URL)
# wait until body is loaded
WebDriverWait(driver, 60).until(visibility_of_element_located((By.TAG_NAME, 'body')))
time.sleep(1)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if FILE_NAME != '':
pyautogui.typewrite(FILE_NAME)
pyautogui.hotkey('enter')
I have a better solution, which will not involve any possible manual operation and specify the path to hold the mhtml file. I learn this from a chinese blog . The key idea is to use chrome-dev-tools command.
The code is shown below as an example.
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.qq.com/')
# Execute Chrome dev tool command to obtain the mhtml file
res = driver.execute_cdp_cmd('Page.captureSnapshot', {})
# 2. write file locally
with open('./store/qq.mhtml', 'w', newline='') as f:
f.write(res['data'])
driver.quit()
Hope this will help!
more things about chrome dev protocols
save as mhtml, need to add argument '--save-page-as-mhtml'
options = webdriver.ChromeOptions()
options.add_argument('--save-page-as-mhtml')
driver = webdriver.Chrome(options=options)
I wrote it just the way it was. Sorry if it's wrong.
I created a class, so you can use it. The example is in the three lines below.
Also, you can change the number of seconds to sleep as you like.
Incidentally, non-English keyboards such as Japanese and Hangul keyboards are also supported.
import chromedriver_binary
from selenium import webdriver
import pyautogui
import pyperclip
import uuid
class DonwloadMhtml(webdriver.Chrome):
def __init__(self):
super().__init__()
self._first_save = True
time.sleep(2)
def save_page(self, url, filename=None):
self.get(url)
time.sleep(3)
# open 'Save as...' to save html and assets
pyautogui.hotkey('ctrl', 's')
time.sleep(1)
if filename is None:
pyperclip.copy(str(uuid.uuid4()))
else:
pyperclip.copy(filename)
time.sleep(1)
pyautogui.hotkey('ctrl', 'v')
time.sleep(2)
if self._first_save:
pyautogui.hotkey('tab')
time.sleep(1)
pyautogui.press('down')
time.sleep(1)
pyautogui.press('up')
time.sleep(1)
pyautogui.hotkey('enter')
time.sleep(1)
self._first_save = False
pyautogui.hotkey('enter')
time.sleep(1)
# example
dm = DonwloadMhtml()
dm.save_page('https://en.wikipedia.org/wiki/Python_(programming_language)', 'wikipedia_python') # create file named "wikipedia_python.mhtml"
dm.save_page('https://www.python.org/') # file named randomly based on uuid4
python3.8.10
selenium==4.4.3
I'm trying to gather some information from certain webpages using selenium and python.I have a working code for a single tab. But now i have a situation where i need to open 50 tabs in chrome at once and process each page data.
1) So open 50 tabs at once - The code i got already
2) Change the control between tabs and process the information from the page and close the tab and move to next tab and do the same.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import psycopg2
import os
import datetime
final_results=[]
positions=[]
saerched_url=[]
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
#options.add_argument('--headless')
options.add_argument("—-incognito")
browser = webdriver.Chrome(executable_path='/users/user_123/downloads/chrome_driver/chromedriver', chrome_options=options)
browser.implicitly_wait(20)
#def db_connect():
try:
DSN = "dbname='postgres' user='postgres' host='localhost' password='postgres' port='5432'"
TABLE_NAME = 'staging.search_url'
conn = psycopg2.connect(DSN)
print("Database connected...")
cur = conn.cursor()
cur.execute("SET datestyle='German'")
except (Exception, psycopg2.Error) as error:
print('database connection failed')
quit()
def get_products(url):
browser.get(url)
names = browser.find_elements_by_xpath("//span[#class='pymv4e']")
upd_product_name_list=list(filter(None, names))
product_name = [x.text for x in upd_product_name_list]
product = [x for x in product_name if len(x.strip()) > 2]
upd_product_name_list.clear()
product_name.clear()
return product
links = ['https://www.google.com/search?q=Vitamin+D',
'https://www.google.com/search?q=Vitamin+D3',
'https://www.google.com/search?q=Vitamin+D+K2',
'https://www.google.com/search?q=D3',
'https://www.google.com/search?q=Vitamin+D+1000']
for link in links:
# optional: we can wait for the new tab to open by comparing window handles count before & after
tabs_count_before = len(browser.window_handles)
# open a link
control_string = "window.open('{0}')".format(link)
browser.execute_script(control_string)
# optional: wait for windows count to increment to ensure new tab is opened
WebDriverWait(browser, 1).until(lambda browser: tabs_count_before != len(browser.window_handles))
# get list of currently opened tabs
tabs_list = browser.window_handles
print(tabs_list)
# switch control to newly opened tab (the last one in the list)
last_tab_opened = tabs_list[len(tabs_list)-1]
browser.switch_to_window(last_tab_opened)
# now you can process data on the newly opened tab
print(browser.title)
for lists in tabs_list:
last_tab_opened = tabs_list[len(tabs_list)-1]
browser.switch_to_window(last_tab_opened)
filtered=[]
filtered.clear()
filtered = get_products(link)
saerched_url.clear()
if not filtered:
new_url=link+'+kaufen'
get_products(link)
print('Modified URL :'+link)
if filtered:
print(filtered)
positions.clear()
for x in range(1, len(filtered)+1):
positions.append(str(x))
saerched_url.append(link)
gobal_position=0
gobal_position=len(positions)
print('global postion first: '+str(gobal_position))
print("\n")
company_name_list = browser.find_elements_by_xpath("//div[#class='LbUacb']")
company = []
company.clear()
company = [x.text for x in company_name_list]
print('Company Name:')
print(company, '\n')
price_list = browser.find_elements_by_xpath("//div[#class='e10twf T4OwTb']")
price = []
price.clear()
price = [x.text for x in price_list]
print('Price:')
print(price)
print("\n")
urls=[]
urls.clear()
find_href = browser.find_elements_by_xpath("//a[#class='plantl pla-unit-single-clickable-target clickable-card']")
for my_href in find_href:
url_list=my_href.get_attribute("href")
urls.append(url_list)
print('Final Result: ')
result = zip(positions,filtered, urls, company,price,saerched_url)
final_results.clear()
final_results.append(tuple(result))
print(final_results)
print("\n")
print('global postion end :'+str(gobal_position))
i=0
try:
for d in final_results:
while i <= gobal_position:
print( d[i])
cur.execute("""INSERT into staging.pla_crawler_results(position, product_name, url,company,price,searched_url) VALUES (%s, %s, %s,%s, %s,%s)""", d[i])
print('Inserted succesfully')
conn.commit()
i=i+1
except (Exception, psycopg2.Error) as error:
print (error)
pass
browser.close()
Ideally you shouldn't attempt to open 50 tabs at once as:
Handling 50 concurrent TABs through Selenium will invite complicated logic/algorithm to maintain.
Additionally, you may run into CPU and memory usage issues as:
Chrome maintains many processes.
Where as at times Firefox uses too much RAM
Solution
If you are having a List of the urls as follows:
['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
You can iterate over the list to open them one by one in the adjacent tab for scraping using the following Locator Strategy:
Code Block:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.keys import Keys
links = ['https://selenium.dev/downloads/', 'https://selenium.dev/documentation/en/']
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
for link in links:
driver = webdriver.Chrome(options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get(link)
print(driver.title)
print("Perform webscraping here")
driver.quit()
print("End of program")
Console Output:
Downloads
Perform webscraping here
The Selenium Browser Automation Project :: Documentation for Selenium
Perform webscraping here
End of program
Reference
You can find a relevant detailed discussion in:
WebScraping JavaScript-Rendered Content using Selenium in Python
My python selenium tests are working on firefox dirver (GUI) without any issue. But i wanted to run my tests in headless mode. When i try to run the same script with headless mode (with few modifications). it gives wierd errors.
Ex:
selenium.common.exceptions.NoSuchElementException: Message{"errorMessage":"Unable to find element with id 'ext-gen1499
python script :
import os
import time
from selenium.webdriver.common.proxy import *
from selenium.webdriver.common.by import By
phantomjs_path=r"/home/xxxx/nodejs-installs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs"
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
service_args = ['--proxy=x.x.x.x:80','--proxy-type=https']
driver = webdriver.PhantomJS(executable_path=r'/home/xxxx/nodejs-installs/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',service_args=service_args)
os.environ['MOZ_HEADLESS'] = '1'
driver.get("https://aaaaa.com")
def Login():
try:
driver.find_element_by_id("username").send_keys("test#aaaa.com")
driver.find_element_by_id("password").send_keys("xxxxxxx")
driver.find_element_by_id("Submit").click()
login_flag=1
except:
print("Error Loading Login page")
login_flag=0
finally:
return login_flag
def CreateMail():
try:
element = WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.ID, "button-1143-btnInnerEl")))
driver.find_element_by_id("button-1143-btnInnerEl").click()
except TimeoutException:
print ("Loading took too much time - Create New Mail")
driver.find_element_by_id("ext-gen1499").send_keys("test#test.com")
driver.find_element_by_id("textfield-1265-inputEl").send_keys("Automated Test Mail from Selenium")
driver.find_element_by_id("button-1252-btnIconEl").click()
Am i missing anything ?
It is a good practice to add an implicit wait of at-least 10 seconds , for allowing the target page element/s to load completely.
driver.implicitly_wait(10)