Multithreading optimization in python and selenium - multithreading

I've written a script for automating the process of logging in to a website and taking screenshots of a particular page using Selenium and the undetected_chromedriver library. The script reads a list of usernames and passwords from an Excel file (users.xlsx), and then creates a queue of tasks for each user to log in and take a screenshot of a specific web page. The script uses multiple threads to handle the queue, with a maximum number of workers that can be set to the number of users. Once a task is completed, the driver is quit and the next task is started. The output of the script is a set of PDF files with the usernames as the filename.
Issue with the script is that it takes too long (about 5 seconds) between finishing one task and closing its driver, and starting the next one.
I'm not sure the issue is with the run_threads function or the get_ss_hbd function.
I'm looking for ways to optimize this code and reduce runtime.
import undetected_chromedriver as uc
import threading
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException
import pandas as pd
import time
import base64
import os
import queue
# Desktop-Safari user agent so the headless session looks like a normal browser.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--disable-infobars')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--enable-javascript')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('User-Agent={0}'.format(user_agent))
# Fixed: --window-size was set twice ('1920,1080' and the malformed
# '2000x2000' — Chrome expects 'W,H' with a comma); keep the one valid value.
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.add_argument('--kiosk-printing')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', True)
# NOTE(review): chrome_options is never passed to uc.Chrome() in get_ss_hbd,
# so none of this configuration takes effect — confirm whether it should be
# forwarded (undetected_chromedriver accepts an options= argument).
logout = "***********"
# Test with example username and password lists.
# users.xlsx must have 'username' and 'password' columns.
df = pd.read_excel("users.xlsx")
username_list = df['username'].tolist()
password_list = df['password'].tolist()
# Per-browser window dimensions used to tile the windows side by side.
width = 500
height = 600
def login_hbd(driver, username, password):
    """Open the login page and submit the given credentials.

    Waits for the username field to appear, fills both fields, then clicks
    the submit button via JavaScript (avoids click-interception issues).
    """
    driver.get("******")
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.NAME, "username")))
    driver.find_element(By.NAME, "username").send_keys(username)
    driver.find_element(By.NAME, "password").send_keys(password)
    submit = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.ID, "submitBtn")))
    driver.execute_script("arguments[0].click();", submit)
def get_ss_hbd(username, password, x, y, width, height, start_time):
    """Log in as `username` and print the target page to '<username>.pdf'.

    Retries the whole login/print sequence on common Selenium failures.
    Returns the driver object (already quit — see note below).

    NOTE(review): the module-level chrome_options is not passed to uc.Chrome()
    here, so the browser starts with default options — confirm intent.
    NOTE(review): most of the per-task latency is uc.Chrome() startup; reusing
    a driver per worker instead of per task would remove it.
    """
    with uc.Chrome() as driver:
        driver.set_window_position(x, y)
        driver.set_window_size(width, height)
        while True:
            try:
                login_hbd(driver, username, password)
                WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.XPATH, "//a[text()='s']")))
                driver.get("******")
                time.sleep(1.5)
                pdf = driver.execute_cdp_cmd(
                    "Page.printToPDF", {
                        "printBackground": True,
                        "landscape": False,
                        "displayHeaderFooter": False,
                        "scale": 1,
                    })
                with open(f'{username}.pdf', "wb") as f:
                    f.write(base64.b64decode(pdf['data']))
                elapsed_time = time.monotonic() - start_time
                print(f"Thread for {username} finished in {elapsed_time:.2f} seconds.")
                # Fixed: the original had an unreachable `break` after this
                # `return`. Also note: leaving the `with` block quits the
                # driver, so the caller receives an already-quit driver.
                return driver
            except (NoSuchElementException, ElementClickInterceptedException, TimeoutException):
                # Transient failure — retry the whole sequence.
                continue
def run_threads(username_list, password_list, width, height):
    """Run get_ss_hbd for every user on a bounded thread pool.

    Fixed: the original hand-rolled a task queue plus a 0.1 s polling loop to
    throttle submissions; ThreadPoolExecutor already queues pending tasks and
    runs at most `max_workers` at once, so all tasks are submitted up front
    and results are consumed with as_completed() — no busy-waiting.

    NOTE: start_time is captured at submission, so the printed elapsed time
    includes any time the task spent queued waiting for a free worker.
    """
    num_users = len(username_list)
    # Tile each browser window at its own x offset.
    x_positions = range(0, width * num_users, width)
    max_workers = 2  # number of browsers running concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for i in range(num_users):
            start_time = time.monotonic()  # start the timer
            future = executor.submit(
                get_ss_hbd, username_list[i], password_list[i],
                x_positions[i], 0, width, height, start_time)
            futures[future] = username_list[i]
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                result.quit()  # quit the driver when the task is done
            except Exception as exc:
                print(f"generated an exception: {exc}")
            else:
                print(f"Thread for {result} finished.")
run_threads(username_list, password_list, width, height)

Related

Python Selenium for getting the whole content of table in Reactjs

I tried to scrape the content of a table from wyscout.com, which appears to be built with Reactjs.
After log in, script selects the country(e.g. England), League(e.g. Premier League), Team(e.g. Arsenal). Here choose Stats tab.
Then, it shows the table to scrape the data. Even if there is a button to export a excel file, I want to scrape the content manually using selenium or beautifulsoup.
However, script gets only 18 rows even though the number of rows on the table is more than 100.
Please let me know the solution.
Thanks.
Here is my code.
from re import search
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import string
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
# Load Chrome Browser
show_browser = True  # NOTE(review): read nowhere below — presumably a leftover toggle
options = Options()
# options.add_argument('--headless')
scraped_data = []  # NOTE(review): never appended to in the visible code
def bot_driver(url, user_name, user_password):
    """Start Chrome, open `url`, log in with the given credentials and return
    the driver.

    Dismisses the "force login" dialog if one appears (shown when the account
    is already logged in elsewhere).
    """
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.get(url)
    driver.maximize_window()
    time.sleep(2)
    # Log in
    time.sleep(10)
    # Fixed: XPath attribute tests must use '@' — '[#id=...]' is invalid XPath.
    idd = driver.find_element_by_xpath("//input[@id='login_username']")
    idd.send_keys(user_name)
    passW = driver.find_element_by_xpath("//input[@id='login_password']")
    passW.send_keys(user_password)
    time.sleep(2)
    submit = driver.find_element_by_xpath("//button[@id='login_button']")
    submit.click()
    time.sleep(10)
    try:
        force_login = driver.find_element_by_xpath("//button[@class='btn2_zFM sc-jDwBTQ cUKaFo -block3u2Qh -primary1dLZk']")
        force_login.click()
        print('force loging')
    except Exception:
        # Narrowed from a bare `except:`; absence of the dialog is expected.
        print('force login error')
    return driver
def select_country(driver, country_name):
    """Click the country list entry whose text equals `country_name`.

    Returns (driver, 1) on success, (driver, 0) if no entry matched.
    """
    # Fixed: '[@id=...]' — the attribute test was mangled to '#id', which is
    # invalid XPath and can never match.
    list_country = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//div[@id='detail_0_home_navy']/div[1]/div/div")))
    time.sleep(3)
    for entry in list_country:
        if country_name == entry.text:
            print('country click here')
            entry.click()
            return driver, 1
    return driver, 0
def select_league(driver, league_name):
    """Click the league list entry whose text equals `league_name`.

    Returns (driver, 1) on success, (driver, 0) if no entry matched.
    """
    # Fixed: '[@id=...]' — '#id' is invalid XPath syntax.
    list_league = driver.find_elements_by_xpath("//div[@id='detail_0_area_navy_0']/div[1]/div/div")
    for entry in list_league:
        if league_name == entry.text:
            entry.click()
            return driver, 1
    return driver, 0
def select_team(driver, team_names):
    """For each team in the list whose name is in `team_names`, open its
    Stats tab, report the number of table rows, and navigate back.

    Returns (driver, 1) if at least one team matched, else (driver, 0).
    NOTE(review): nesting reconstructed from flattened source — the Stats
    scraping is assumed to run only for matched teams; confirm.
    """
    flag_team = 0
    # Fixed: '@' instead of '#' in the attribute tests below (invalid XPath).
    list_team = driver.find_elements_by_xpath("//div[@id='detail_0_competition_navy_0']/div[1]/div/div")
    for entry in list_team:
        if entry.text in team_names:
            flag_team = 1
            print('selected team = ', entry.text)
            entry.click()
            time.sleep(2)
            # Stats
            stats = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, 'Stats')))
            stats.click()
            time.sleep(3)
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//div[@id='detail_0_team_stats']/div/div/div/main/div[3]/div[2]/div/table")))
            content_stats = driver.page_source
            soup_stats = BeautifulSoup(content_stats, "html.parser")
            table_stats = soup_stats.find('table', attrs={'class': 'teamstats__Index-module__table___1K93L teamstats__Index-module__with-opp___16Rp5'})
            tbody_stats = table_stats.find('tbody')
            tr_stats = tbody_stats.find_all('tr')
            # Only the rows rendered so far are present (React virtualizes the
            # table); scrolling is needed to load the rest — see the answer.
            print('number of tr = ', len(tr_stats))
            # Return to team selection
            back_team = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//div[@id='detail_0_team_back']")))
            back_team.click()
            time.sleep(5)
    if flag_team == 1:
        return driver, 1
    else:
        return driver, 0
if __name__ == "__main__":
    # User input
    # Login - wyscout_url = 'https://wyscout.com/'
    wyscout_url = 'https://platform.wyscout.com/app/?/'
    wyscout_user_name = '' # username
    wyscout_user_password = '' # password
    wyscout_driver = bot_driver(wyscout_url, wyscout_user_name, wyscout_user_password)
    time.sleep(10)
    # Select a Country
    country = 'England' # .upper()
    wyscout_driver, succeed = select_country(wyscout_driver, country)
    # NOTE(review): on failure only a message is printed; the script continues.
    if succeed == 0:
        print('NO country!')
    time.sleep(7)
    # Select a league
    league = 'Premier League' # .upper()
    wyscout_driver, succeed = select_league(wyscout_driver, league)
    if succeed == 0:
        print('NO League!')
    time.sleep(7)
    # Select team
    team_options = ['Arsenal']
    wyscout_driver, succeed = select_team(wyscout_driver, team_options)
    time.sleep(7)
    if succeed == 0:
        print('NO Team!')
    time.sleep(7)
    print('!!!Wyscout END!!!')
    # wyscout_driver.quit()
Finally I figured it out by myself.
Here is my solution.
# Scroll down
# NOTE(review): fragment from the author's answer — it relies on `driver` and
# `table_stats` already being in scope (inside select_team). It scrolls the
# table element repeatedly until its scrollHeight stops growing, so the
# virtualized React table renders all of its rows before scraping.
print('scroll down')
last_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
time.sleep(3)
while True:
    driver.execute_script("arguments[0].scrollBy(0,arguments[0].scrollHeight)", table_stats)
    time.sleep(5)
    new_height = driver.execute_script("return arguments[0].scrollHeight;", table_stats)
    # Height unchanged => no more rows were loaded by the last scroll.
    if new_height == last_height:
        break
    last_height = new_height
print('scroll end')

Selenium Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'

I'm currently working on a python bot program that grabs 3 way odds (home, draw, away) of the same soccer match (same clubs) from two different bookie websites, then compares the odds and keeps the larger ones; the larger odds are finally passed into an arbitrage calculator function named oddchecker.py to get the total implied probability. For example: Osasuna vs Barcelona, bookie site one has the three odds as: 4.60, 3.60, 1.90 and bookie two has odds as 4.70, 3.60, 1.87, so my bot will grab 4.70, 3.60, 1.90.
I'm using python and selenium for this project.
The bot is made up of four functions, namely ab2.py, ab.py, main.py, and oddchecker.py
Have imported ab2.py, ab.py and oddchecker.py into main.py file main.py is my main function to run the entire bot code.
ab2.py queries one bookie site and ab.py queries the other bookie site.
My code is able to grab the odds from the two sites and perform the comparison, however it only prints home odd list(i'm storing the odds in a list for comparison purposes) for the two teams when i run the main.py, then code breaks and its unable to go back and grab for odds for draw and away.
Below is the error that i'm getting. Have done research regarding the error, and it seems that am making too many requests to the target server, i'm not sure if its a localhost port overuse or the target bookie servers. They say resolution is to implement sleep waits using time.sleep(10) module, in my code how can i implement that.
HTTPConnectionPool(host='localhost', port=58202): Max retries exceeded with url: /session/1eaec216e17f72e3a4f30235093e0021/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001F16BC30100>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
None
Below is my code
ab2.py
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for implicit and explict waits
from selenium.webdriver.chrome.options import Options # for suppressing the browser
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument('headless')  # run the browser without a window
# Fixed: raw string — '\P' and '\c' in a normal string literal are invalid
# escape sequences (DeprecationWarning; a future Python makes them errors).
PATH = r"C:\Program Files (x86)\chromedriver.exe"
# NOTE(review): a single module-level driver is created here and quit inside
# betika_func's `finally`, so the function is single-use per process.
driver = webdriver.Chrome(PATH, options=option)
import time
def betika_func():
    """Scrape the first fixture's team names and 3-way odds from Betika.

    Returns a dict with keys 'btk_home', 'btk_draw', 'btk_away' (string odds
    as scraped from innerHTML). The module-level driver is quit in `finally`,
    so a second call in the same process fails — this is the likely source of
    the "connection refused" error when called repeatedly from main.py.
    """
    web_betika = 'https://www.betika.com/s/soccer/spain-laliga'
    driver.get(web_betika)
    driver.implicitly_wait(3)
    try:
        # Fixed: XPath attribute tests must use '@' ('#class' is invalid).
        btk_team1 = driver.find_element(By.XPATH, '//div[@class="prebet-match"]//div[@class="prebet-match__odd-market__container"]//div[@class="prebet-match__teams"]//span[@class="prebet-match__teams__home"]').get_attribute('innerHTML')
        btk_team2 = driver.find_element(By.XPATH, '//div[@class="prebet-match"]//div[@class="prebet-match__odd-market__container"]//div[@class="prebet-match__teams"]/span[2]').get_attribute('innerHTML')
        odds = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'prebet-match__odd__odd-value'))
        )
        hw = odds[0].get_attribute('innerHTML')
        draw = odds[1].get_attribute('innerHTML')
        aw = odds[2].get_attribute('innerHTML')
        print(btk_team1)
        print(btk_team2)
        # Fixed: the original bound the (always-None) return value of print()
        # to variables; just print.
        print('Betika home odd: '+hw)
        print('Betika draw odd: '+draw)
        print('Betika away odd: '+aw)
        betika_dict = {
            'btk_home': hw,
            'btk_draw': draw,
            'btk_away': aw
        }
        return betika_dict
    finally:
        driver.quit()
# Testing the function
# response = betika_func()
# print(response)
ab.py code
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for implicit and explict waits
from selenium.webdriver.chrome.options import Options # for suppressing the browser
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument('headless')  # run the browser without a window
# Fixed: raw string — '\P' and '\c' are invalid escape sequences in a normal
# string literal (DeprecationWarning; a future Python makes them errors).
PATH = r"C:\Program Files (x86)\chromedriver.exe"
# NOTE(review): module-level driver, quit inside odi_func's `finally`, so the
# function is single-use per process.
driver = webdriver.Chrome(PATH, options=option)
def odi_func():
    """Scrape the first fixture's team names and 3-way odds from Odibets.

    Returns a dict with keys 'odi_home', 'odi_draw', 'odi_away' (string odds
    as scraped from innerHTML). The module-level driver is quit in `finally`,
    so a second call in the same process fails — the likely source of the
    "connection refused" error when main.py calls this repeatedly.
    """
    web_odi = 'https://odibets.com/'
    driver.get(web_odi)
    driver.implicitly_wait(3)
    try:
        # (Commented-out login-form / league-navigation experiments removed.)
        # Get all clubs in the page.
        # Fixed: XPath attribute tests must use '@' ('#class' is invalid).
        teams = driver.find_elements(By.XPATH, '//div[2][@class="l-events-games"]//div[@class="l-events-games-matches"]//div[@class="l-games-event"]//a[@class="inf"]//div[@class="t-i"]')
        # Get the first two club names in the page
        odi_team1 = teams[0].get_attribute("innerHTML")
        odi_team2 = teams[1].get_attribute("innerHTML")
        print(odi_team1)
        print(odi_team2)
        # Get three way odds
        odds = driver.find_elements(By.XPATH, '//div[2][@class="l-events-games"]//div[@class="l-events-games-matches"]//div[@class="l-games-event"]//div[@class="mar-cont"]//span[@class="b"]')
        odi_hw = odds[0].get_attribute("innerHTML")
        odi_draw = odds[1].get_attribute("innerHTML")
        odi_aw = odds[2].get_attribute("innerHTML")
        # Fixed: the original bound the (always-None) return value of print()
        # to variables; just print.
        print('Odi home odd is: '+odi_hw)
        print('Oddi draw odd is: '+odi_draw)
        print('Odi away odd is: '+odi_aw)
        odi_dict = {
            'odi_home': odi_hw,
            'odi_draw': odi_draw,
            'odi_away': odi_aw
        }
        return odi_dict
    finally:
        driver.quit()
# Testing the function
# response = odi_func()
# print(response)
oddchecker.py code
def func(odd1, odd2, odd3):
    """Return the total implied probability (%) of a 3-way market.

    Each odd is converted to its implied probability 100/odd, rounded to two
    decimals, printed, and the rounded values are summed (and rounded again).
    A total at or below 100 indicates an arbitrage opportunity.
    """
    odds = (odd1, odd2, odd3)
    implied = [round((1 / odd) * 100, 2) for odd in odds]
    for odd, prob in zip(odds, implied):
        print('Implied prob for: '+str(odd)+' is '+str(prob))
    # print('Total implied odd market probability is: '+str(totalimpliedprob))
    return round(sum(implied), 2)
# Testing the arbitrage function
# response = func(large_home_odd, large_draw_odd, large_away_odd)
# print(response)
Main.py code
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # for implicit and explict waits
from selenium.webdriver.chrome.options import Options # for suppressing the browser
from selenium.webdriver.support import expected_conditions as EC
option = webdriver.ChromeOptions()
option.add_argument('headless')  # run the browser without a window
# Fixed: raw string — '\P' and '\c' are invalid escape sequences in a normal
# string literal (DeprecationWarning; a future Python makes them errors).
PATH = r"C:\Program Files (x86)\chromedriver.exe"
# NOTE(review): this driver is never used in main.py — ab.py/ab2.py create
# their own; confirm whether this instance can be removed.
driver = webdriver.Chrome(PATH, options=option)
import time
import ab2 as betika
import ab as odi
import oddchecker as od
def main_func():
    """Collect home/draw/away odds from both bookies, pick the larger of
    each, and feed them to the arbitrage calculator.

    Returns None (results are printed).
    """
    try:
        home_list = []
        draw_list = []
        away_list = []
        time.sleep(20)
        # innerHTML odds are strings; convert to float so max() compares
        # numerically (string max() compares lexicographically, e.g. '9' > '10').
        odi_hm = float(odi.odi_func().get('odi_home'))
        btk_hm = float(betika.betika_func().get('btk_home'))
        home_list.append(odi_hm)
        home_list.append(btk_hm)
        print(home_list)
        time.sleep(20)
        odi_dr = float(odi.odi_func().get('odi_draw'))
        btk_dr = float(betika.betika_func().get('btk_draw'))
        # Fixed: the draw odd from odi was appended to home_list.
        draw_list.append(odi_dr)
        draw_list.append(btk_dr)
        print(draw_list)
        time.sleep(20)
        odi_aw = float(odi.odi_func().get('odi_away'))
        btk_aw = float(betika.betika_func().get('btk_away'))
        # Fixed: away odds went into home_list/draw_list, leaving away_list
        # empty — max(away_list) then raised ValueError, silently swallowed
        # by the broad except below (why the run "broke" after home odds).
        away_list.append(odi_aw)
        away_list.append(btk_aw)
        print(away_list)
        # Get the largest odd per outcome
        large_home_odd = max(home_list)
        large_draw_odd = max(draw_list)
        large_away_odd = max(away_list)
        # Pass the three maximum odds to the arbitrage calculator, to check
        # whether total implied probability is <= 100 percent
        final_response = od.func(large_home_odd, large_draw_odd, large_away_odd)
        print(final_response)
        print(str(large_home_odd))
    except Exception as e:
        print(e)
response = main_func()
print(response)

Python Selenium with multiprocessing causing runaway memory in chrome.exe processes

I was hoping someone could take a look at my code and explain to me why I am seeing a runaway memory issue in chrome.exe processes. When I run the program everything seems stable for a few hours, but after around 8 hours I will have a single chrome.exe process that consumes around 5Gb of memory. The application is fairly simple. for each item that I want to search, a new process is created. Inside that process I create a single driver instance and then search for an element. If the element isn't present then I refresh the driver and continue searching. Here is a generic sample of my code.
import time
from multiprocessing import Process
import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions as SE
import sendMail
class itemSearch(Process):
    """Searches a single URL in its own process until a target element
    appears, then sends a notification e-mail."""

    def __init__(self, item):
        Process.__init__(self)
        self.item = item
        print("Starting Search for: "+str(self.item))
        # NOTE(review): starting the process from __init__ is unconventional;
        # callers normally invoke .start() themselves.
        self.start()

    def run(self):
        """Create the driver inside the child process and poll until done.

        The driver must be created here rather than in __init__: Process
        objects are pickled to the child on Windows, and a live WebDriver
        handle cannot be pickled (the PermissionError the original comment
        described).
        """
        options = Options()
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=20)
        self.session = self.driver.session_id
        self.driver.get(self.item)
        # Fixed: this assignment was commented out, so `while not self.done`
        # raised AttributeError on the first iteration.
        self.done = False
        while not self.done:
            self.search()
        # Fixed: quit() instead of close() — close() only closes the window
        # and leaves the chromedriver/chrome processes (and their memory).
        self.driver.quit()

    def search(self):
        """Refresh the page until the target button exists, then click it and
        send the notification e-mail."""
        while True:
            try:
                print("Scanning for: "+str(self.item))
                self.driver.find_element_by_xpath('//div[some xpath to a button]').click()
                print("sending email")
                url = self.driver.current_url
                sendMail.sendNotification(receiver_email="yourmail.com", url=url)
                break
            except SE.NoSuchElementException:
                print("Refreshing")
                self.driver.refresh()
                print(dt.datetime.now())
                # NOTE(review): this wait can raise TimeoutException, which is
                # not caught here and would kill the process — confirm intent.
                self.wait.until(EC.visibility_of_element_located((By.XPATH,'//div[some other xpath]')))
        self.done = True
if __name__ == '__main__':
    url1 = "https://www.somesite.com"
    # Fixed: the url2 literal was missing its closing quote (SyntaxError).
    url2 = "https://www.someothersite.com"
    searchItems = [url1, url2]
    print("Starting search")
    # One process (and one Chrome instance) per URL.
    for item in searchItems:
        print(item)
        itemSearch(item)
As a work-around I added a function that check memory usage for all chrome.exe processes. The check is run on each loop iteration. I have set a max memory limit and once that limit is reached I close the chrome driver and make a call to the run function again. This is actually working very well for me. Here's the new code with the function incorporated:
import time
import psutil
from multiprocessing import Process
import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions as SE
import sendMail
class itemSearch(Process):
    """Searches a single URL in its own process, restarting the driver when
    total chrome.exe memory exceeds a threshold (runaway-memory workaround)."""

    def __init__(self, item):
        Process.__init__(self)
        self.item = item
        print("Starting Search for: "+str(self.item))
        # NOTE(review): starting the process from __init__ is unconventional;
        # callers normally invoke .start() themselves.
        self.start()

    def run(self):
        """Create the driver inside the child process and poll until done.

        The driver must be created here rather than in __init__: Process
        objects are pickled to the child on Windows, and a live WebDriver
        handle cannot be pickled (the PermissionError the original comment
        described).
        """
        options = Options()
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=20)
        self.session = self.driver.session_id
        self.driver.get(self.item)
        # Fixed: this assignment was commented out, so `while not self.done`
        # raised AttributeError on the first iteration.
        self.done = False
        while not self.done:
            self.search()
        # Fixed: quit() instead of close() — close() only closes the window
        # and leaves the chromedriver/chrome processes running.
        self.driver.quit()

    def getMemoryUsage(self):
        "Return the MB of ram being used by chrome."
        process_list = []
        total_mem = 0
        for p in psutil.process_iter(['name']):
            if p.info['name'] == "chrome.exe":
                process_list.append(p.pid)
        # Calculate total memory usage
        for pid in process_list:
            try:
                # NOTE(review): memory_info().private exists on Windows only;
                # use .rss for a cross-platform figure.
                total_mem += psutil.Process(pid).memory_info().private
            except psutil.NoSuchProcess:
                # Process exited between enumeration and inspection.
                pass
        return total_mem/1000000

    def search(self):
        """Refresh until the target button exists; restart the driver if the
        chrome.exe memory total crosses the cap."""
        while True:
            try:
                print("Scanning for: "+str(self.item))
                self.driver.find_element_by_xpath('//div[some xpath to a button]').click()
                print("sending email")
                url = self.driver.current_url
                sendMail.sendNotification(receiver_email="yourmail.com", url=url)
                break
            except SE.NoSuchElementException:
                print("Refreshing")
                self.driver.refresh()
                print(dt.datetime.now())
                self.wait.until(EC.visibility_of_element_located((By.XPATH,'//div[some other xpath]')))
                memUsage = self.getMemoryUsage()
                print("Current Memory Usage at: "+str(memUsage)+"MB")
                if memUsage > 7000:
                    # Fixed: `logger` was never defined/configured, so the
                    # original raised NameError exactly when the cap was hit;
                    # print instead (the commented print the author left).
                    print("Memory Usage reached " +str(memUsage) +"MB. Restarting driver")
                    self.driver.quit()
                    # NOTE(review): recursing into run() grows the stack on
                    # every restart; a loop-based restart would be safer.
                    self.run()
        self.done = True
if __name__ == '__main__':
    url1 = "https://www.somesite.com"
    # Fixed: the url2 literal was missing its closing quote (SyntaxError).
    url2 = "https://www.someothersite.com"
    searchItems = [url1, url2]
    print("Starting search")
    # One process (and one Chrome instance) per URL.
    for item in searchItems:
        print(item)
        itemSearch(item)

Python, Selenium and Chromedriver

I'm trying to test out chromedriver and headless chrome on this site.
https://car.gocompare.com/vehicle
However when i try with normal chrome it works fine, I'll get a response for a car reg I've put in.
When I use headless chrome it says car cannot be found.
Does anyone know what could be up with it, is it the driver, or the website that is not producing the results back, it seems to work with firefox, so its a little strange.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
# Ability to run headless
from selenium.webdriver.firefox.options import Options as f_Options
from selenium.webdriver.chrome.options import Options as c_Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
# This allows you to download the page
from parsel import Selector
import time
import datetime
import os
class headlessbypass:
    """Drives GoCompare's vehicle lookup with a configurable (optionally
    headless) browser and saves screenshots of the result pages."""

    # Timestamp captured at class-definition time (available for filenames).
    my_date_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

    def my_set_up(self):
        """Executed before running, i.e. opening browser"""
        # This is required for running on the pipeline
        headless = os.getenv('HEADLESS_MODE')

        def firefox_headless_func():
            self.options = f_Options()
            self.options.headless = True
            binary = FirefoxBinary('c:/Users/Anish/AppData/Local/Mozilla Firefox/firefox.exe')
            self.driver = webdriver.Firefox(firefox_binary=binary, executable_path='bin/geckodriver.exe', options=self.options)

        def chrome_headless_func():
            self.options = c_Options()
            # Fixed: the original passed "--window-size=1920, 1080" — the
            # space makes the value malformed; Chrome expects "W,H".
            self.options.add_argument("--window-size=1920,1080")
            self.options.add_argument('--headless')
            self.options.add_argument('--disable-gpu')
            self.options.add_argument("--enable-javascript")
            # Spoof a desktop Firefox UA — headless Chrome's default UA is a
            # common reason sites serve different (broken) results.
            self.options.add_argument("--user-agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) Gecko/20100101 Firefox/72.0'")
            self.driver = webdriver.Chrome(options=self.options, executable_path='bin/chromedriver')

        # This is for running locally; select/toggle what you want to run
        headless_firefox = 0
        headless_chrome = 0
        chrome = 1
        safari = 0
        if headless:
            firefox_headless_func()
        else:
            if headless_firefox:
                firefox_headless_func()
            elif headless_chrome:
                chrome_headless_func()
            elif chrome:
                self.driver = webdriver.Chrome(executable_path='bin/chromedriver.exe')
            else:
                self.driver = webdriver.Firefox(executable_path='bin/geckodriver.exe')
        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        main_window = self.driver.current_window_handle
        self.driver.switch_to.window(main_window)

    def my_tear_down(self):
        """Executed after running, i.e. closing browser"""
        self.driver.quit()

    def my_decorator(func):
        """my_set_up and my_tear_down decorator, so that my_set_up is run before and my_tear_down is run after"""
        def wrapper(self, *args, **kwargs):
            self.my_set_up()
            func(self, *args, **kwargs)
            self.my_tear_down()
        return wrapper

    # Fixed: the decorator marker had been mangled to a comment
    # ('#my_decorator'), so visit_site ran without my_set_up and self.driver
    # was never created.
    @my_decorator
    def visit_site(self):
        """Extract quotes"""
        self.driver.get("https://mygocompare.gocompare.com/newcustomer/")
        time.sleep(2)
        print(self.driver.page_source)
        # Enter registration number
        reg_field = self.driver.find_element(By.XPATH, "//fieldset[1]/div[2]/div[2]/div/input")
        reg_field.send_keys("AK47")
        time.sleep(5)
        print("Take screenshot")
        html = self.driver.find_element_by_tag_name('html')
        html.send_keys(Keys.PAGE_UP)
        self.driver.save_screenshot("csv_json_files/firstpagescreenshot.png")
        self.driver.find_element(By.XPATH, "//span[contains(text(), 'Find car')]").click()
        time.sleep(2)
        print("Take screenshot")
        html = self.driver.find_element_by_tag_name('html')
        html.send_keys(Keys.PAGE_UP)
        self.driver.save_screenshot("csv_json_files/firstpagescreenshot2.png")
if __name__ == '__main__':
    start_time = time.time()  # NOTE(review): captured but never read afterwards
    scrape = headlessbypass()
    scrape.visit_site()

Selenium Web Scraping Id missing

I am trying to gather the data from the http://maharain.gov.in/ site. I have written the below script.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
import csv,sys,os
from bs4 import BeautifulSoup
import time
def check_exists_by_xpath(xpath, driver):
    """Return True if `driver` can locate an element at `xpath`, else False."""
    try:
        driver.find_element_by_xpath(xpath)
        return True
    except NoSuchElementException:
        return False
chromepath = '/home/swapnil/Downloads/chromedriver'
driver = webdriver.Chrome(chromepath)
start_time = time.time()
base_url = "http://maharain.gov.in/"
driver.get(base_url)
# The page is frame-based: navigate the menu frame first.
driver.switch_to.frame('MenuFrame')
driver.find_element_by_name("QueriesCirclewise3").click()
time.sleep(3)
print("Done")
# Fixed: XPath attribute tests must use '@' — '[#id=...]' is invalid XPath.
driver.find_element(By.XPATH, '//*[@id="menu"]/input[10]').click()
time.sleep(3)
print("Done")
# driver.find_element_by_name("PastQueriesCirclewise6").click()
time.sleep(3)
print("Done")
# driver.implicitly_wait(3)
driver.switch_to.default_content()
time.sleep(3)
print("Done")
driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
# (Year dropdown handling kept commented out, as in the original.)
# dropdown_menu_year = Select(driver.find_element_by_id("selyear"))
# select_year = [option.text for option in dropdown_menu_year.options]
# select_year = [t for t in select_year if t != 'Select']
# select_year = ['2015', '2016', '2017']
time.sleep(3)
print("Done All ")
# Collect the option texts of each dropdown, dropping the 'Select' placeholder.
dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
select_state = [option.text for option in dropdown_menu_state.options if option.text != 'Select']
dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
select_dist = [option.text for option in dropdown_menu_dist.options if option.text != 'Select']
dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
select_mon = [option.text for option in dropdown_menu_month.options if option.text != 'Select']
i = 0
year = str(2018)
# for year in select_year:
if not os.path.exists(year):
    os.makedirs(year)
for state in select_state:
    for dist in select_dist:
        if not os.path.exists(year + '/' + dist):
            os.makedirs(year + '/' + dist)
        for month in select_mon:
            print(i)
            # Re-select state and district every iteration; the page rebuilds
            # the dependent dropdowns after each selection.
            dropdown_menu_state = Select(driver.find_element_by_id("selstate"))
            dropdown_menu_state.select_by_visible_text(state)
            time.sleep(1)
            dropdown_menu_dist = Select(driver.find_element_by_id("seldist"))
            dropdown_menu_dist.select_by_visible_text(dist)
            # NOTE(review): nesting reconstructed from flattened source — the
            # download appears restricted to these districts; confirm intent.
            if dist == 'Wardha' or dist == 'Washim' or dist == 'Yavatmal':
                dropdown_menu_month = Select(driver.find_element_by_id("selmonth"))
                dropdown_menu_month.select_by_visible_text(month)
                time.sleep(2)
                driver.find_element_by_name("btnshow").click()
                time.sleep(2)
                print("Done")
                # The result table is rendered inside an embedded document.
                driver.switch_to.frame(driver.find_element(By.CSS_SELECTOR, 'body > embed'))
                # Fixed: '@id' — '[#id=...]' is invalid XPath and never matches.
                if check_exists_by_xpath('//*[@id="tableID"]', driver):
                    tab = driver.find_element(By.XPATH, '//*[@id="tableID"]')
                    soup = BeautifulSoup(driver.page_source, "html.parser")
                    table = soup.select_one('table')
                    data = [[td.text for td in row.find_all("td")] for row in table.find_all("tr")]
                    file_name = year + '/' + dist + '/' + year + '_' + dist + '_' + month + '.csv'
                    print(file_name)
                    # Fixed: use a context manager so the file is closed even
                    # if writerows raises.
                    with open(file_name, 'w', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerows(data)
                i += 1
                # Return to the content frame for the next selection.
                driver.switch_to.default_content()
                driver.switch_to.frame(driver.find_element_by_name("ContentFrame"))
print(time.time() - start_time)
print(i)
But each time I run the code it gets stuck at different locations with errors like missing selector id "selstate" or "body > embed" not present, which may run correctly in the next run without any changes to the code and may get stuck at the different location.
I have tried adding driver implicit wait and thread sleep with value set to 5 and less.Please point out what should be the correct measure to make it run in one go and where should be the wait or sleep statements to be added or any other changes if required.

Resources