I'm trying to test out chromedriver and headless Chrome on this site:
https://car.gocompare.com/vehicle
With normal Chrome it works fine: I get a response for the car reg I've put in. With headless Chrome it says the car cannot be found.
Does anyone know what could be up with it? Is it the driver, or is the website not returning the results? It also seems to work with Firefox, so it's a little strange.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
# Ability to run headless
from selenium.webdriver.firefox.options import Options as f_Options
from selenium.webdriver.chrome.options import Options as c_Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
# This allows you to download the page
from parsel import Selector
import time
import datetime
import os
class headlessbypass:
    my_date_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

    def my_set_up(self):
        """Executed before running, i.e. opening the browser"""
        # This is required for running on the pipeline
        headless = os.getenv('HEADLESS_MODE')

        def firefox_headless_func():
            self.options = f_Options()
            self.options.headless = True
            binary = FirefoxBinary('c:/Users/Anish/AppData/Local/Mozilla Firefox/firefox.exe')
            self.driver = webdriver.Firefox(firefox_binary=binary, executable_path='bin/geckodriver.exe', options=self.options)

        def chrome_headless_func():
            self.options = c_Options()
            #self.options.headless = True
            self.options.add_argument("--window-size=1920,1080")
            #self.options.add_argument("--disable-extensions")
            #self.options.add_argument("--proxy-server='direct://'")
            #self.options.add_argument("--proxy-bypass-list=*")
            #self.options.add_argument("--start-maximized")
            self.options.add_argument('--headless')
            self.options.add_argument('--disable-gpu')
            #self.options.add_argument('--disable-dev-shm-usage')
            #self.options.add_argument('--no-sandbox')
            #self.options.add_argument('--ignore-certificate-errors')
            #self.options.add_argument("--allow-insecure-localhost")
            #self.options.add_argument("--allow-running-insecure-content")
            #self.options.add_argument('--disable-browser-side-navigation')
            self.options.add_argument("--enable-javascript")
            self.options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) Gecko/20100101 Firefox/72.0")
            #self.options.binary_location = "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe"
            self.driver = webdriver.Chrome(options=self.options, executable_path='bin/chromedriver')

        # This is for running locally; select/toggle what you want to run
        headless_firefox = 0
        headless_chrome = 0
        chrome = 1
        safari = 0

        if headless:
            firefox_headless_func()
        else:
            if headless_firefox:
                firefox_headless_func()
            elif headless_chrome:
                chrome_headless_func()
            elif chrome:
                self.driver = webdriver.Chrome(executable_path='bin/chromedriver.exe')
            else:
                self.driver = webdriver.Firefox(executable_path='bin/geckodriver.exe')

        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        main_window = self.driver.current_window_handle
        self.driver.switch_to.window(main_window)

    def my_tear_down(self):
        """Executed after running, i.e. closing the browser"""
        self.driver.quit()

    def my_decorator(func):
        """my_set_up and my_tear_down decorator, so that my_set_up runs before and my_tear_down runs after the decorated method"""
        def wrapper(self, *args, **kwargs):
            self.my_set_up()
            func(self, *args, **kwargs)
            self.my_tear_down()
        return wrapper

    @my_decorator
    def visit_site(self):
        """Extract quotes"""
        self.driver.get("https://mygocompare.gocompare.com/newcustomer/")
        time.sleep(2)
        print(self.driver.page_source)

        # Enter registration number
        reg_field = self.driver.find_element(By.XPATH, "//fieldset[1]/div[2]/div[2]/div/input")
        reg_field.send_keys("AK47")
        time.sleep(5)

        print("Take screenshot")
        html = self.driver.find_element_by_tag_name('html')
        html.send_keys(Keys.PAGE_UP)
        self.driver.save_screenshot("csv_json_files/firstpagescreenshot.png")

        self.driver.find_element(By.XPATH, "//span[contains(text(), 'Find car')]").click()
        time.sleep(2)

        print("Take screenshot")
        html = self.driver.find_element_by_tag_name('html')
        html.send_keys(Keys.PAGE_UP)
        self.driver.save_screenshot("csv_json_files/firstpagescreenshot2.png")


if __name__ == '__main__':
    start_time = time.time()
    scrape = headlessbypass()
    scrape.visit_site()
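A quick check that sometimes helps narrow down this kind of headless/headed discrepancy (an editorial suggestion, not from the original post): print the user agent the browser actually reports in both modes, since headless Chrome identifies itself as "HeadlessChrome" unless the --user-agent override above takes effect, and some sites respond differently to it. A minimal sketch, reusing the driver path layout from the question:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def report_user_agent(headless):
    opts = Options()
    if headless:
        opts.add_argument('--headless')
    driver = webdriver.Chrome(options=opts, executable_path='bin/chromedriver.exe')
    try:
        # navigator.userAgent is what the site sees
        print("headless=%s -> %s" % (headless, driver.execute_script("return navigator.userAgent;")))
    finally:
        driver.quit()

report_user_agent(headless=False)
report_user_agent(headless=True)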
Why am I getting this, and how do I give my server permission to access it?
Every time I run:
action = browser.find_element_by_xpath('/html/body/div[1]/header[1]/div/div[1]/div[2]/a[1]')
action.click()
it results in: [1128/225708.842:ERROR:web_contents_delegate.cc(239)] WebContentsDelegate::CheckMediaAccessPermission: Not supported.
I am using uvicorn to run the server locally.
This file is the entry point for the server: main.py
from fastapi import FastAPI
import TrelloBoard

app = FastAPI()


@app.get("/test")
def read_item():
    return TrelloBoard.trello_board_main()
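For reference, an app like this is typically served with uvicorn; the sketch below is the programmatic equivalent of running "uvicorn main:app" from the command line (module and app names are taken from main.py above, host and port are uvicorn's defaults):

# run_server.py
import uvicorn

if __name__ == "__main__":
    # GET http://127.0.0.1:8000/test will call read_item() and run the Selenium code
    uvicorn.run("main:app", host="127.0.0.1", port=8000)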
This file contains the Selenium logic that goes to the Trello website: TrelloBoard.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as Options_firefox
from selenium.webdriver.chrome.options import Options as Options_chrome
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime
import linecache
import traceback
import os
from os.path import join, dirname
from dotenv import load_dotenv
script_config_file = "trello-boards-info"
log_file = "trello-scraping.log"
dotenv_path = join(dirname(__file__), '.env')
load_dotenv(dotenv_path)
def log(print_this):
    # print to console
    print(print_this)
    # print to file
    print(print_this, file=open(log_file, "a"))


def setup():
    global browser
    log("timestamp: %s" % datetime.now())
    # first line in file contains config for script
    # (read_data_from_file is a helper defined elsewhere; it returns the given line of the file)
    settings = read_data_from_file(script_config_file, 1).split(';')
    # get driver and headless setting, make reading all lower case to avoid caps issues
    driver_setting = settings[0].lower().split('driver=')[1]
    headless_setting = settings[1].lower().split('headless=')[1]
    # Configure firefox
    if driver_setting == 'firefox':
        DRIVER_PATH = './geckodriver.exe'
        firefox_options = Options_firefox()
        if headless_setting == 'true':
            firefox_options.headless = True
        else:
            firefox_options.headless = False
        browser = webdriver.Firefox(executable_path=DRIVER_PATH, options=firefox_options)
    # Configure chrome
    elif driver_setting == "chrome":
        DRIVER_PATH = './chromedriver.exe'
        chrome_options = Options_chrome()
        if headless_setting == 'true':
            chrome_options.add_argument("--headless")
            # need to add this otherwise will occasionally get error 'element not interactable'
            chrome_options.add_argument("--window-size=1920,1080")
        else:
            chrome_options.add_argument("--None")
        browser = webdriver.Chrome(executable_path=DRIVER_PATH, options=chrome_options)
    else:
        driver_setting = "unrecognised driver"
    log("Driver = %s, Headless mode = %s" % (driver_setting, headless_setting))


def trello2():
    browser.get('https://trello.com')
    log("Go to site: %s" % browser.title)
    # read login info from environment variables
    my_email = os.environ.get('TRELLO_BOARD_USERNAME')
    my_pass = os.environ.get('TRELLO_BOARD_PASSWORD')
    log("from file: my_email: %s, my_pass: ***" % my_email)
    # find login
    action = browser.find_element_by_xpath('/html/body/div[1]/header[1]/div/div[1]/div[2]/a[1]')
    action.click()
    browser.close()
    return "trello"


def trello_board_main():
    try:
        log("--- start program ---")
        setup()
        jsonData = trello2()
        return jsonData
    except Exception as print_error:
        # TODO: how to combine print to file and stderr?
        traceback.print_exception(type(print_error), print_error, print_error.__traceback__, file=open("trello-scraping.log", "a"))
        traceback.print_exception(type(print_error), print_error, print_error.__traceback__)
        return print_error
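TrelloBoard.py pulls the credentials from environment variables loaded via python-dotenv, so a .env file along these lines (placeholder values) is assumed to sit next to the script:

TRELLO_BOARD_USERNAME=you@example.com
TRELLO_BOARD_PASSWORD=your-password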
This file is the config for the chromedriver: trello-board-info
driver=chrome;headless=true
I was hoping someone could take a look at my code and explain why I am seeing a runaway memory issue in chrome.exe processes. When I run the program everything seems stable for a few hours, but after around 8 hours a single chrome.exe process is consuming around 5 GB of memory. The application is fairly simple: for each item I want to search, a new process is created. Inside that process I create a single driver instance and then search for an element. If the element isn't present, I refresh the driver and continue searching. Here is a generic sample of my code.
import time
from multiprocessing import Process
import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions as SE
import sendMail
class itemSearch(Process):
    def __init__(self, item):
        Process.__init__(self)
        self.item = item
        print("Starting Search for: " + str(self.item))
        self.start()

    def run(self):
        """For some reason multiprocessing was not allowing me to put the driver
        initialization outside of the run function. In threading I was able to pass
        the driver to __init__, but here I kept getting PermissionError: [WinError 5]
        Access is denied. Putting the driver initialization into the run function
        seems to have fixed the issue. No idea why."""
        options = Options()
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=20)
        self.session = self.driver.session_id
        self.driver.get(self.item)
        self.done = False
        while not self.done:
            self.search()
        self.driver.close()

    def search(self):
        while True:
            try:
                print("Scanning for: " + str(self.item))
                self.driver.find_element_by_xpath('//div[some xpath to a button]').click()
                print("sending email")
                url = self.driver.current_url
                sendMail.sendNotification(receiver_email="yourmail.com", url=url)
                break
            except SE.NoSuchElementException:
                print("Refreshing")
                self.driver.refresh()
                print(dt.datetime.now())
                self.wait.until(EC.visibility_of_element_located((By.XPATH, '//div[some other xpath]')))
        self.done = True


if __name__ == '__main__':
    url1 = "https://www.somesite.com"
    url2 = "https://www.someothersite.com"
    searchItems = [url1, url2]
    print("Starting search")
    for item in searchItems:
        print(item)
        itemSearch(item)
As a workaround I added a function that checks memory usage across all chrome.exe processes. The check runs on each loop iteration. I have set a maximum memory limit, and once that limit is reached I close the Chrome driver and call the run function again. This is actually working very well for me. Here's the new code with the function incorporated:
import time
import psutil
from multiprocessing import Process
import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions as SE
import sendMail
class itemSearch(Process):
    def __init__(self, item):
        Process.__init__(self)
        self.item = item
        print("Starting Search for: " + str(self.item))
        self.start()

    def run(self):
        """For some reason multiprocessing was not allowing me to put the driver
        initialization outside of the run function. In threading I was able to pass
        the driver to __init__, but here I kept getting PermissionError: [WinError 5]
        Access is denied. Putting the driver initialization into the run function
        seems to have fixed the issue. No idea why."""
        options = Options()
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=20)
        self.session = self.driver.session_id
        self.driver.get(self.item)
        self.done = False
        while not self.done:
            self.search()
        self.driver.close()

    def getMemoryUsage(self):
        "Return the MB of ram being used by chrome."
        process_list = []
        total_mem = 0
        for p in psutil.process_iter(['name']):
            if p.info['name'] == "chrome.exe":
                process_list.append(p.pid)
        # Calculate total memory usage
        for pid in process_list:
            try:
                #logger.info(str(pid)+" = "+str(psutil.Process(pid).memory_info().private/1000000))
                total_mem += psutil.Process(pid).memory_info().private
            except psutil.NoSuchProcess:
                #logger.info("Process "+str(pid)+" not present")
                pass
        return total_mem / 1000000

    def search(self):
        while True:
            try:
                print("Scanning for: " + str(self.item))
                self.driver.find_element_by_xpath('//div[some xpath to a button]').click()
                print("sending email")
                url = self.driver.current_url
                sendMail.sendNotification(receiver_email="yourmail.com", url=url)
                break
            except SE.NoSuchElementException:
                print("Refreshing")
                self.driver.refresh()
                print(dt.datetime.now())
                self.wait.until(EC.visibility_of_element_located((By.XPATH, '//div[some other xpath]')))
                memUsage = self.getMemoryUsage()
                print("Current Memory Usage at: " + str(memUsage) + "MB")
                if memUsage > 7000:
                    # no logger is configured in this sample, so print instead of logger.info
                    print("Memory Usage reached " + str(memUsage) + "MB. Restarting driver")
                    self.driver.quit()
                    self.run()
        self.done = True


if __name__ == '__main__':
    url1 = "https://www.somesite.com"
    url2 = "https://www.someothersite.com"
    searchItems = [url1, url2]
    print("Starting search")
    for item in searchItems:
        print(item)
        itemSearch(item)
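One detail worth flagging in both versions above (an editorial note, not part of the original post): run() ends with self.driver.close(), which only closes the current browser window, whereas driver.quit() is what actually ends the chromedriver session and its child processes. If stray chrome.exe processes are contributing to the memory picture, ending run() along these lines may help:

        self.done = False
        try:
            while not self.done:
                self.search()
        finally:
            # quit() closes all windows and shuts down the chromedriver process
            self.driver.quit()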
I am trying to scrape prices from booking.com, but without success. Any suggestions?
My code is as follows:
#Importing necessary library
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager
price = []
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-1506909%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3Bss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufis%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&city=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_month=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1')
time.sleep(5)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
for item in soup.findAll('label', {'class': "tpi_price_label tpi_price_label__orange"}):
    price.append(item.get_text(strip=True))
print(price)
The above code does not show any output; it gives an empty list.
You need to properly wait for the page to load.
This is done using WebDriverWait, which will throw an exception if the page isn't loaded within the specified timeout.
Try running my sample code below:
# test_scrape.py
import atexit
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
URL = ("https://www.booking.com/searchresults.en-gb.html?"
       "label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ"
       "&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb"
       "&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-"
       "1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff"
       "3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-150690"
       "9%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3"
       "Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcar"
       "d%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bs"
       "lp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3B"
       "ss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufi"
       "s%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&ci"
       "ty=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_m"
       "onth=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1")
class page_loaded:
    def __call__(self, driver):
        document_ready = driver.execute_script("return document.readyState;") == "complete"
        jquery_ready = driver.execute_script("return jQuery.active == 0;")
        print(f"document ready: [({type(document_ready).__name__}){document_ready}]")
        print(f"jquery ready: [({type(jquery_ready).__name__}){jquery_ready}]")
        return document_ready and jquery_ready


def wait_for_page_to_load(driver, timeout_seconds=20):
    WebDriverWait(driver, timeout_seconds, 0.2).until(page_loaded(), f"Page could not load in {timeout_seconds} s.!")


def go_to_url(driver, url):
    driver.get(url)
    wait_for_page_to_load(driver)


def get_orange_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("label.tpi_price_label.tpi_price_label__orange")]


def get_normal_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("div[class*=bui-price-display__value]")]


def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver


def main():
    driver = start_driver()
    go_to_url(driver, URL)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    orange_prices = get_orange_prices(soup)
    print(orange_prices)
    normal_prices = get_normal_prices(soup)
    print(normal_prices)


if __name__ == '__main__':
    main()
If you're having issues with the chromedriver not being discovered, try specifying the exact path to it, like this:
def start_driver():
    driver = webdriver.Chrome(executable_path="/path/to/chromedriver")
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver
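Since the question's own code already uses webdriver_manager, another option instead of hard-coding the path is to let it resolve the binary (this assumes the webdriver_manager package is installed):

from webdriver_manager.chrome import ChromeDriverManager

def start_driver():
    # webdriver_manager downloads/caches a matching chromedriver and returns its path
    driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver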
I am trying to deploy my scrapy spider on my local host through scrapyd. The spider script uses selenium to perform some automated web tasks, and the problem arises when I try to deploy it.
After running scrapyd from the command line to start the local host, I type the localhost address into my browser and it is online and listening. I then type the scrapyd-deploy command in another cmd window, and it gets stuck like this for hours (no error message):
$ scrapyd-deploy local
Packing version 1560251984
Deploying to project "crawler" in http://localhost:6800/addversion.json
I'm using Git Bash on my Windows machine, by the way; I've also tried the normal cmd, but it's the same endless wait.
In the first cmd window, where I ran the scrapyd command to start the local host, I get something like this:
DevTools listening on ws://127.0.0.1:9137/devtools/browser/07f179fa-02ce-4b31-a596-9b700654f105
To my understanding, that seems to be the selenium browser in headless mode trying to start, but it keeps waiting endlessly.
When I open my project directory, I see new folders such as eggs, project.egg-info and build. So the project does get eggified, but it hangs when it tries to deploy and run on the local host.
This is my spider script:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait as Webwait
from selenium.webdriver.support import expected_conditions as exco
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import random
C_options = Options()
C_options.add_argument("--disable-extensions")
C_options.add_argument("--disable-gpu")
C_options.add_argument("--headless")
class CloupAppSpider(scrapy.Spider):
    driver = webdriver.Chrome(options=C_options,
                              executable_path=r"C:\.....\chromedriver.exe")
    driver.get("https://scrapingclub.com/exercise/basic_login/")
    cookie = driver.get_cookies()
    driver.add_cookie(cookie[0])
    name = 'crawl'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/basic_login/']

    def __init__(self, name=None, passwd=None, *args, **kwargs):
        super(CloupAppSpider, self).__init__(*args, **kwargs)
        self.passwd = passwd
        self.name = name

    def parse(self, response):
        pword = self.passwd
        uname = self.name
        Webwait(self.driver, 10).until(exco.presence_of_element_located((By.ID, "id_name")))
        Webwait(self.driver, 10).until(exco.presence_of_element_located((By.ID, "id_password")))
        CloupAppSpider.driver.find_element_by_id("id_name").send_keys(pword)
        CloupAppSpider.driver.find_element_by_id("id_password").send_keys(uname)
        CloupAppSpider.driver.find_element_by_css_selector(".btn.btn-primary").click()
        Webwait(self.driver, 10).until(exco.presence_of_element_located((By.CLASS_NAME, "col-lg-8")))
        html = CloupAppSpider.driver.execute_script("return document.documentElement.outerHTML")
        bs_obj = BeautifulSoup(html, "html.parser")
        text = bs_obj.find("div", {"class": "col-lg-8"}).find("p")
        obj = text.get_text()
        obj = obj + str(random.randint(0, 100))
        self.driver.close()
        yield {
            'text': obj
        }
This is my scrapy.cfg content
[settings]
default = crawler.settings
[deploy:local]
url = http://localhost:6800/
project = crawler
Can someone help explain where I went wrong? I am clueless, as I didn't get any error when deploying; it just keeps waiting endlessly. My guess is that it hangs when it starts processing with selenium.
I solved it. My mistake was that I initiated the selenium process at the level of the main spider class instead of inside the parse function contained in the spider class. I re-edited the code as follows:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait as Webwait
from selenium.webdriver.support import expected_conditions as exco
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import random
C_options = Options()
C_options.add_argument("--disable-extensions")
C_options.add_argument("--disable-gpu")
C_options.add_argument("--headless")
class CloupAppSpider(scrapy.Spider):
    name = 'crawl'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/basic_login/']

    def __init__(self, name=None, passwd=None, *args, **kwargs):
        super(CloupAppSpider, self).__init__(*args, **kwargs)
        self.passwd = passwd
        self.name = name

    def parse(self, response):
        driver = webdriver.Chrome(options=C_options,
                                  executable_path=r"C:\......\chromedriver.exe")
        driver.get("https://scrapingclub.com/exercise/basic_login/")
        cookie = driver.get_cookies()
        driver.add_cookie(cookie[0])
        pword = self.passwd
        uname = self.name
        Webwait(driver, 10).until(exco.presence_of_element_located((By.ID, "id_name")))
        Webwait(driver, 10).until(exco.presence_of_element_located((By.ID, "id_password")))
        driver.find_element_by_id("id_name").send_keys(pword)
        driver.find_element_by_id("id_password").send_keys(uname)
        driver.find_element_by_css_selector(".btn.btn-primary").click()
        Webwait(driver, 10).until(exco.presence_of_element_located((By.CLASS_NAME, "col-lg-8")))
        html = driver.execute_script("return document.documentElement.outerHTML")
        bs_obj = BeautifulSoup(html, "html.parser")
        text = bs_obj.find("div", {"class": "col-lg-8"}).find("p")
        obj = text.get_text()
        obj = obj + str(random.randint(0, 100))
        driver.close()
        yield {
            'text': obj
        }
This question already has answers here: Selenium: How to stop geckodriver process impacting PC memory, without calling driver.quit()? (1 answer) and PhantomJS web driver stays in memory (1 answer). Closed 3 years ago.
I wrote some code in Python using selenium and multiprocessing to parallelise data collection from YouTube. I have a method which starts a Chrome webdriver, and I used multiprocessing to collect data faster. The issue is that when the multiprocessing timeout is reached, the function with the chromedriver exits before the driver.quit() command can run. This leads to an accumulation of idle chromedrivers which I cannot close from within Python, since (to my knowledge) there is no way to reference them. Is there any way to close all chromedrivers without explicitly using the driver objects?
I wrote the code in Python 3. The chromedriver is for Chrome version 72.
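The linked questions above cover this in more detail, but when the driver objects themselves are no longer reachable, the usual fallback is to kill the leftover driver/browser processes by name at the OS level. A minimal sketch using psutil (process names assume Windows, as in the question; this is a blunt instrument and will also kill Chrome windows belonging to other runs):

import psutil

def kill_leftover_drivers(names=("chromedriver.exe", "chrome.exe")):
    # terminate every process whose executable name matches one of the given names
    for proc in psutil.process_iter(['name']):
        if proc.info['name'] in names:
            try:
                proc.kill()
            except psutil.NoSuchProcess:
                pass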
# Web related modules
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from urllib.request import urlopen
import html2text
# YouTube download module
from pytube import YouTube
# Multiprocessing tools
from multiprocessing import Lock, Manager, Queue, Pool
import multiprocessing as mp
# Misc modules
import time, re, pickle, os, shutil, argparse, glob, unicodedata, datetime
from argparse import RawTextHelpFormatter
# Irrelevant to the problem
def save_vids(vid_ids, save_loc):
    print('Irrelevant Function')


# Function that generates the initial list of urls to visit
def explore_home(chromedriver_path, chrome_options, caps):
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options, desired_capabilities=caps)
    driver.get('https://www.youtube.com')
    time.sleep(1)
    html_source = driver.page_source
    driver.close()
    parts = html_source.split('{"webCommandMetadata":{"url":"/watch_videos?')[1:]
    vids = []
    for part in parts:
        part = part[part.find('video_ids=')+10:]
        if part.find('\\u') != -1:
            if part.find('"') != -1:
                end = min(part.find('\\u'), part.find('"'))
            else:
                end = part.find('\\u')
        elif part.find('"') != -1:
            end = part.find('"')
        else:
            print('fuck')
        concat_list = part[:end]
        vids.extend(concat_list.split('%2C'))
    vids = [vid for vid in vids if len(re.findall(r'[0-9]|[a-z]|[A-Z]|_|-', vid)) == 11 and len(vid) == 11]
    return vids
# The function that generates chromedrivers and fails to quit if a multiprocessing timeout occurs.
def explore_vid(chromedriver_path, chrome_options, caps, vid, ads, save_loc, l):
    driver = webdriver.Chrome(executable_path=chromedriver_path, options=chrome_options, desired_capabilities=caps)
    driver.get('https://www.youtube.com/watch?v=' + vid)
    time.sleep(2)
    sec_html = driver.page_source
    soup = BeautifulSoup(sec_html, 'lxml')
    mydivs = str(soup.findAll("div", {"class": "style-scope ytd-watch-next-secondary-results-renderer"}))
    inds = [m.start() for m in re.finditer('ytimg.com/vi/', mydivs)]
    rec_vids = ['https://www.youtube.com/watch?v=' + mydivs[ind+13:ind+24] for ind in inds]
    browser_log = driver.get_log('performance')
    adInfo = find_ad(browser_log, vid)
    if adInfo:
        # Check if it is the first time this ad has been seen
        adID = adInfo[0]
        l.acquire()
        try:
            if adID in ads:
                ads[adID][0].append(adInfo[1])
            else:
                try:
                    element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".ytp-ad-button.ytp-ad-visit-advertiser-button.ytp-ad-button-link")))
                    element.click()
                    driver.switch_to.window(driver.window_handles[-1])
                    ad_website_URL = driver.current_url
                    ad_website_HTML = driver.page_source
                    clean_text = html2text.html2text(ad_website_HTML)
                    save_vids(adID, save_loc)
                    textName = os.path.join(save_loc, adID, 'adwebsite.txt')
                    file = open(textName, "w")
                    file.write(ad_website_URL)
                    file.write('\n')
                    file.write(clean_text)
                    file.close()
                    ads[adID] = [[adInfo[1]], ad_website_URL]
                except WebDriverException:
                    print('Button click failed: %s:%s' % (vid, adInfo[0]))
        finally:
            l.release()
    # The quit command for the chrome driver
    driver.quit()
    return rec_vids
def find_ad(browser_log, vid):
    for k in range(len(browser_log)):
        if browser_log[k]['message'].find('adunit') != -1 and browser_log[k]['message'].find(vid) != -1:
            ind = browser_log[k]['message'].find('https://www.youtube.com/get_video_info?html5=1&video_id=')
            vid_id = browser_log[k]['message'][ind+56:ind+67]
            return (vid_id, time.localtime())
    return None


def positive_int(argument):
    num = int(argument)
    if num < 1:
        msg = "Maximum depth parameter must be a positive number. You entered: %s" % argument
        raise argparse.ArgumentTypeError(msg)
    return num


def valid_pickle(argument):
    file = str(argument)
    if not file.endswith('.pickle'):
        msg = "ad_save_loc must end with .pickle You entered: %s" % file
        raise argparse.ArgumentTypeError(msg)
    return file


def valid_dir(argument):
    directory = str(argument)
    if not os.path.isdir(directory):
        msg = "vid_save_loc must be a valid directory. You entered: %s" % directory
        raise argparse.ArgumentTypeError(msg)
    return directory
if __name__ == '__main__':
    # Argument Parsing
    parser = argparse.ArgumentParser(description='Scrapes Youtube ads and advertising company websites. \nUse --restart to restart the scraping from scratch by deleting previous data\nExample Usage: python finalReader.py E:\ads\ads.pickle E:\ads --ncpu 2', formatter_class=RawTextHelpFormatter)
    parser.add_argument('ad_save_loc', help='Save Location for Ad Main Dictionary', type=valid_pickle)
    parser.add_argument('vid_save_loc', help='Save Location for Ad Videos', type=valid_dir)
    parser.add_argument('chromedriver_path', help='Path of the chrome executable', type=str)
    parser.add_argument('--restart', help='Restart collection', action="store_true", default=False, dest='restartCollection')
    parser.add_argument('--ncpu', nargs='?', help='Number of cores for multiprocessing, 1 by default', default=1, type=int, dest='mpcpu')
    parser.add_argument('--timeout', nargs='?', help='For how long the data collection will take place (in seconds), infinite by default', default=float('inf'), type=float, dest='time_limit')
    parser.add_argument('--max_depth', nargs='?', help='Depth of Youtube exploration tree', default=1, type=positive_int, dest='search_depth')
    args = parser.parse_args()

    ad_save_loc = args.ad_save_loc
    vid_save_loc = args.vid_save_loc
    vid_save_loc = os.path.join(vid_save_loc, 'ad_data')
    mpcpu = max(args.mpcpu, 1)
    time_limit = args.time_limit
    chromedriver_path = args.chromedriver_path
    search_depth = args.search_depth

    if not os.path.isdir(vid_save_loc):
        os.mkdir(vid_save_loc)

    if args.restartCollection:
        for the_file in os.listdir(vid_save_loc):
            file_path = os.path.join(vid_save_loc, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)
        if os.path.isfile(ad_save_loc):
            os.remove(ad_save_loc)
        ads = {}
    else:
        if os.path.isfile(ad_save_loc):
            pickle_in = open(ad_save_loc, "rb")
            ads = pickle.load(pickle_in)
        else:
            ads = {}

    # Chrome Driver Options
    chrome_options = Options()
    chrome_options.add_argument('--mute-audio')
    caps = DesiredCapabilities.CHROME
    caps['loggingPrefs'] = {'performance': 'ALL'}

    startTime = time.time()
    currentTime = time.time()

    # Data Collection Loop - Multiprocessing
    while currentTime - startTime < time_limit:
        print('Time from start: %s' % str(datetime.timedelta(seconds=currentTime - startTime)))
        rec_vids = explore_home(chromedriver_path, chrome_options, caps)
        while not rec_vids:
            time.sleep(60)
            rec_vids = explore_home(chromedriver_path, chrome_options, caps)
        m = Manager()
        lock = m.Lock()
        pool = Pool(processes=mpcpu)
        for depth in range(search_depth):
            print('Depth %s' % depth)
            multiple_results = [pool.apply_async(explore_vid, (chromedriver_path, chrome_options, caps, vid, ads, vid_save_loc, lock)) for vid in rec_vids]
            branching_vids = []
            for res in multiple_results:
                try:
                    branching_vids.append(res.get(timeout=30))
                    if time.time() - startTime < time_limit:
                        break
                except mp.TimeoutError:
                    print('Timeout')
            res_vids = branching_vids.copy()
        pickle_out = open(ad_save_loc, "wb")
        pickle.dump(ads, pickle_out)
        pickle_out.close()
        currentTime = time.time()