Why am I getting this? How do I give permission for my server to access it?
Every time I run:
action = browser.find_element_by_xpath('/html/body/div[1]/header[1]/div/div[1]/div[2]/a[1]')
action.click()
This results in: [1128/225708.842:ERROR:web_contents_delegate.cc(239)] WebContentsDelegate::CheckMediaAccessPermission: Not supported.
I am using uvicorn to run the server locally.
This file is the entry point for the server: main.py
from fastapi import FastAPI
import TrelloBoard
app = FastAPI()
@app.get("/test")
def read_item():
    """Handle ``GET /test`` by running the Trello scraping job.

    Returns:
        Whatever ``TrelloBoard.trello_board_main()`` returns.
    """
    # The route decorator above had been reduced to a plain comment
    # ("#app.get"), which left the application without any registered route.
    return TrelloBoard.trello_board_main()
This file contains the logic that runs Selenium to open the Trello website: TrelloBoard.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as Options_firefox
from selenium.webdriver.chrome.options import Options as Options_chrome
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime
import linecache
import traceback
import os
from os.path import join, dirname
from dotenv import load_dotenv
# Name of the file whose first line holds the script settings (read by setup()).
script_config_file = "trello-boards-info"
# File that log() appends every message to.
log_file = "trello-scraping.log"
# Load environment variables from the .env file located next to this module.
dotenv_path = join(dirname(__file__), '.env')
load_dotenv(dotenv_path)
def log(print_this):
    """Write a message to the console and append it to the log file.

    Args:
        print_this: The object to output; it is formatted exactly the way
            ``print`` formats it.
    """
    # print to console
    print(print_this)
    # print to file; the context manager closes the handle, which the
    # previous one-liner leaked on every call
    with open(log_file, "a") as log_handle:
        print(print_this, file=log_handle)
def setup():
    """Create the global Selenium ``browser`` described by the config file.

    The first line of ``script_config_file`` is expected to look like
    ``driver=chrome;headless=true``.

    Raises:
        ValueError: If the configured driver is neither ``firefox`` nor
            ``chrome``. Previously this case only logged a message and left
            ``browser`` unset (or pointing at a stale instance).
    """
    global browser
    log("timestamp: %s" % datetime.now())
    # first line in file contains config for script
    settings = read_data_from_file(script_config_file, 1).split(';')
    # get driver and headless setting, make reading all lower case to avoid caps issues.
    # strip() guards against a trailing newline/space on the config line, which
    # would otherwise make the comparison with 'true' fail silently.
    driver_setting = settings[0].lower().split('driver=')[1].strip()
    headless_setting = settings[1].lower().split('headless=')[1].strip()
    headless = headless_setting == 'true'

    if driver_setting == 'firefox':
        # Configure firefox
        firefox_options = Options_firefox()
        firefox_options.headless = headless
        browser = webdriver.Firefox(executable_path='./geckodriver.exe', options=firefox_options)
    elif driver_setting == 'chrome':
        # Configure chrome
        chrome_options = Options_chrome()
        if headless:
            chrome_options.add_argument("--headless")
            # need to add this otherwise will occasionally get error 'element not interactable'
            chrome_options.add_argument("--window-size=1920,1080")
        browser = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
    else:
        log("Driver = %s, Headless mode = %s" % ("unrecognised driver", headless_setting))
        raise ValueError("unrecognised driver: %r" % driver_setting)

    log("Driver = %s, Headless mode = %s" % (driver_setting, headless_setting))
def trello2():
    """Open trello.com with the global ``browser`` and click the log-in link.

    The browser session is always shut down before returning, even when a
    step fails.

    Returns:
        The literal string ``"trello"``.
    """
    try:
        browser.get('https://trello.com')
        log("Go to site: %s" % browser.title)
        # read login info from the environment
        my_email = os.environ.get('TRELLO_BOARD_USERNAME')
        my_pass = os.environ.get('TRELLO_BOARD_PASSWORD')  # not used yet in this function
        log("from file: my_email: %s, my_pass: ***" % my_email)
        # find login
        action = browser.find_element_by_xpath('/html/body/div[1]/header[1]/div/div[1]/div[2]/a[1]')
        action.click()
    finally:
        # quit() ends the whole WebDriver session; close() only closed the
        # current window and was skipped entirely when an exception occurred.
        browser.quit()
    return "trello"
def trello_board_main():
    """Run the scraping job end to end.

    Returns:
        The value returned by ``trello2()`` on success. On failure, the caught
        exception object, after its traceback has been appended to
        ``trello-scraping.log`` and printed to stderr.
    """
    try:
        log("--- start program ---")
        setup()
        return trello2()
    except Exception as print_error:
        # Top-level boundary: record the failure in the log file and on
        # stderr. The file is opened in a with-block so the handle is closed.
        with open("trello-scraping.log", "a") as log_handle:
            traceback.print_exception(
                type(print_error), print_error, print_error.__traceback__, file=log_handle
            )
        traceback.print_exception(type(print_error), print_error, print_error.__traceback__)
        return print_error
This file is the config for the chromedriver: trello-boards-info
driver=chrome;headless=true
Related
I am trying to write a web scraping job on AWS Lambda (Python), and I get this error when I execute it.
Error
Message: unknown error: cannot find Chrome binary
How am I running:
I downloaded chromedriver from the website below and zipped it together with the Python code below. Please let me know whether this approach is correct or whether I need to make any modifications to my code.
https://chromedriver.storage.googleapis.com/index.html?path=111.0.5563.19/
import concurrent.futures
import requests
from selenium import webdriver
import os
import subprocess
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import boto3
from datetime import datetime
def _granule_metadata_url(pdf_href):
    """Turn a govinfo PDF link into the URL of its MODS metadata file."""
    path = pdf_href.replace(".pdf", "/mods.xml")
    path = path.replace("/pdf", "")
    path = path.replace("/pkg/", "/")
    path = path.replace("/content", "")
    return "https://www.govinfo.gov/metadata/granule" + path


def scrape_data():
    """Collect MODS metadata URLs for bankruptcy court filings on govinfo.gov.

    Every configured state/year collection page is opened in headless Chrome,
    and each PDF link found on it is converted to its metadata URL.

    Returns:
        A list of metadata URL strings. If an error occurs it is printed and
        the URLs collected so far (possibly none) are returned.
    """
    years = [2023]
    states = ["alnb"]
    data = []
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    # NOTE(review): chromedriver alone is not enough; a Chrome/Chromium binary
    # must also be available in the runtime (e.g. via options.binary_location).
    # Its absence is what "cannot find Chrome binary" reports - confirm the
    # packaging for the target environment.
    try:
        for state in states:
            for year in years:
                url = 'https://www.govinfo.gov/app/collection/uscourts/bankruptcy/'+state+'/'+str(year)+'/%7B%22pageSize%22%3A%22100%22%2C%22offset%22%3A%220%22%7D'
                driver = webdriver.Chrome(executable_path='./chromedriver', options=options)
                try:
                    driver.get(url)
                    WebDriverWait(driver, 2).until(
                        EC.presence_of_all_elements_located((By.CLASS_NAME, "panel-body"))
                    )
                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                finally:
                    # always release the browser, otherwise every run leaks a Chrome process
                    driver.quit()
                # The original dict literal repeated the "class" key five
                # times; only the last value ever took effect, so it is the
                # only one kept here.
                bankruptcy_element = soup.findAll('div', {"class": "panel-collapse collapse in"})
                print("scraping data for state "+state.capitalize() +" for "+str(year).capitalize())
                for panel in bankruptcy_element:
                    for link in panel.findAll('a', href=True):
                        if "pdf" in link['href']:
                            data.append(_granule_metadata_url(link['href']))
    except Exception as e:
        # best effort: report the problem and hand back what was collected
        print(e)
    return data
def lambda_handler(event, context):
    """AWS Lambda entry point: scrape the metadata URLs and store them in S3.

    Args:
        event: Lambda invocation event (not used).
        context: Lambda runtime context (not used).
    """
    s3 = boto3.client('s3')
    today_date = datetime.today().strftime('%Y-%m-%d')
    # put_object expects bytes, str or a file-like object as Body, not a
    # list; scrape_data() may also yield None, so fall back to an empty list.
    urls = scrape_data() or []
    body = "\n".join(urls).encode("utf-8")
    s3.put_object(Bucket='w-zone', Key='Banktcy/'+today_date+"/xmlfiles.txt", Body=body)
#
I am executing the following code to connect to Hacker News:
def connect_to_base(browser, page_number):
    """Load a Hacker News listing page, trying at most three times.

    Args:
        browser: WebDriver instance used to fetch the page.
        page_number: Number of the listing page to open.

    Returns:
        True as soon as the element with id ``hnmain`` is present,
        False after three failed attempts.
    """
    base_url = f"https://news.ycombinator.com/news?p={page_number}"
    for attempt in range(1, 4):
        try:
            browser.get(base_url)
            # the page counts as loaded once the table with id 'hnmain' exists
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "hnmain"))
            )
        except Exception as error:
            print(error)
            print(f"Error connecting to {base_url}.")
            print(f"Attempt #{attempt}.")
        else:
            return True
    return False
When I run the above script, I am getting the following error:
NameError: name 'hnmain' is not defined
I'm having trouble figuring out why it says 'hnmain' is undefined, because 'hnmain' is just the ID of a web element on the website.
Do I need to parse it another way?
Here is the complete code for you to try; I have executed it on Firefox.
I have also made a few changes to your code snippet.
import os
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
def connect_to_base(browser, page_number):
    """Open a Hacker News listing page, retrying until it loads.

    Args:
        browser: WebDriver instance used to fetch the page.
        page_number: Number of the listing page to open.

    Returns:
        True once the element with id ``hnmain`` is found, False when three
        attempts in a row have failed.
    """
    base_url = 'https://news.ycombinator.com/news?p={}'.format(page_number)
    failures = 0
    connected = False
    while not connected and failures < 3:
        try:
            browser.get(base_url)
            # presence of the 'hnmain' table is the signal that the page loaded
            WebDriverWait(browser, 5).until(
                EC.presence_of_element_located((By.ID, "hnmain"))
            )
            connected = True
        except Exception as err:
            print(err)
            failures += 1
            print("Error connecting to {}".format(base_url))
            print("Attempt #{}".format(failures))
    return connected
def GetDriver():
    """Build a visible (non-headless) Firefox WebDriver.

    The 32-bit install location is used when it exists, otherwise the
    default ``Program Files`` location. The profile suppresses download
    prompts for ``application/json``.

    Returns:
        The configured ``webdriver.Firefox`` instance.
    """
    x86_install = 'C:/Program Files (x86)/Mozilla Firefox/firefox.exe'
    default_install = 'C:/Program Files/Mozilla Firefox/firefox.exe'
    firefox_exe = x86_install if os.path.exists(x86_install) else default_install
    binary = FirefoxBinary(firefox_exe)

    options = Options()
    options.headless = False

    profile = webdriver.FirefoxProfile()
    preferences = (
        ("browser.download.manager.showWhenStarting", False),
        ("browser.download.manager.showAlertOnComplete", False),
        ("browser.helperApps.neverAsk.openFile", "application/json"),
        ("browser.helperApps.neverAsk.saveToDisk", "application/json"),
    )
    for pref_name, pref_value in preferences:
        profile.set_preference(pref_name, pref_value)

    return webdriver.Firefox(
        firefox_binary=binary,
        options=options,
        service_log_path='c:/logs/gecko.log',
        firefox_profile=profile,
    )
def launch_browser():
    """Start Firefox through GetDriver() and open Hacker News page 1."""
    driver = GetDriver()
    connect_to_base(driver, 1)
# Entry point: start the browser and load the first page. There is no
# __main__ guard, so this also runs when the module is imported.
launch_browser()
This code is working fine and giving you the expected results.
I was hoping someone could take a look at my code and explain why I am seeing a runaway memory issue in chrome.exe processes. When I run the program everything seems stable for a few hours, but after around 8 hours a single chrome.exe process consumes around 5 GB of memory. The application is fairly simple: for each item that I want to search, a new process is created. Inside that process I create a single driver instance and then search for an element. If the element isn't present, I refresh the driver and continue searching. Here is a generic sample of my code.
import time
from multiprocessing import Process
import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions as SE
import sendMail
class itemSearch(Process):
    """Worker process that polls one URL until a target button can be clicked.

    The process starts itself from the constructor. Once the button is found
    it is clicked, a notification mail is sent and the worker finishes.
    """

    def __init__(self, item):
        """Store the URL to watch and start the worker process.

        Args:
            item: URL of the page to poll.
        """
        Process.__init__(self)
        self.item = item
        # Must exist before run() reads it: it was previously assigned only
        # at the end of search(), so `while not self.done` raised
        # AttributeError on the first check.
        self.done = False
        print("Starting Search for: "+str(self.item))
        self.start()

    def run(self):
        """Create the Chrome driver inside the child process and poll until done.

        The driver is created here rather than in __init__ because, with
        multiprocessing, creating it outside run() kept failing with
        "PermissionError: [WinError 5] Access is denied".
        """
        options = Options()
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=20)
        self.session = self.driver.session_id
        try:
            self.driver.get(self.item)
            while not self.done:
                self.search()
        finally:
            # quit() ends the whole session; close() only closed the window
            # and was never reached when an exception escaped the loop.
            self.driver.quit()

    def search(self):
        """Click the target button, refreshing the page until it exists.

        Sets ``self.done`` to True after the button was clicked and the
        notification was sent.
        """
        while True:
            try:
                print("Scanning for: "+str(self.item))
                self.driver.find_element_by_xpath('//div[some xpath to a button]').click()
                print("sending email")
                url = self.driver.current_url
                sendMail.sendNotification(receiver_email="yourmail.com", url=url)
                break
            except SE.NoSuchElementException:
                print("Refreshing")
                self.driver.refresh()
                print(dt.datetime.now())
                self.wait.until(EC.visibility_of_element_located((By.XPATH,'//div[some other xpath]')))
        self.done = True
if __name__ == '__main__':
    url1 = "https://www.somesite.com"
    # the closing quote was missing here, which made the file a SyntaxError
    url2 = "https://www.someothersite.com"
    searchItems = [url1, url2]
    print("Starting search")
    for item in searchItems:
        print(item)
        # each itemSearch starts its own process from its constructor
        itemSearch(item)
As a workaround I added a function that checks the memory usage of all chrome.exe processes. The check is run on each loop iteration. I have set a maximum memory limit, and once that limit is reached I close the Chrome driver and call the run function again. This is actually working very well for me. Here's the new code with the function incorporated:
import time
import psutil
from multiprocessing import Process
import datetime as dt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions as SE
import sendMail
class itemSearch(Process):
    """Worker process that polls one URL and restarts Chrome when it bloats.

    The process starts itself from the constructor. While polling, the total
    memory of all chrome.exe processes is checked on every refresh; above
    ``MAX_CHROME_MEMORY_MB`` the driver is replaced by a fresh one.
    """

    # Total chrome.exe memory (in MB) above which the driver is restarted.
    MAX_CHROME_MEMORY_MB = 7000

    def __init__(self, item):
        """Store the URL to watch and start the worker process.

        Args:
            item: URL of the page to poll.
        """
        Process.__init__(self)
        self.item = item
        # Must exist before run() reads it; otherwise `while not self.done`
        # raises AttributeError on the first check.
        self.done = False
        print("Starting Search for: "+str(self.item))
        self.start()

    def _start_driver(self):
        """Create a fresh Chrome driver and load the watched URL."""
        options = Options()
        options.add_experimental_option("detach", True)
        self.driver = webdriver.Chrome(options=options)
        self.wait = WebDriverWait(self.driver, timeout=20)
        self.session = self.driver.session_id
        self.driver.get(self.item)

    def run(self):
        """Create the driver inside the child process and poll until done.

        The driver is created here rather than in __init__ because, with
        multiprocessing, creating it outside run() kept failing with
        "PermissionError: [WinError 5] Access is denied".
        """
        self._start_driver()
        try:
            while not self.done:
                self.search()
        finally:
            # quit() ends the whole session, not just the current window
            self.driver.quit()

    def getMemoryUsage(self):
        "Return the MB of ram being used by chrome."
        total_mem = 0
        for proc in psutil.process_iter(['name']):
            if proc.info['name'] != "chrome.exe":
                continue
            try:
                total_mem += proc.memory_info().private
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # the process ended or is not ours to inspect; skip it
                pass
        return total_mem/1000000

    def search(self):
        """Click the target button, refreshing the page until it exists.

        After every refresh the Chrome memory usage is checked and the driver
        is restarted in place when the limit is exceeded. Sets ``self.done``
        to True after the button was clicked and the notification was sent.
        """
        while True:
            try:
                print("Scanning for: "+str(self.item))
                self.driver.find_element_by_xpath('//div[some xpath to a button]').click()
                print("sending email")
                url = self.driver.current_url
                sendMail.sendNotification(receiver_email="yourmail.com", url=url)
                break
            except SE.NoSuchElementException:
                print("Refreshing")
                self.driver.refresh()
                print(dt.datetime.now())
                self.wait.until(EC.visibility_of_element_located((By.XPATH,'//div[some other xpath]')))
            memUsage = self.getMemoryUsage()
            print("Current Memory Usage at: "+str(memUsage)+"MB")
            if memUsage > self.MAX_CHROME_MEMORY_MB:
                # `logger` was never defined in this script, so report with
                # print like the rest of the class.
                print("Memory Usage reached " +str(memUsage) +"MB. Restarting driver")
                # Restart in place. Calling self.run() from here nested a new
                # polling loop inside this one and, once it finished, left
                # this loop working on a driver that was already closed.
                self.driver.quit()
                self._start_driver()
        self.done = True
if __name__ == '__main__':
    url1 = "https://www.somesite.com"
    # the closing quote was missing here, which made the file a SyntaxError
    url2 = "https://www.someothersite.com"
    searchItems = [url1, url2]
    print("Starting search")
    for item in searchItems:
        print(item)
        # each itemSearch starts its own process from its constructor
        itemSearch(item)
I'm trying to test out chromedriver and headless chrome on this site.
https://car.gocompare.com/vehicle
However, when I try with normal Chrome it works fine: I get a response for the car registration I put in.
When I use headless Chrome, it says the car cannot be found.
Does anyone know what could be wrong? Is it the driver, or is the website not returning the results? It seems to work with Firefox, so it's a little strange.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
# Ability to run headless
from selenium.webdriver.firefox.options import Options as f_Options
from selenium.webdriver.chrome.options import Options as c_Options
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
# This allows you to download the page
from parsel import Selector
import time
import datetime
import os
class headlessbypass:
    """Drive the GoCompare new-customer page in a regular or headless browser."""

    # Timestamp taken once, when the class body is executed.
    my_date_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

    def my_set_up(self):
        """Executed before running, i.e. opening browser"""
        # This is required for running on the pipeline
        headless = os.getenv('HEADLESS_MODE')

        def firefox_headless_func():
            self.options = f_Options()
            self.options.headless = True
            binary = FirefoxBinary('c:/Users/Anish/AppData/Local/Mozilla Firefox/firefox.exe')
            self.driver = webdriver.Firefox(firefox_binary=binary, executable_path='bin/geckodriver.exe', options=self.options)

        def chrome_headless_func():
            self.options = c_Options()
            # The value must not contain a space ("1920, 1080" is not a valid
            # size for this switch).
            self.options.add_argument("--window-size=1920,1080")
            self.options.add_argument('--headless')
            self.options.add_argument('--disable-gpu')
            self.options.add_argument("--enable-javascript")
            # No quotes around the value: the argument is handed to Chrome
            # as-is (no shell strips them), so the quotes would end up inside
            # the User-Agent string.
            self.options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) Gecko/20100101 Firefox/72.0")
            self.driver = webdriver.Chrome(options=self.options, executable_path='bin/chromedriver')

        # This is for running locally; select/toggle what you want to run
        headless_firefox = 0
        headless_chrome = 0
        chrome = 1
        safari = 0  # not consulted below

        # NOTE(review): any non-empty HEADLESS_MODE value (even "0" or
        # "false") selects headless Firefox - confirm that is intended.
        if headless:
            firefox_headless_func()
        else:
            if headless_firefox:
                firefox_headless_func()
            elif headless_chrome:
                chrome_headless_func()
            elif chrome:
                self.driver = webdriver.Chrome(executable_path='bin/chromedriver.exe')
            else:
                self.driver = webdriver.Firefox(executable_path='bin/geckodriver.exe')

        self.driver.implicitly_wait(30)
        self.driver.maximize_window()
        main_window = self.driver.current_window_handle
        self.driver.switch_to.window(main_window)

    def my_tear_down(self):
        """Executed after running, i.e. closing browser"""
        self.driver.quit()

    def my_decorator(func):
        """my_set_up and my_tear_down decorator, so that my_set_up is run before and my_tear_down is run after"""
        import functools

        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            self.my_set_up()
            try:
                return func(self, *args, **kwargs)
            finally:
                # close the browser even when the wrapped method raises
                self.my_tear_down()
        return wrapper

    def _save_top_screenshot(self, path):
        """Scroll to the top of the page and save a screenshot to ``path``."""
        print("Take screenshot")
        html = self.driver.find_element(By.TAG_NAME, 'html')
        html.send_keys(Keys.PAGE_UP)
        self.driver.save_screenshot(path)

    # The decorator line had been reduced to a comment ("#my_decorator"), so
    # the browser was never opened and self.driver did not exist here.
    @my_decorator
    def visit_site(self):
        """Enter a registration number, press 'Find car' and take screenshots."""
        self.driver.get("https://mygocompare.gocompare.com/newcustomer/")
        time.sleep(2)
        print(self.driver.page_source)
        # Enter registration number
        reg_field = self.driver.find_element(By.XPATH, "//fieldset[1]/div[2]/div[2]/div/input")
        reg_field.send_keys("AK47")
        time.sleep(5)
        self._save_top_screenshot("csv_json_files/firstpagescreenshot.png")
        self.driver.find_element(By.XPATH, "//span[contains(text(), 'Find car')]").click()
        time.sleep(2)
        self._save_top_screenshot("csv_json_files/firstpagescreenshot2.png")
if __name__ == '__main__':
    # Note the start of the run (the value is not read again in this script).
    start_time = time.time()
    # Build the scraper, then run the single scenario it offers.
    scrape = headlessbypass()
    scrape.visit_site()
I have been developing a web page and I am now stuck: I am told the syntax is wrong, but I cannot find the error. Can someone please help me find it? Here is the database code that is giving me the error:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from models import *
import os # File I/O
import time
import shutil
import glob
import configparser
# `datetime.datetime.now()` is used below but the module was never imported.
import datetime

# Settings are read from config.ini in the working directory.
config_parser = configparser.ConfigParser()
config_parser.read("config.ini")
# Directory that holds the downloaded Prism HTML reports.
pdownload_dir = os.path.abspath('./prism_downloads/')
dt = str(datetime.datetime.now())
# This used `download_dir`, a name that is not defined anywhere in view;
# the directory variable defined above is `pdownload_dir`.
filelist = glob.glob(pdownload_dir + '/*.html')
dbpath = './db'
def db_Prism():
    """Create the Prism table and load every downloaded HTML report into it.

    Each file under ``pdownload_dir`` is parsed with BeautifulSoup; every
    row of its first table that has at least six cells becomes one ``Prism``
    record. Empty files and files without a table are skipped.
    """
    database.connect()
    database.create_tables([Prism], safe=False)
    database.close()

    for root, _dirs, files in os.walk(pdownload_dir):
        for file_name in files:
            print(file_name)
            # join with `root` (not the top directory) so that files found in
            # sub-directories resolve to their real location
            with open(os.path.join(root, file_name), 'r') as html:
                file_markup = html.read()
            if file_markup == '':
                print('ERROR: File was not read')
                continue
            print('Reading {0} into BS4'.format(file_name))
            soup = BeautifulSoup(file_markup, 'html.parser')
            print('File parsed')
            table = soup.find('table')
            if table is None:
                # nothing to import from this file
                continue
            rows = table.find_all('tr')
            database.connect()
            try:
                for row in rows:
                    d = [ele.text.strip() for ele in row.find_all('td')]
                    if len(d) < 6:
                        # rows without enough <td> cells (e.g. header rows
                        # built from <th>) cannot be mapped to a record
                        continue
                    # NOTE(review): float() raises ValueError if a header row
                    # made of <td> cells reaches this point - confirm the
                    # report layout.
                    # The closing parenthesis of this call was missing, which
                    # caused the SyntaxError reported on the following line.
                    Prism.create(
                        pmt_id=d[1],
                        old_status=d[3],
                        new_status=d[4],
                        last_updated=float(d[5]),
                    )
            finally:
                database.close()
Now here is the error message from my console:
C:\Users\Documents\NetBeansProjects\BudgetHome>python prism.py
File "prism.py", line 96
database.close()
^
SyntaxError: invalid syntax
C:\Users\Documents\NetBeansProjects\BudgetHome>