selenium+argparse - change browser with command line argument - python-3.x

I want to create a selenium script and use argparse to choose the browser from the command line. This is what I have - when I run test.py chrome, nothing happens.
test.py:
https://repl.it/repls/ProbableHeavenlySystem
from selenium import webdriver
import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('chrome')
    parser.add_argument('firefox')
    parser.parse_args()
    args = parser.parse_args()

def pick_browser(args):
    if args.chrome == 'chrome':
        return args.webdriver.Chrome(executable_path='C:/pathto/chromedriver.exe')
    elif args.firefox == 'firefox':
        return args.webdriver.Firefox(
            executable_path='C:/pathto/geckodriver.exe')

if __name__ == '__main__':
    main()
Thanks for your help!

You can use sys.argv:
import sys
print(sys.argv)
On the command line type python script.py chrome, and in the script file:
import sys
print(sys.argv[1]) # prints chrome
Here you can find a good tutorial.
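If you want to keep argparse instead, a minimal sketch of the browser selection could look like this (it reuses the executable paths from the question as placeholders; adjust them for your machine):

import argparse
from selenium import webdriver

def pick_browser(name):
    # Map the chosen name to a driver instance; the paths are placeholders.
    if name == 'chrome':
        return webdriver.Chrome(executable_path='C:/pathto/chromedriver.exe')
    return webdriver.Firefox(executable_path='C:/pathto/geckodriver.exe')

def main():
    parser = argparse.ArgumentParser()
    # A single positional argument limited to the supported browsers,
    # instead of one positional argument per browser.
    parser.add_argument('browser', choices=['chrome', 'firefox'])
    args = parser.parse_args()
    driver = pick_browser(args.browser)
    driver.get('https://www.python.org')
    driver.quit()

if __name__ == '__main__':
    main()

Called as python test.py chrome, argparse stores the value in args.browser and pick_browser returns the matching driver.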

Related

Python: Getting WebContentsDelegate::CheckMediaAccessPermission: Not supported. using fastAPI

Why am I getting this? How do I give permission for my server to access it?
Every time I run:
action = browser.find_element_by_xpath('/html/body/div[1]/header[1]/div/div[1]/div[2]/a[1]')
action.click()
it results in:
[1128/225708.842:ERROR:web_contents_delegate.cc(239)] WebContentsDelegate::CheckMediaAccessPermission: Not supported.
I am using uvicorn to run the server locally.
This file is the entry point for the server: main.py
from fastapi import FastAPI
import TrelloBoard

app = FastAPI()

@app.get("/test")
def read_item():
    return TrelloBoard.trello_board_main()
This file contains the Selenium logic that navigates to the Trello website: TrelloBoard.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as Options_firefox
from selenium.webdriver.chrome.options import Options as Options_chrome
from selenium.common.exceptions import NoSuchElementException
import time
from datetime import datetime
import linecache
import traceback
import os
from os.path import join, dirname
from dotenv import load_dotenv

script_config_file = "trello-boards-info"
log_file = "trello-scraping.log"
dotenv_path = join(dirname(__file__), '.env')
load_dotenv(dotenv_path)

def log(print_this):
    # print to console
    print(print_this)
    # print to file
    print(print_this, file=open(log_file, "a"))
    pass

def setup():
    global browser
    log("timestamp: %s" % datetime.now())
    # first line in file contains config for script
    settings = read_data_from_file(script_config_file, 1).split(';')
    # get driver and headless setting, make reading all lower case to avoid caps issues
    driver_setting = settings[0].lower().split('driver=')[1]
    headless_setting = settings[1].lower().split('headless=')[1]
    # Configure firefox
    if driver_setting == 'firefox':
        DRIVER_PATH = './geckodriver.exe'
        firefox_options = Options_firefox()
        if headless_setting == 'true':
            firefox_options.headless = True
        else:
            firefox_options.headless = False
        browser = webdriver.Firefox(executable_path=DRIVER_PATH, options=firefox_options)
    # Configure chrome
    elif driver_setting == "chrome":
        DRIVER_PATH = './chromedriver.exe'
        chrome_options = Options_chrome()
        if headless_setting == 'true':
            chrome_options.add_argument("--headless")
            # need to add this otherwise will occasionally get error 'element not interactable'
            chrome_options.add_argument("--window-size=1920,1080")
        else:
            chrome_options.add_argument("--None")
        browser = webdriver.Chrome(executable_path=DRIVER_PATH, options=chrome_options)
    else:
        driver_setting = "unrecognised driver"
    log("Driver = %s, Headless mode = %s" % (driver_setting, headless_setting))
    pass

def trello2():
    browser.get('https://trello.com')
    log("Go to site: %s" % browser.title)
    # read file for login info
    my_email = os.environ.get('TRELLO_BOARD_USERNAME')
    my_pass = os.environ.get('TRELLO_BOARD_PASSWORD')
    log("from file: my_email: %s, my_pass: ***" % my_email)
    # find login
    action = browser.find_element_by_xpath('/html/body/div[1]/header[1]/div/div[1]/div[2]/a[1]')
    action.click()
    browser.close()
    return "trello"

def trello_board_main():
    try:
        log("--- start program ---")
        setup()
        jsonData = trello2()
        return jsonData
    except Exception as print_error:
        # TODO: how to combine print to file and stderr?
        traceback.print_exception(type(print_error), print_error, print_error.__traceback__, file=open("trello-scraping.log", "a"))
        traceback.print_exception(type(print_error), print_error, print_error.__traceback__)
        return print_error
This file is the config for the chromedriver: trello-board-info
driver=chrome;headless=true
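The helper read_data_from_file that setup() calls is not shown in the question. Given the linecache import, a minimal sketch of what it presumably does (an assumption, not the asker's actual code) is:

import linecache

def read_data_from_file(file_name, line_number):
    # Return the given 1-indexed line from the file, stripped of the trailing newline.
    return linecache.getline(file_name, line_number).strip()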

customize argparse with -help support

I would like to offer a -help option to the user. I found a couple of examples but no concrete direction.
import argparse
import os
import sys

def parse_args(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("-upf", dest="upf", help="provide upf file full path")
    args = parser.parse_args()
    return args

def main(argv):
    args = parse_args(argv)

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
python test.py -help
error: argument -h/--help: ignored explicit argument 'elp'

how to pytest a pyqt application that uses argparse for input parameters

I have a PyQt application that uses argparse to pass some arguments.
I managed to write a simple test to see if the app starts,
but I cannot set/mock the argparse arguments.
I know this because inside the code I have some try/except blocks like this:
try:
    if args.erase_data:
        pass
except NameError:
    logger.error("Error in parsing erase_data input argument \n")
which fail during the tests but not when I run the app.
I tried this to mock args:
import os
import pathlib
# import pdb
import sys
from unittest import mock

import pytest
from PyQt5 import QtTest
from PyQt5.QtWidgets import *
from pytestqt.plugin import QtBot

sys.path.append(os.getcwd())
src_dir = pathlib.Path(__file__).parents[1].absolute()
print(src_dir)
sys.path.append(src_dir.as_posix())
GUI = __import__("GUI")

@pytest.fixture(scope="module")
def qtbot_session(qapp, request):
    result = QtBot(qapp)
    with capture_exceptions() as e:
        print(getattr(e, "message", repr(e)))
    yield result
    print(" TEARDOWN qtbot")

@pytest.fixture(scope="module")
def Viewer(request):
    with mock.patch.object(sys, "argv", ["", '-d', '2']):
        print("mocking sys argv")
        print(sys.argv)
        # pdb.set_trace()
        app, app_window = GUI.main()
        qtbotbis = QtBot(app)
        QtTest.QTest.qWait(0.5 * 1000)
        assert app_window.isVisible()
        return app, app_window, qtbotbis
but args is still not set.
Any idea how to solve it?
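No answer is shown here, but one common way to make the arguments deterministic in tests (a sketch that assumes GUI.main() calls argparse's parse_args somewhere internally; erase_data and d are just the names mentioned in the question, not the real ones) is to patch parse_args itself rather than sys.argv:

import argparse
from unittest import mock

# Hypothetical replacement namespace; the field names are assumptions.
fake_args = argparse.Namespace(erase_data=False, d=2)

with mock.patch("argparse.ArgumentParser.parse_args", return_value=fake_args):
    # GUI is the module imported in the test file above.
    app, app_window = GUI.main()

Because the method is patched on the ArgumentParser class, whatever parser GUI.main() builds will return fake_args.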

Capture the terminal output of a Python module in real time

The Python 'yfinance' module downloads quotes for many financial securities into a pandas dataframe, and while doing so it displays a progress bar in the console, like this:
import yfinance as yf
Tickerlist = ["AAPL","GOOG","MSFT"]
quote = yf.download(tickers=Tickerlist,period='max',interval='1d',group_by='ticker')
I would like to capture the console progress bar in real time, and I thought the code should be something like this:
import sys
import subprocess
process = subprocess.Popen(["yf.download","tickers=Tickerlist","period='max'","interval='1d'","group_by='ticker'"],stdout=quote)
while True:
    out = process.stdout.read(1)
    sys.stdout.write(out)
    sys.stdout.flush()
I have made a big mess with subprocess. I need your help! Thanks.
I have already seen all the links that deal with this topic, but I have not been able to solve my problem.
You need two Python files to do what you want:
one is yf_download.py and the second is run.py.
The code looks like this, and you run it through run.py:
python run.py
yf_download.py
import sys
import yfinance as yf

Tickerlist = ["AAPL", "GOOG", "MSFT"]

def run(period):
    yf.download(tickers=Tickerlist, period=period, interval='1d', group_by='ticker')

if __name__ == '__main__':
    period = sys.argv[1]
    run(period)
run.py
import sys
import subprocess

process = subprocess.Popen(["python", "yf_download.py", "max"], stdout=subprocess.PIPE)
while True:
    out = process.stdout.read(1)
    if process.poll() is not None:
        break
    if out:  # out is bytes; an empty read means no new data yet
        sys.stdout.buffer.write(out)
        sys.stdout.flush()
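Reading one byte at a time and writing through sys.stdout.buffer is deliberate here: progress bars usually redraw the same line with carriage returns rather than newlines, so iterating over process.stdout line by line would not update in real time, and with stdout=subprocess.PIPE the stream yields bytes, which is why the raw buffer is used instead of sys.stdout.write.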

How to quit selenium chrome driver when the driver is not selected? [duplicate]

This question already has answers here:
Selenium: How to stop geckodriver process impacting PC memory, without calling driver.quit()? (1 answer)
PhantomJS web driver stays in memory (1 answer)
Closed 3 years ago.
I wrote some code in python using selenium and multiprocessing to parallelize data collection. I am collecting some data from YouTube. I have a method which initiates a chrome webdriver. I used multiprocessing to collect data faster. The issue is that when the timeout for the multiprocessing is reached, the function with the chromedriver exits the function before driver.quit() command can register. This leads to the accumulation of idle chromedrivers which I cannot close within python since (to my knowledge) there is no way to reference them. Is there any way to close all chromedrivers without explicitly using the driver objects?
I wrote the code in python3. The chromedriver is Chrome version 72.
# Web related modules
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from urllib.request import urlopen
import html2text

# YouTube download module
from pytube import YouTube

# Multiprocessing tools
from multiprocessing import Lock, Manager, Queue, Pool
import multiprocessing as mp

# Misc modules
import time, re, pickle, os, shutil, argparse, glob, unicodedata, datetime
from argparse import RawTextHelpFormatter

# Irrelevant to the problem
def save_vids(vid_ids,save_loc):
    print('Irrelevant Function')

# Function that generates the initial list of urls to visit
def explore_home(chromedriver_path,chrome_options,caps):
    driver=webdriver.Chrome(executable_path=chromedriver_path,options=chrome_options,desired_capabilities=caps)
    driver.get('https://www.youtube.com')
    time.sleep(1)
    html_source = driver.page_source
    driver.close()
    parts=html_source.split('{"webCommandMetadata":{"url":"/watch_videos?')[1:]
    vids=[]
    for part in parts:
        part=part[part.find('video_ids=')+10:]
        if part.find('\\u')!=-1:
            if part.find('"')!=-1:
                end=min(part.find('\\u'),part.find('"'))
            else:
                end=part.find('\\u')
        elif part.find('"')!=-1:
            end=part.find('"')
        else:
            print('fuck')
        concat_list=part[:end]
        vids.extend(concat_list.split('%2C'))
    vids=[vid for vid in vids if len(re.findall(r'[0-9]|[a-z]|[A-Z]|_|-',vid))==11 and len(vid)==11]
    return vids

# The function that generates chromedrivers and fails to quit if a multiprocessing timeout occurs.
def explore_vid(chromedriver_path,chrome_options,caps,vid,ads,save_loc,l):
    driver=webdriver.Chrome(executable_path=chromedriver_path,options=chrome_options,desired_capabilities=caps)
    driver.get('https://www.youtube.com/watch?v='+vid)
    time.sleep(2)
    sec_html = driver.page_source
    soup=BeautifulSoup(sec_html,'lxml')
    mydivs = str(soup.findAll("div", {"class": "style-scope ytd-watch-next-secondary-results-renderer"}))
    inds=[m.start() for m in re.finditer('ytimg.com/vi/', mydivs)]
    rec_vids=['https://www.youtube.com/watch?v='+mydivs[ind+13:ind+24] for ind in inds]
    browser_log = driver.get_log('performance')
    adInfo=find_ad(browser_log,vid)
    if adInfo:
        # Check if it is the first time this ad has been seen
        adID=adInfo[0]
        l.acquire()
        try:
            if adID in ads:
                ads[adID][0].append(adInfo[1])
            else:
                try:
                    element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".ytp-ad-button.ytp-ad-visit-advertiser-button.ytp-ad-button-link")))
                    element.click()
                    driver.switch_to.window(driver.window_handles[-1])
                    ad_website_URL=driver.current_url
                    ad_website_HTML=driver.page_source
                    clean_text=html2text.html2text(ad_website_HTML)
                    save_vids(adID,save_loc)
                    textName=os.path.join(save_loc,adID,'adwebsite.txt')
                    file = open(textName,"w")
                    file.write(ad_website_URL)
                    file.write('\n')
                    file.write(clean_text)
                    file.close()
                    ads[adID]=[[adInfo[1]],ad_website_URL]
                except WebDriverException:
                    print('Button click failed: %s:%s' %(vid,adInfo[0]))
        finally:
            l.release()
    # The quit command for the chrome driver
    driver.quit()
    return rec_vids

def find_ad(browser_log,vid):
    for k in range(len(browser_log)):
        if browser_log[k]['message'].find('adunit')!=-1 and browser_log[k]['message'].find(vid)!=-1:
            ind=browser_log[k]['message'].find('https://www.youtube.com/get_video_info?html5=1&video_id=')
            vid_id=browser_log[k]['message'][ind+56:ind+67]
            return (vid_id,time.localtime())
    return None

def positive_int(argument):
    num=int(argument)
    if num<1:
        msg="Maximum depth parameter must be a positive number. You entered: %s" %argument
        raise argparse.ArgumentTypeError(msg)
    return num

def valid_pickle(argument):
    file=str(argument)
    if not file.endswith('.pickle'):
        msg="ad_save_loc must end with .pickle You entered: %s" %file
        raise argparse.ArgumentTypeError(msg)
    return file

def valid_dir(argument):
    directory=str(argument)
    if not os.path.isdir(directory):
        msg="vid_save_loc must be a valid directory. You entered: %s" %directory
        raise argparse.ArgumentTypeError(msg)
    return directory

if __name__ == '__main__':
    # Argument Parsing
    parser = argparse.ArgumentParser(description='Scrapes Youtube ads and advertising company websites. \nUse --restart to restart the scraping from scratch by deleting previous data\nExample Usage: python finalReader.py E:\ads\ads.pickle E:\ads --ncpu 2', formatter_class=RawTextHelpFormatter)
    parser.add_argument('ad_save_loc',help='Save Location for Ad Main Dictionary', type=valid_pickle)
    parser.add_argument('vid_save_loc',help='Save Location for Ad Videos', type=valid_dir)
    parser.add_argument('chromedriver_path', help='Path of the chrome executable', type=str)
    parser.add_argument('--restart', help='Restart collection', action="store_true", default=False, dest='restartCollection')
    parser.add_argument('--ncpu', nargs='?', help='Number of cores for multiprocessing, 1 by default', default=1, type=int, dest='mpcpu')
    parser.add_argument('--timeout',nargs='?', help='For how long the data collection will take place (in seconds), infinite by default', default=float('inf'), type=float, dest='time_limit')
    parser.add_argument('--max_depth', nargs='?', help='Depth of Youtube exploration tree', default=1, type=positive_int, dest='search_depth')
    args = parser.parse_args()

    ad_save_loc=args.ad_save_loc
    vid_save_loc=args.vid_save_loc
    vid_save_loc=os.path.join(vid_save_loc,'ad_data')
    mpcpu=max(args.mpcpu,1)
    time_limit=args.time_limit
    chromedriver_path=args.chromedriver_path
    search_depth=args.search_depth

    if not os.path.isdir(vid_save_loc):
        os.mkdir(vid_save_loc)

    if args.restartCollection:
        for the_file in os.listdir(vid_save_loc):
            file_path = os.path.join(vid_save_loc, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)
        if os.path.isfile(ad_save_loc):
            os.remove(ad_save_loc)
        ads={}
    else:
        if os.path.isfile(ad_save_loc):
            pickle_in = open(ad_save_loc,"rb")
            ads = pickle.load(pickle_in)
        else:
            ads={}

    # Chrome Driver Options
    chrome_options=Options()
    chrome_options.add_argument('--mute-audio')
    caps = DesiredCapabilities.CHROME
    caps['loggingPrefs'] = {'performance': 'ALL'}

    startTime=time.time()
    currentTime=time.time()

    # Data Collection Loop - Multiprocessing
    while currentTime-startTime<time_limit:
        print('Time from start: %s' %str(datetime.timedelta(seconds=currentTime-startTime)))
        rec_vids=explore_home(chromedriver_path,chrome_options,caps)
        while not rec_vids:
            time.sleep(60)
            rec_vids=explore_home(chromedriver_path,chrome_options,caps)
        m = Manager()
        lock = m.Lock()
        pool = Pool(processes=mpcpu)
        for depth in range(search_depth):
            print('Depth %s' %depth)
            multiple_results=[pool.apply_async(explore_vid, (chromedriver_path,chrome_options,caps,vid,ads,vid_save_loc,lock)) for vid in rec_vids]
            branching_vids=[]
            for res in multiple_results:
                try:
                    branching_vids.append(res.get(timeout=30))
                    if time.time()-startTime<time_limit:
                        break
                except mp.TimeoutError:
                    print('Timeout')
            res_vids=branching_vids.copy()
        pickle_out = open(ad_save_loc,"wb")
        pickle.dump(ads, pickle_out)
        pickle_out.close()
        currentTime=time.time()
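The question was closed as a duplicate of the linked answers, but a common blunt workaround (a sketch that assumes the psutil package is installed; it is not code from the question or from the duplicates) is to kill any leftover chromedriver processes by name once the pool finishes:

import psutil

def kill_leftover_chromedrivers():
    # Terminate every chromedriver process still running on the machine.
    # Note this is indiscriminate: it also kills drivers started by other scripts.
    for proc in psutil.process_iter(['name']):
        name = proc.info.get('name') or ''
        if 'chromedriver' in name.lower():
            try:
                proc.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass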
