Is selenium thread safe for scraping with Python?

Is selenium thread safe for scraping with Python? - multithreading

I am executing a Python script with Threading, where given a "query" term that I put in the Queue, I create the url with the query parameters, set the cookies & parse the webpage to return the Products & the urls of those products. Here's the script.
Task : For a given set of queries, store the top 20 product ids in a file, or lower # if the query returns fewer results.
I remember reading that Selenium is not thread safe. Just want to make sure that this problem occurs because of that limitation, and is there a way to make it work in concurrent threads ? The main problem is that the script was I/O bound, so very slow for scraping about 3000 url fetches.
from pyvirtualdisplay import Display
from data_mining.scraping import scraping_conf as sf #custom file with rules for scraping
import Queue
import threading
import urllib2
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
num_threads=5
COOKIES=sf.__MERCHANT_PARAMS[merchant_domain]['COOKIES']
query_args =sf.__MERCHANT_PARAMS[merchant_domain]['QUERY_ARGS']
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def url_from_query(self,query):
for key,val in query_args.items():
if query_args[key]=='query' :
query_args[key]=query
print "query", query
try :
url = base_url+urllib.urlencode(query_args)
print "url"
return url
except Exception as e:
log()
return None
def init_driver_and_scrape(self,base_url,query,url):
# Will use Pyvirtual display later
#display = Display(visible=0, size=(1024, 768))
#display.start()
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList",2)
fp.set_preference("javascript.enabled", True)
driver = webdriver.Firefox(firefox_profile=fp)
driver.delete_all_cookies()
driver.get(base_url)
for key,val in COOKIES[exp].items():
driver.add_cookie({'name':key,'value':val,'path':'/','domain': merchant_domain,'secure':False,'expiry':None})
print "printing cookie name & value"
for cookie in driver.get_cookies():
if cookie['name'] in COOKIES[exp].keys():
print cookie['name'],"-->", cookie['value']
driver.get(base_url+'search=junk') # To counter any refresh issues
driver.implicitly_wait(20)
driver.execute_script("window.scrollTo(0, 2000)")
print "url inside scrape", url
if url is not None :
flag = True
i=-1
row_data,row_res=(),()
while flag :
i=i+1
try :
driver.get(url)
key=sf.__MERCHANT_PARAMS[merchant_domain]['GET_ITEM_BY_ID']+str(i)
print key
item=driver.find_element_by_id(key)
href=item.get_attribute("href")
prod_id=eval(sf.__MERCHANT_PARAMS[merchant_domain]['PRODUCTID_EVAL_FUNC'])
row_res=row_res+(prod_id,)
print url,row_res
except Exception as e:
log()
flag =False
driver.delete_all_cookies()
driver.close()
return query+"|"+str(row_res)+"\n" # row_data, row_res
else :
return [query+"|"+"None"]+"\n"
def run(self):
while True:
#grabs host from queue
query = self.queue.get()
url=self.url_from_query(query)
print "query, url", query, url
data=self.init_driver_and_scrape(base_url,query,url)
self.out_queue.put(data)
#signals to queue job is done
self.queue.task_done()
class DatamineThread(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, out_queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
def run(self):
while True:
#grabs host from queue
data = self.out_queue.get()
fh.write(str(data)+"\n")
#signals to queue job is done
self.out_queue.task_done()
start = time.time()
def log():
logging_hndl=logging.getLogger("get_results_url")
logging_hndl.exception("Stacktrace from "+"get_results_url")
df=pd.read_csv(fh_query, sep='|',skiprows=0,header=0,usecols=None,error_bad_lines=False) # read all queries
query_list=list(df['query'].values)[0:3]
def main():
exp="Control"
#spawn a pool of threads, and pass them queue instance
for i in range(num_threads):
t = ThreadUrl(queue, out_queue)
t.setDaemon(True)
t.start()
#populate queue with data
print query_list
for query in query_list:
queue.put(query)
for i in range(num_threads):
dt = DatamineThread(out_queue)
dt.setDaemon(True)
dt.start()
#wait on the queue until everything has been processed
queue.join()
out_queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
While I should be getting, all search results from each url page, I get only the 1st , i=0 search card and this doesn't execute for all queries/urls. What am I doing wrong ?
What I expect -
url inside scrape http://<masked>/search=nike+costume
searchResultsItem0
url inside scrape http://<masked>/search=red+tops
searchResultsItem0
url inside scrape http://<masked>/search=halloween+costumes
searchResultsItem0
and more searchResultsItem(s) , like searchResultsItem1,searchResultsItem2 and so on..
What I get
url inside scrape http://<masked>/search=nike+costume
searchResultsItem0
url inside scrape http://<masked>/search=nike+costume
searchResultsItem0
url inside scrape http://<masked>/search=nike+costume
searchResultsItem0
The skeleton code was taken from
http://www.ibm.com/developerworks/aix/library/au-threadingpython/
Additionally when I use Pyvirtual display, will that work with Threading as well ? I also used processes with the same Selenium code, and it gave the same error. Essentially it opens up 3 Firefox browsers, with exact urls, while it should be opening them from different items in the queue. Here I stored the rules in file that will import as sf, which has all custom attributes of a Base Domain.
Since setting the cookies is an integral part of my script, I can't use dryscrape.
EDIT :
I tried to localize the error, and here's what I found -
In the custom rules file, I call "sf" above, I had defined, QUERY_ARGS as
__MERCHANT_PARAMS = {
"some_domain.com" :
{
COOKIES: { <a dict of dict, masked here>
},
... more such rules
QUERY_ARGS:{'search':'query'}
}
So what is really happening is , that on calling,
query_args =sf.__MERCHANT_PARAMS[merchant_domain]['QUERY_ARGS'] - this should return the dict
{'search':'query'}, while it returns,
AttributeError: 'module' object has no attribute
'_ThreadUrl__MERCHANT_PARAMS'
This is where I don't understand how the thread is passing '_ThreadUrl__' I also tried re-initializing the query_args,inside the url_from_query method, but this doesn't work.
Any pointers, on what am I doing wrong ?

I may be replying pretty late to this. However, I tested it python2.7 and both options multithreading and mutliprocess works with selenium and it is opening two separate browsers.

Related

How to access a variable as a parameter from within a function from another module without running the function?

I don't know how to describe it any less confusing than in the title so I hope i can make it clear through the Code. It's not all, only the parts describing what i want ot do and what i do not how in order not to bloat.
testing_lib.py has the following function in it:
def browserconfig():
if browser == 'Chrome':
options = webdriver.ChromeOptions()
options.add_argument('ignore-certificate-errors')
else:
raise Exception
logp("SSL ", "passed")
# Driver und Seitenaufruf
driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
driver.maximize_window()
return driver
def wait(driver):
WebDriverWait(driver, 180, poll_frequency=1).until(ajax_fin, "Wait timeout")
testing_base.py has a function which (among others) calls browserconfig():
import testing_lib
def start():
testing_lib.browserconfig()
and now, the last file, testing_main.py calls the start() function from testing_base.py but also the wait(driver) function from testing_lib.py which needs the driver variable from browserconfig() as a parameter in order to work.
import testing_lib
import testing_base
from testing_lib import wait
testing_base.start()
wait(driver)
My main question is: How do i get "driver" as a parameter into wait(driver) in testing_main.py without running "browserconfig()" there, since it's only supposed to run once.

web scraping data from glassdoor using selenium

please I need some help to run this code (https://github.com/PlayingNumbers/ds_salary_proj/blob/master/glassdoor_scraper.py)
In order to scrape job offer data from Glassdoor
Here's the code snippet:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
options = webdriver.ChromeOptions()
#Uncomment the line below if you'd like to scrape without a new Chrome window every time.
#options.add_argument('headless')
#Change the path to where chromedriver is in your home folder.
driver = webdriver.Chrome(executable_path=path, options=options)
driver.set_window_size(1120, 1000)
url = "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword="+'data scientist'+"&sc.keyword="+'data scientist'+"&locT=&locId=&jobType="
#url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword="' + keyword + '"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'
driver.get(url)
#Let the page load. Change this number based on your internet speed.
#Or, wait until the webpage is loaded, instead of hardcoding it.
time.sleep(5)
#Test for the "Sign Up" prompt and get rid of it.
try:
driver.find_element_by_class_name("selected").click()
except NoSuchElementException:
pass
time.sleep(.1)
try:
driver.find_element_by_css_selector('[alt="Close"]').click() #clicking to the X.
print(' x out worked')
except NoSuchElementException:
print(' x out failed')
pass
#Going through each job in this page
job_buttons = driver.find_elements_by_class_name("jl")
I'm getting an empty list
job_buttons
[]

Your problem is with wrong except argument.
With driver.find_element_by_class_name("selected").click() you are trying to click non-existing element. There is no element matching "selected" class name on that page. This causes NoSuchElementException exception as you can see yourself while you are trying to catch ElementClickInterceptedException exception.
To fix this you should use the correct locator or at least the correct argument in except.
Like this:
try:
driver.find_element_by_class_name("selected").click()
except NoSuchElementException:
pass
Or even
try:
driver.find_element_by_class_name("selected").click()
except:
pass
I'm not sure what elements do you want to get into job_buttons.
The search results containing all the details per each job can be found by this:
job_buttons = driver.find_elements_by_css_selector("li.react-job-listing")

Stuck in loop <> Code doesn't want to pull anything except row 1

I am stuck in loop, I don't know what to change to make my code work normally...
problem is with CSV file, my file contains list of domains (freedommortgage.com, google.com, amd.com etc.) so when I run code, everything is fine at start, but then it keeps sending me same results all over:
the monthly total visits to freedommortgage.com is 1.10M
So here is my line:
import csv
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib
from captcha2upload import CaptchaUpload
import time
# setting the firefox driver
def init_driver():
driver = webdriver.Firefox(executable_path=r'C:\Users\muki\Desktop\similarweb_scrapper-master\geckodriver.exe')
driver.implicitly_wait(10)
return driver
# solving the captcha (with 2captcha.com)
def captcha_solver(driver):
captcha_src = driver.find_element_by_id('recaptcha_challenge_image').get_attribute("src")
urllib.urlretrieve(captcha_src, "captcha.jpg")
captcha = CaptchaUpload("4cfd308fd703d40291a7e250d743ca84") # 2captcha API KEY
captcha_answer = captcha.solve("captcha.jpg")
wait = WebDriverWait(driver, 10)
captcha_input_box = wait.until(
EC.presence_of_element_located((By.ID, "recaptcha_response_field")))
captcha_input_box.send_keys(captcha_answer)
driver.implicitly_wait(10)
captcha_input_box.submit()
# inputting the domain in similar web search box and finding necessary values
def lookup(driver, domain, short_method):
# short method - inputting the domain in the url
if short_method:
driver.get("https://www.similarweb.com/website/" + domain)
else:
driver.get("https://www.similarweb.com")
attempt = 0
# trying 3 times before quiting (due to second refresh by the website that clears the search box)
while attempt < 1:
try:
captcha_body_page = driver.find_elements_by_class_name("block-page")
driver.implicitly_wait(10)
if captcha_body_page:
print("Captcha ahead, solving the captcha, it may take a few seconds")
captcha_solver(driver)
print("Captcha solved! the program will continue shortly")
time.sleep(20) # to prevent second refresh affecting the upcoming elements finding after captcha solved
# for normal method, inputting the domain in the searchbox instead of url
if not short_method:
input_element = driver.find_element_by_id("js-swSearch-input")
input_element.click()
input_element.send_keys(domain)
input_element.submit()
wait = WebDriverWait(driver, 10)
time.sleep(10)
total_visits = wait.until(
EC.presence_of_element_located((By.XPATH, "//span[#class='engagementInfo-valueNumber js-countValue']")))
total_visits_line = "the monthly total visits to %s is %s" % (domain, total_visits.text)
time.sleep(10)
print('\n' + total_visits_line)
except TimeoutException:
print("Box or Button or Element not found in similarweb while checking %s" % domain)
attempt += 1
print("attempt number %d... trying again" % attempt)
# main
if __name__ == "__main__":
with open('bigdomains.csv', 'rt') as f:
reader = csv.reader(f)
driver = init_driver()
for row in reader:
domain = row[0]
lookup(driver, domain, True) # user need to give as a parameter True or False, True will activate the
# short method, False will take the normal method
(Sorry for the long line of code, but I have to present everything, even tho focus is on the LAST PART of the code)
My question is simple:
Why does it keep taking row number 1 domain, and ignoring the row2 row3 row4, etc...?
Time = delay has to be 10, or more, to avoid captcha issue on this website
if anyone would try to run this, you have to edit name of csv file, and to have few domains in it in format google.com (not www.google.com) of course.

Looks like you're always accessing the same index everytime with:
domain = row[0]
Index 0 is the first item, hence why you keep getting the same value.
This post explains an alternative way to use a for loop in Python.
Accessing the index in 'for' loops?

Tornado HTTPServer that adds objects to a queue upon receiving a POST request

I want to create a web server, that automatically handles "orders" when receiving a POST request.
My code so far looks like this:
from json import loads
from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
from tornado.web import Application, url, RequestHandler
order_list = list()
class MainHandler(RequestHandler):
def get(self):
pass
def post(self):
if self.__authenticate_incoming_request():
payload = loads(self.request.body)
order_list.append(payload)
else:
pass
def __authenticate_incoming_request(self) -> bool:
# some request authentication code
return True
def start_server():
application = Application([
url(r"/", MainHandler)
])
server = HTTPServer(application)
server.listen(8080)
IOLoop.current().start()
if __name__ == '__main__':
start_server()
Here is what I want to achieve:
Receive a POST request with information about incoming "orders"
Perform an action A n-times based on a value defined in the request.body(concurrently, if possible)
Previously, to perform action A n-times, I have used a threading.ThreadPoolExecutor, but I am not sure, how I should handle this correctly with a web server running in parallel.
My idea was something like this:
start_server()
tpe = ThreadPoolExecutor(max_workers=10)
while True:
if order_list:
new_order = order_list.pop(0)
tpe.submit(my_action, new_order) # my_action is my desired function
sleep(5)
Now this piece of code is of course blocking, and I was hoping that the web server would continue running in parallel, while I am running my while-loop.
Is a setup like this possible? Do I maybe need to utilize other modules? Any help greatly appreciated!

It's not working as expected because time.sleep is a blocking function.
Instead of using a list and a while loop and sleeping to check for new items in a list, use Tornado's queues.Queue which will allow you to check for new items asynchronously.
from tornado.queues import Queue
order_queue = Queue()
tpe = ThreadPoolExecutor(max_workers=10)
async def queue_consumer():
# The old while-loop is now converted into a coroutine
# and an async for loop is used instead
async for new_order in order_queue:
# run your function in threadpool
IOLoop.current().run_in_executor(tpe, my_action, new_order)
def start_server():
# ...
# run queue_consumer function before starting the loop
IOLoop.current().spawn_callback(queue_consumer)
IOLoop.current().start()
Put items in the queue like this:
order_queue.put(payload)

Selenium: is setUpClass(), tearDownClass(), #classmethod decorator let each tests share a single browser instance?

I'm fresh man for Selenium. And here is my two test files, the first one includes 2 test cases and if run it then it opens only 1 Chrome session for both tests.
The second one includes 3 test cases but it opens 1 Chrome session for each test.
From the book, since I use #classmethod decorator for setUpClass(), tearDownClass to set them as class level, then there should be only 1 Browser session for all tests in a file. Please correct me if my understanding is wrong...
-> the first file(searchtests_with_class_methods.py)
import unittest
from selenium import webdriver
class SearchTest(unittest.TestCase):
#classmethod
def setUpClass(cls):
# create a new Chrome session
cls.driver = webdriver.Chrome()
cls.driver.implicitly_wait(30)
cls.driver.maximize_window()
# navigation to the application home page
cls.driver.get("http://demo-store.seleniumacademy.com/")
# ?don't know why need this title here
cls.driver.title
def test_search_by_category(self):
# get the search textbox
self.search_field = self.driver.find_element_by_name("q")
self.search_field.clear()
# enter search keyword and submit
self.search_field.send_keys("phones")
self.search_field.submit()
# get all the anchor elements which have product name displayed
# currently on result page using find_element_by_xpath method
products = self.driver.find_elements_by_xpath("//h2[#class='product-name']/a")
self.assertEqual(3, len(products))
def test_search_by_name(self):
# get the search textbox
self.search_field = self.driver.find_element_by_name("q")
self.search_field.clear()
# enter search keyword and submit
self.search_field.send_keys("salt shaker")
self.search_field.submit()
# get all the anchor elements which have product name displayed
# currently on result page using find_element_by_xpath method
products = self.driver.find_elements_by_xpath("//h2[#class='product-name']/a")
self.assertEqual(1, len(products))
#classmethod
def tearDownClass(cls):
# close the browser window
cls.driver.quit()
if __name__ == '__main__':
unittest.main(verbosity=2)
-> the second file(homepagetests.py)
import unittest
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from builtins import classmethod
class HomePageTest(unittest.TestCase):
#classmethod
def setUp(cls):
# create a new Chrome session
cls.driver = webdriver.Chrome()
cls.driver.implicitly_wait(30)
cls.driver.maximize_window()
# navigate to the application home page
cls.driver.get("http://demo-store.seleniumacademy.com/")
def test_search_field(self):
# check search field exists on Home page
self.assertTrue(self.is_element_present(By.NAME, "q"))
def test_language_option(self):
# check language options dropdown on Home page
self.assertTrue(self.is_element_present(By.ID, "select-language"))
def test_shopping_cart_empty_message(self):
# check content of My Shopping Cart block on Home page
shopping_cart_icon = self.driver.\
find_element_by_css_selector("div.header-minicart span.icon")
shopping_cart_icon.click()
shopping_cart_status = self.driver.\
find_element_by_css_selector("p.empty").text
self.assertEqual("You have no items in your shopping cart.",
shopping_cart_status)
close_button = self.driver.\
find_element_by_css_selector("div.minicart-wrapper a.close")
close_button.click()
#classmethod
def tearDown(cls):
# close the browser window
cls.driver.quit()
def is_element_present(self, how, what):
"""
Utility method to check presence of an element on page
:params how: By locator type
:params what: locator value
"""
try:
self.driver.find_element(by=how, value=what)
except NoSuchElementException as e:
return False
return True
if __name__ == '__main__':
unittest.main(verbosity = 2)
I'm using Python3.7.1, Selenium '3.141.0' and Chrome 72.0.3626.121 on Mac OS 10.13.6.
Confused on this behavior...could you help?

Today I just found what is the problem, actually there is a typo in second file where 'def tearDown(cls):' should be 'def tearDownClass(cls):' since I'm using #classmethod decorator. What a stupid man I am... Finally all tests are passed with only one browser session.
I didn't delete this question in case some guys meet same issue with me in the future.

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Is selenium thread safe for scraping with Python? - multithreading

I may be replying pretty late to this. However, I tested it python2.7 and both options multithreading and mutliprocess works with selenium and it is opening two separate browsers.

Related

How to access a variable as a parameter from within a function from another module without running the function?

web scraping data from glassdoor using selenium

Stuck in loop <> Code doesn't want to pull anything except row 1

Tornado HTTPServer that adds objects to a queue upon receiving a POST request

Selenium: is setUpClass(), tearDownClass(), #classmethod decorator let each tests share a single browser instance?

Categories

Resources