Unexpected Multithreading Output when Web Scraping with Selenium (Python) - python-3.x

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time

# Current time is: 48.77885s per page, 4.4344s per region
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

def list_to_csv(summoner_info):
    summoner_info = set([tuple(summoner) for summoner in summoner_info])
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as f:
        for summoner in summoner_info:
            f.write(f"{summoner[0]},{summoner[1]},{summoner[2]}\n")

def gather_summoner_info(url):
    driver.get(url)
    driver.implicitly_wait(5)  # Wait until the CSS selector is available
    summoner_info = []
    content = driver.find_elements(By.CLASS_NAME, 'rt-tr')
    for index, con in enumerate(content):
        if index != 0:
            summoner = con.text.split('\n')
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(' ')[0].replace(',', ''))])
        else:
            pass
    return summoner_info

def get_summoner_data(page_count, regions):
    links = [f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}'
             for page in range(page_count) for region in regions]
    # Gather all the relevant summoner information on the page
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {url: executor.submit(gather_summoner_info, url) for url in links}
        for url, future in future_results.items():
            #print(future.result())
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)

def main():
    page_count = 1
    regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
    get_summoner_data(page_count, regions)

if __name__ == '__main__':
    s = time.perf_counter()
    main()
    e = time.perf_counter()
    print(e - s)
Issue: the code returns the same output for each iteration (the first link in the links list).
The code above pulls some information from the links variable using Selenium. The issue is that when the threads execute in the get_summoner_data() function, they return the same results every time. I'm not sure where the problem is coming from, since a different link prints from each gather_summoner_info() call.
Currently it just returns the information from the very first link. I'm not sure what is causing this; any help is appreciated.

Approach
Try running without the --headless option. You will see what's going on.
Problem
You created only one WebDriver instance, and that single instance is used by all the threaded tasks. Multiple threads try to load different URLs on this one driver, so it is very likely that the URL requested last is the one that ends up loaded every time a task reads the page.
Fix
The simple fix is to create a driver instance for every thread.
You can do this by moving the line that creates the web driver into the thread task function gather_summoner_info, as below. I tried this fix and it works correctly.
def gather_summoner_info(url):
    ##### moved ######
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    ##################
    driver.get(url)
    driver.implicitly_wait(5)  # Wait until the CSS selector is available
    summoner_info = []
    content = driver.find_elements(By.CLASS_NAME, "rt-tr")
    for index, con in enumerate(content):
        if index != 0:
            summoner = con.text.split("\n")
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        else:
            pass
    return summoner_info
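Creating a fresh driver for every URL works, but it is wasteful once a worker thread handles several URLs. As a possible refinement (not part of the original answer), a threading.local object can hold one driver per worker thread, so each thread builds its driver once and reuses it; get_thread_driver is an illustrative helper name.

import threading

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

_thread_local = threading.local()  # one slot per worker thread

def get_thread_driver():
    # Lazily create one headless Chrome driver per worker thread and cache it,
    # so every later call from the same thread reuses the same driver instance.
    driver = getattr(_thread_local, "driver", None)
    if driver is None:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        _thread_local.driver = driver
    return driver

gather_summoner_info would then call get_thread_driver() instead of constructing a new driver on every call; the per-thread drivers still need to be quit once the executor has finished.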
Further Consideration
As you know, creating a new web driver instance is resource-expensive. If you are just trying to scrape information, plain HTTP requests are enough most of the time.
For the website you are trying to scrape, I found that the job can be done with HTTP requests only. I revised the script without Selenium, and it takes less than 1 second to load the leaderboards for all regions.
import json
import time
from concurrent.futures import ThreadPoolExecutor

import requests

def list_to_csv(summoner_info):
    summoner_info = sorted(summoner_info, key=lambda x: int(x[2]), reverse=True)
    with open("result.csv", "w", encoding="utf-8") as f:
        f.write("\n".join([",".join(item) for item in summoner_info]))

def gather_summoner_info(region: str):
    payload = json.dumps(
        {
            "operationName": "getRankedLeaderboard",
            "variables": {"page": 1, "queueType": 420, "regionId": region},
            "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
        }
    )
    headers = {"Content-Type": "application/json"}
    response = requests.post("https://u.gg/api", headers=headers, data=payload)
    summoner_info = []
    data = response.json()
    for player in data["data"]["leaderboardPage"]["players"]:
        summoner_info.append((player["summonerName"], player["tier"], player["lp"]))
    return summoner_info

def get_summoner_data(page_count, regions):
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        future_results = {r: executor.submit(gather_summoner_info, r) for r in regions}
        for _, future in future_results.items():
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)

def main():
    page_count = 1
    regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
    get_summoner_data(page_count, regions)

if __name__ == "__main__":
    s = time.perf_counter()
    main()
    e = time.perf_counter()
    print(e - s)
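One detail about the script above: the payload hard-codes "page": 1, so the page_count argument is accepted but never used. If later pages are needed, a possible extension (untested; it assumes the endpoint honors other values of the $page variable, which the query suggests) is to submit one task per (region, page) pair:

def gather_summoner_page(region: str, page: int):
    # Same request as gather_summoner_info above, but with the page number passed in.
    payload = json.dumps(
        {
            "operationName": "getRankedLeaderboard",
            "variables": {"page": page, "queueType": 420, "regionId": region},
            "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
        }
    )
    response = requests.post("https://u.gg/api", headers={"Content-Type": "application/json"}, data=payload)
    players = response.json()["data"]["leaderboardPage"]["players"]
    return [(p["summonerName"], p["tier"], p["lp"]) for p in players]

def get_summoner_data(page_count, regions):
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(gather_summoner_page, region, page + 1)
                   for region in regions for page in range(page_count)]
        for future in futures:
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)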

Related

How to compare between two requests using Thread

I have created a simple threaded request script:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]

def doScrape(response):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    return {
        'url': response.url,
        'repository_results': t.text.strip()
    }

def doRequest(url):
    response = requests.get(url)
    time.sleep(random.randint(1, 3))
    return response

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url
            ) for url in URLS]
        for future in as_completed(future_tasks):
            response = future.result()
            if response.status_code == 200:
                result = doScrape(response)
                print(result)

while True:
    t = threading.Thread(target=ourLoop, )
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
Here I first start a thread that runs a ThreadPoolExecutor with 2 workers. The idea is that I want to monitor 24/7 and notify myself whenever there has been a change (in this case, whenever repository_results differs between the previous request and the latest request); whenever there is a change, I want to print a message saying there is a difference.
How can I do that using ThreadPoolExecutor, and how can I monitor a specific URL to see whether a change has happened or not?
You can do this by storing the previous results in the list itself and passing them, along with the response, to doScrape:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    result = {'url': url_dict['url'], 'repository_results': None, 'change': False}
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current result, set 'change' to True;
    # the only exception is when the previous result was None, i.e. the first run
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        result['change'] = True
    result['repository_results'] = current_response
    return result

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                result = doScrape(response, url_dict)
                print(result)
                if result['change']:
                    print(f'Changed for url : {result["url"]}!')

while True:
    t = threading.Thread(target=ourLoop, )
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
The only case where this fails is when the change happened before the very first run of the loop, since we would not know the previous value of the scraped element.
Also, if you are planning to run this in a loop and only want to print when there is a change, make sure to update the repository_results key in the url_dict itself (inside doScrape); you can omit the return statement as well:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]

# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
    url_.append({'url': url, 'repository_results': None})

def doScrape(response, url_dict):
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    # If the previous result does not match the current result, report a change;
    # the only exception is when the previous result was None, i.e. the first run
    if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
        print(f'Changed for url : {url_dict["url"]}')
    url_dict['repository_results'] = current_response

def doRequest(url_dict):
    response = requests.get(url_dict['url'])
    time.sleep(random.randint(1, 3))
    return response, url_dict

def ourLoop():
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_tasks = [
            executor.submit(
                doRequest,
                url_dict
            ) for url_dict in url_]
        for future in as_completed(future_tasks):
            response, url_dict = future.result()
            if response.status_code == 200:
                doScrape(response, url_dict)

while True:
    t = threading.Thread(target=ourLoop, )
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
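One practical note, not part of the answer above: the outer while True loop starts the next pass as soon as the previous one finishes, so for 24/7 monitoring it may be worth pausing between passes. A minimal sketch; the interval value is an arbitrary placeholder.

import threading
import time

POLL_INTERVAL_SECONDS = 300  # placeholder: wait 5 minutes between monitoring passes

while True:
    t = threading.Thread(target=ourLoop)
    t.start()
    print('Joining thread and waiting for it to finish...')
    t.join()
    time.sleep(POLL_INTERVAL_SECONDS)  # pause before the next pass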

How to make selenium threads run (each thread with its own driver)

I have a Python 3 script that needs to make thousands of requests to multiple different websites and check whether their source code passes some predefined rules.
I am using Selenium to make the requests because I need the source code after the JS finishes executing, but due to the high number of URLs I need to check, I am trying to run multiple threads simultaneously. Each thread creates and maintains its own webdriver instance to make the requests. The problem is that after a while all threads go silent and simply stop executing, leaving just a single thread doing all the work. Here is the relevant part of my code:
def get_browser(use_firefox=True):
    if use_firefox:
        options = FirefoxOptions()
        options.headless = True
        browser = webdriver.Firefox(options=options)
        browser.implicitly_wait(4)
        return browser
    else:
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless")
        browser = webdriver.Chrome(chrome_options=chrome_options)
        browser.implicitly_wait(4)
        return browser

def start_validations(urls, rules, results, thread_id):
    try:
        log("thread %s started" % thread_id, thread_id)
        browser = get_browser(thread_id % 2 == 1)
        while not urls.empty():
            url = "http://%s" % urls.get()
            try:
                log("starting %s" % url, thread_id)
                browser.get(url)
                time.sleep(0.5)
                WebDriverWait(browser, 6).until(selenium_wait_reload(4))
                html = browser.page_source
                result = check_url(html, rules)
                original_domain = url.split("://")[1].split("/")[0].replace("www.", "")
                tested_domain = browser.current_url.split("://")[1].split("/")[0].replace("www.", "")
                redirected_url = "" if tested_domain == original_domain else browser.current_url
                results.append({"Category": result, "URL": url, "Redirected": redirected_url})
                log("finished %s" % url, thread_id)
            except Exception as e:
                log("couldn't test url %s" % url, thread_id)
                log(str(e), thread_id)
                results.append({"Category": "Connection Error", "URL": url, "Redirected": ""})
                browser.quit()
                time.sleep(2)
                browser = get_browser(thread_id % 2 == 1)
    except Exception as e:
        log(str(e), thread_id)
    finally:
        log("closing thread", thread_id)
        browser.quit()

def calculate_progress(urls):
    progress_folder = "%sprogress/" % WEBROOT
    if not os.path.exists(progress_folder):
        os.makedirs(progress_folder)
    initial_size = urls.qsize()
    while not urls.empty():
        current_size = urls.qsize()
        on_queue = initial_size - current_size
        progress = '{0:.0f}'.format((on_queue / initial_size * 100))
        for progress_file in os.listdir(progress_folder):
            file_path = os.path.join(progress_folder, progress_file)
            if os.path.isfile(file_path) and not file_path.endswith(".csv"):
                os.unlink(file_path)
        os.mknod("%s%s" % (progress_folder, progress))
        time.sleep(1)

if __name__ == '__main__':
    while True:
        try:
            log("scraper started")
            if os.path.isfile(OUTPUT_FILE):
                os.unlink(OUTPUT_FILE)
            manager = Manager()
            rules = fetch_rules()
            urls = manager.Queue()
            fetch_urls()
            results = manager.list()
            jobs = []
            p = Process(target=calculate_progress, args=(urls,))
            jobs.append(p)
            p.start()
            for i in range(THREAD_POOL_SIZE):
                log("spawning thread with id %s" % i)
                p = Process(target=start_validations, args=(urls, rules, results, i))
                jobs.append(p)
                p.start()
                time.sleep(2)
            for j in jobs:
                j.join()
            save_results(results, OUTPUT_FILE)
            log("scraper finished")
        except Exception as e:
            log(str(e))
As you can see, at first I thought I could have only one instance of each browser, so I tried to run at least Firefox and Chrome in parallel, but this still leaves only one thread doing all the work.
Sometimes the driver crashed and the thread stopped working even though it is inside a try/except block, so I started getting a new instance of the browser every time this happens, but it still didn't work. I also tried waiting a few seconds between creating each instance of the driver, still with no results.
Here is a pastebin of one of the log files:
https://pastebin.com/TsjZdRYf
A strange thing I noticed is that almost every time, the only thread that keeps running is the last one spawned (with id 3).
Thanks for your time and your help!
EDIT:
[1] Here is the full code: https://pastebin.com/fvVPwPVb
[2] custom selenium wait condition: https://pastebin.com/Zi7nbNFk
Am I allowed to curse on SO? I solved the problem, and I don't think this answer should exist on SO because nobody else will benefit from it. The problem was a custom wait condition that I had created. This class is in the pastebin added in edit 2, but I'll also add it here for convenience:
import time

class selenium_wait_reload:
    def __init__(self, desired_repeating_sources):
        self.desired_repeating_sources = desired_repeating_sources
        self.repeated_pages = 0
        self.previous_source = None

    def __call__(self, driver):
        while True:
            current_source = driver.page_source
            if current_source == self.previous_source:
                self.repeated_pages = self.repeated_pages + 1
                if self.repeated_pages >= self.desired_repeating_sources:
                    return True
            else:
                self.previous_source = current_source
                self.repeated_pages = 0
            time.sleep(0.3)
The goal of this class was to make Selenium wait, because the JS could still be loading additional DOM.
So the class makes Selenium wait a short time and check the source, wait a little again and check the source again, repeating until the source code is the same 3 times in a row.
The problem is that some pages have a JS carousel, so the source code is never the same. I thought that in cases like this the WebDriverWait timeout would make it fail with a TimeoutException. I was wrong: since __call__ loops internally and never returns, WebDriverWait never gets the chance to check its timeout.
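For reference, one way to keep the timeout working (a sketch, not from the original answer) is to drop the internal loop and let WebDriverWait do the polling, so a page whose source never stabilizes eventually raises TimeoutException:

class selenium_wait_reload:
    # Return True once page_source has been identical the desired number of times in a row.
    # Unlike the version above, __call__ checks the source once and returns, so
    # WebDriverWait controls the polling interval and can enforce its timeout.
    def __init__(self, desired_repeating_sources):
        self.desired_repeating_sources = desired_repeating_sources
        self.repeated_pages = 0
        self.previous_source = None

    def __call__(self, driver):
        current_source = driver.page_source
        if current_source == self.previous_source:
            self.repeated_pages += 1
        else:
            self.previous_source = current_source
            self.repeated_pages = 0
        return self.repeated_pages >= self.desired_repeating_sources

# Usage sketch: poll every 0.3 s, give up after 6 s with a TimeoutException.
# WebDriverWait(browser, 6, poll_frequency=0.3).until(selenium_wait_reload(4))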

How to scrape all the test match details in cricinfo

I am trying to scrape all the Test match details, but it is showing HTTP Error 504: Gateway Timeout. I am getting the details of some Test matches before it fails. I have used bs4 to scrape the Test match details from Cricinfo.
I need to scrape the details of 2000 Test matches; this is my code:
import os
import time
import unicodedata
import urllib.request as req
from urllib.parse import urljoin

from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'

if not os.path.exists('./espncricinfo-fc'):
    os.mkdir('./espncricinfo-fc')

for i in range(0, 2000):
    soupy = BeautifulSoup(req.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except:
            continue
        odiurl = BASE_URL + urljoin(BASE_URL, new_host)
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore')
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
                f.write(html)
            print(html)
        else:
            print("no html")
This usually happens when making multiple requests too fast. It can be that the server is down or that your connection is blocked by the server's firewall; try increasing your sleep() or adding a random sleep.
import random
.....
for i in range(0, 2000):
    soupy = BeautifulSoup(....)
    time.sleep(random.randint(2, 6))
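If the 504s persist even with a longer sleep, another option (a sketch, not taken from the answers here; fetch_with_retry is an illustrative helper name) is to retry a failed page a few times with an increasing delay before giving up:

import random
import time

import requests

def fetch_with_retry(url, max_attempts=4):
    # Retry transient gateway errors with an increasing, slightly randomized delay.
    for attempt in range(max_attempts):
        response = requests.get(url)
        if response.status_code not in (502, 503, 504):
            return response
        wait = (2 ** attempt) + random.random()
        print("Got %s for %s, retrying in %.1fs" % (response.status_code, url, wait))
        time.sleep(wait)
    return response  # still failing after max_attempts; caller can inspect status_code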
Not sure why, but it seems to be working for me.
I made a few changes to the loop through the links. I'm not sure how you want the output to look when it's written to your file, so I left that part alone. But like I said, it seems to be working OK on my end.
import bs4
import requests
import os
import time
import urllib.request as req

BASE_URL = 'http://www.espncricinfo.com'

if not os.path.exists('C:/espncricinfo-fc'):
    os.mkdir('C:/espncricinfo-fc')

for i in range(0, 2000):
    url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=%s' % i
    html = requests.get(url)
    print('Checking page %s of 2000' % (i + 1))
    soupy = bs4.BeautifulSoup(html.text, 'html.parser')
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except:
            continue
        odiurl = BASE_URL + new_host
        new_host = odiurl
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            with open('C:/espncricinfo-fc/{0!s}'.format('_'.join(str.split(new_host, "/")[4:])), "wb") as f:
                f.write(html)
            #print(html)
        else:
            print("no html")

Python: Facebook Graph API - batch request

I want to make a batch request that gets the campaigns for a specific ad account. I created a simple script based on this issue, but I've used some global arrays, and I don't know whether time.sleep(2) is necessary for this code. My code is below:
from facebookads import FacebookAdsApi
from facebookads.api import FacebookRequest
import pandas as pd
import time

batch_body_responses = []
list_of_artists = [1]

def success_callback(response):
    try:
        pair = [response.json()['data']]
        next = [response.json()['paging']['next']]
        batch_body_responses.append(pair)
        batch_body_responses.append(next)
    except IndexError:
        pass
    except UnicodeEncodeError:
        pass

def error_callback(response):
    pass

def generate_batches(iterable, batch_size_limit):
    # This function can be found in examples/batch_utils.py
    batch = []
    for item in iterable:
        if len(batch) == batch_size_limit:
            yield batch
            batch = []
        batch.append(item)
    if len(batch):
        yield batch

def get_id_list(art_search_list):
    batches = []
    your_app_id = '756885'
    your_app_secret = '123456789'
    your_access_token = 'EAA.....'
    api = FacebookAdsApi.init(your_app_id, your_app_secret, your_access_token)
    batch_limit = 25
    for batch in generate_batches(art_search_list, batch_limit):
        next_batch = api.new_batch()
        for artt in batch:
            requestss = [FacebookRequest(node_id='act_1234/campaigns', method="GET", endpoint="?fields=id,name")]
            for req in requestss:
                next_batch.add_request(req, success_callback, error_callback)
        batches.append(next_batch)
    for batch_request in batches:
        batch_request.execute()
        time.sleep(2)
    print(batch_body_responses)
    return batch_body_responses

df = pd.DataFrame(get_id_list(list_of_artists))
How can this code be optimized to avoid using the global arrays, how can it run without the sleep statement, and why is the sleep needed?
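One way to avoid the module-level list, offered here as a sketch rather than an existing answer, is to build the success callback as a closure over a local list, so each call to get_id_list collects into its own results; make_success_callback is an illustrative helper name.

def make_success_callback(results):
    # The returned callback closes over `results`, so no module-level list is needed.
    def success_callback(response):
        try:
            body = response.json()
            results.append([body['data']])
            results.append([body['paging']['next']])
        except (KeyError, IndexError, UnicodeEncodeError):
            pass
    return success_callback

Inside get_id_list you would create results = [], pass make_success_callback(results) to next_batch.add_request in place of success_callback, and return results at the end. As for time.sleep(2): the batch API itself does not require it; it only spaces out the batch executions, which is usually done to stay clear of rate limits, so whether it can be dropped depends on how many batches you run.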

How can I increase the speed of this python requests session?

I am using Anaconda - Python 3.5.2
I have a list of 280,000 URLs.
I am grabbing the data and trying to keep track of the URL-to-data mapping.
I've made about 30K requests so far, averaging 1 request per second.
response_df = pd.DataFrame()

# create the session
with requests.Session() as s:
    # loop through the list of urls
    for url in url_list:
        # call the resource
        resp = s.get(url)
        # check the response
        if resp.status_code == requests.codes.ok:
            # create a new dataframe with the response
            ftest = json_normalize(resp.json())
            ftest['url'] = url
            response_df = response_df.append(ftest, ignore_index=True)
        else:
            print("Something went wrong! Hide your wife! Hide the kids!")

response_df.to_csv(results_csv)
I ended up ditching requests and used asyncio and aiohttp instead. I was pulling about 1 request per second with requests. The new method averages about 5 per second and only uses about 20% of my system resources. I ended up using something very similar to this:
https://www.blog.pythonlibrary.org/2016/07/26/python-3-an-intro-to-asyncio/
import aiohttp
import asyncio
import async_timeout
import os

async def download_coroutine(session, url):
    with async_timeout.timeout(10):
        async with session.get(url) as response:
            filename = os.path.basename(url)
            with open(filename, 'wb') as f_handle:
                while True:
                    chunk = await response.content.read(1024)
                    if not chunk:
                        break
                    f_handle.write(chunk)
            return await response.release()

async def main(loop):
    urls = ["http://www.irs.gov/pub/irs-pdf/f1040.pdf",
            "http://www.irs.gov/pub/irs-pdf/f1040a.pdf",
            "http://www.irs.gov/pub/irs-pdf/f1040ez.pdf",
            "http://www.irs.gov/pub/irs-pdf/f1040es.pdf",
            "http://www.irs.gov/pub/irs-pdf/f1040sb.pdf"]
    async with aiohttp.ClientSession(loop=loop) as session:
        for url in urls:
            await download_coroutine(session, url)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))
Also, these were helpful:
https://snarky.ca/how-the-heck-does-async-await-work-in-python-3-5/
http://www.pythonsandbarracudas.com/blog/2015/11/22/developing-a-computational-pipeline-using-the-asyncio-module-in-python-3
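Note that the example above awaits each download inside a for loop, so the coroutines actually run one after another; with a large URL list the speedup comes from running many requests concurrently. A minimal sketch (assuming aiohttp; the concurrency limit and placeholder URLs are arbitrary):

import asyncio

import aiohttp

async def fetch(session, url, semaphore):
    # The semaphore caps how many requests are in flight at once.
    async with semaphore:
        async with session.get(url) as response:
            return url, await response.text()

async def main(urls, max_concurrency=20):
    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url, semaphore) for url in urls]
        return await asyncio.gather(*tasks)

if __name__ == '__main__':
    urls = ['https://www.example.com/'] * 5  # placeholder URLs
    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(main(urls))
    print(len(results))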
