I have created a simple thread request code:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
# GitHub search-result pages to poll; ourLoop() submits one request per entry.
URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]
def doScrape(response):
    """Pull the result-count heading out of a fetched search page.

    Parses ``response.text`` as HTML, locates the first ``<h3>`` inside the
    ``div`` with class ``codesearch-results`` and returns its stripped text
    together with the URL the response came from.
    """
    page = BeautifulSoup(response.text, 'html.parser')
    results_div = page.find("div", {"class": "codesearch-results"})
    heading = results_div.find("h3")
    return {'url': response.url, 'repository_results': heading.text.strip()}
def doRequest(url):
    """GET *url*, then pause for 1-3 seconds before handing back the response."""
    fetched = requests.get(url)
    pause_seconds = random.randint(1, 3)
    time.sleep(pause_seconds)
    return fetched
def ourLoop():
    """Fetch every entry of ``URLS`` on a two-worker pool and print each scrape.

    Responses are handled in completion order; any response whose status
    code is not 200 is skipped.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        pending = []
        for target in URLS:
            pending.append(pool.submit(doRequest, target))
        for finished in as_completed(pending):
            reply = finished.result()
            if reply.status_code != 200:
                continue
            result = doScrape(reply)
            print(result)
# Run one polling pass after another, forever. Each pass runs in its own
# thread, and the next pass starts only once the current one has finished.
while True:
    worker = threading.Thread(target=ourLoop)
    worker.start()
    print('Joining thread and waiting for it to finish...')
    worker.join()
where I first start a thread with a ThreadPoolExecutor that has 2 workers. The idea is that I want to be able to monitor 24/7 and notify myself whenever there has been a change (in this case, if repository_results differs between the previous request and the latest request) - whenever there is a change, I want to print a message saying that there is a difference.
I wonder how I can do that using ThreadPoolExecutor, and how I can monitor a specific url to see whether a change has happened or not?
You can do this by storing the previous results in the list itself, and passing that along with the response to doScrape:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]
# One tracking record per URL; the previous result starts out as None
# because nothing has been scraped yet.
url_ = [{'url': address, 'repository_results': None} for address in URLS]
def doScrape(response, url_dict):
    """Scrape the result-count heading and report whether it changed.

    Args:
        response: Fetched page; its ``text`` is parsed as HTML.
        url_dict: Tracking record for the URL with keys ``'url'`` and
            ``'repository_results'`` (the previously seen heading, or
            ``None`` before the first successful scrape). It is updated in
            place with the heading seen on this call.

    Returns:
        A dict with ``'url'``, ``'respository_results'`` (the current heading
        text) and ``'change'`` (``True`` when the heading differs from the
        previously recorded one).
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()

    previous_response = url_dict['repository_results']
    # The very first scrape has nothing to compare against, so it never
    # counts as a change.
    changed = previous_response is not None and current_response != previous_response

    # Remember what we saw; without this the previous value stays None
    # forever and no change could ever be detected on later passes.
    url_dict['repository_results'] = current_response

    # NOTE: the misspelled 'respository_results' key is kept in the returned
    # dict so that existing consumers of the result keep working.
    return {
        'url': url_dict['url'],
        'respository_results': current_response,
        'change': changed,
    }
def doRequest(url_dict):
    """GET the record's URL, pause 1-3 seconds, and return ``(response, url_dict)``."""
    target = url_dict['url']
    fetched = requests.get(target)
    time.sleep(random.randint(1, 3))
    return fetched, url_dict
def ourLoop():
    """Run one polling pass over every record in ``url_`` on a two-worker pool.

    Each 200 response is scraped and its result printed; an extra line is
    printed when the result is flagged as changed.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        pending = [pool.submit(doRequest, entry) for entry in url_]
        for finished in as_completed(pending):
            reply, entry = finished.result()
            if reply.status_code != 200:
                continue
            outcome = doScrape(reply, entry)
            print(outcome)
            if outcome['change']:
                print(f'Changed for url : {outcome["url"]}!')
# Run one polling pass after another, forever. Each pass runs in its own
# thread, and the next pass starts only once the current one has finished.
while True:
    worker = threading.Thread(target=ourLoop)
    worker.start()
    print('Joining thread and waiting for it to finish...')
    worker.join()
The only exception where this fails is if the change happened at the very first time you are running the loop, since we would not know the previous value of the scraped element.
Also, if you are planning to run this in a loop and only want to print when there is a change, make sure to update the repository_results key in the url_dict itself (inside doScrape); you can then omit the return of the result as well:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
    'https://github.com/search?q=hello+world',
    'https://github.com/search?q=python+3',
    'https://github.com/search?q=world',
    'https://github.com/search?q=i+love+python',
    'https://github.com/search?q=sport+today',
    'https://github.com/search?q=how+to+code',
    'https://github.com/search?q=banana',
    'https://github.com/search?q=android+vs+iphone',
    'https://github.com/search?q=please+help+me',
    'https://github.com/search?q=batman',
]
# One tracking record per URL; the previous result starts out as None
# because nothing has been scraped yet.
url_ = [dict(url=address, repository_results=None) for address in URLS]
def doScrape(response, url_dict):
    """Scrape the result-count heading and announce it when it has changed.

    Args:
        response: Fetched page; its ``text`` is parsed as HTML.
        url_dict: Tracking record for the URL with keys ``'url'`` and
            ``'repository_results'`` (the previously seen heading, or
            ``None`` before the first successful scrape). It is updated in
            place with the heading seen on this call.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()

    previous_response = url_dict['repository_results']
    # The very first scrape has nothing to compare against, so it never
    # counts as a change.
    if previous_response is not None and current_response != previous_response:
        print(f'Changed for url : {url_dict["url"]}')

    # Store under the same key that is compared above. The original wrote to
    # a misspelled 'respository_results' key, so the compared value stayed
    # None and no change was ever reported.
    url_dict['repository_results'] = current_response
def doRequest(url_dict):
    """GET the record's URL, pause 1-3 seconds, and return ``(response, url_dict)``."""
    target = url_dict['url']
    fetched = requests.get(target)
    time.sleep(random.randint(1, 3))
    return fetched, url_dict
def ourLoop():
    """Run one polling pass over every record in ``url_`` on a two-worker pool.

    Responses are handled in completion order; each 200 response is passed
    to ``doScrape`` together with its tracking record.
    """
    with ThreadPoolExecutor(max_workers=2) as pool:
        pending = [pool.submit(doRequest, entry) for entry in url_]
        for finished in as_completed(pending):
            reply, entry = finished.result()
            if reply.status_code != 200:
                continue
            doScrape(reply, entry)
# Run one polling pass after another, forever. Each pass runs in its own
# thread, and the next pass starts only once the current one has finished.
while True:
    worker = threading.Thread(target=ourLoop)
    worker.start()
    print('Joining thread and waiting for it to finish...')
    worker.join()
Related
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent.futures import ThreadPoolExecutor
import time
# Current time is :48.77885s per Page, 4.4344 per Region
# Chrome options: pass the --headless flag so the browser runs without a window.
options = Options()
options.add_argument("--headless")
# A single module-level driver; gather_summoner_info() below uses this one
# instance for every URL it is given.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
def list_to_csv(summoner_info):
    """Write the de-duplicated summoner rows to ``high_elo_summoners.csv``.

    Every distinct row contributes one ``field0,field1,field2`` line and the
    file is overwritten on each call. Rows are emitted in set iteration
    order, so no particular ordering is guaranteed.
    """
    unique_rows = {tuple(row) for row in summoner_info}
    with open('high_elo_summoners.csv', 'w', encoding='utf-8') as out:
        out.writelines(f"{row[0]},{row[1]},{row[2]}\n" for row in unique_rows)
def gather_summoner_info(url):
    """Load *url* in the module-level driver and collect one row per table row.

    Every element with class ``rt-tr`` except the first is split into lines;
    lines 1 and 2 are kept as-is and the leading number of line 3 (with
    thousands separators removed) is converted to ``int``.
    """
    driver.get(url)
    driver.implicitly_wait(5)  # allow up to 5 s for elements to appear during lookups
    rows = driver.find_elements(By.CLASS_NAME, 'rt-tr')
    collected = []
    for position, row in enumerate(rows):
        if position == 0:
            # The first 'rt-tr' element is skipped (presumably the header row).
            continue
        fields = row.text.split('\n')
        collected.append([fields[1], fields[2], int(fields[3].split(' ')[0].replace(',', ''))])
    return collected
def get_summoner_data(page_count, regions):
    """Scrape ``page_count`` leaderboard pages for each region and save them as CSV.

    One task per (page, region) URL is submitted to a 20-worker thread pool;
    the rows from all tasks are concatenated and handed to ``list_to_csv``.
    """
    links = []
    for page in range(page_count):
        for region in regions:
            links.append(f'https://u.gg/leaderboards/ranking?region={region}&page={page + 1}')

    # Gather all the relevant summoner information from every page.
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures_by_url = {link: executor.submit(gather_summoner_info, link) for link in links}
        for pending in futures_by_url.values():
            agg_summoner_info.extend(pending.result())
    list_to_csv(agg_summoner_info)
def main():
    """Scrape the first leaderboard page of every listed region."""
    regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
    page_count = 1
    get_summoner_data(page_count, regions)


if __name__ == '__main__':
    # Time the whole run and print the elapsed seconds.
    started = time.perf_counter()
    main()
    finished = time.perf_counter()
    print(finished - started)
Issue: Code is returning the same output for each iteration (The first link of the links list)
The code above pulls some information from each URL in the links variable using selenium. The issue is that when the threads execute in the get_summoner_data() function, every call returns the same results. I'm not sure where the issue is coming from, since a different link is printed from each gather_summoner_info() call.
Currently it is just returning the information from the very first link. Not sure what is causing the issue, any help is appreciated.
Approach
Try running without --headless option. You will see what's going on.
Problem
You created only one instance of the web driver, and that single instance is used by all the threaded tasks. Multiple threads try to load different URLs in this one driver, so every task most likely ends up reading whichever URL was loaded last.
Fix
Simple fix is to create a driver instance for every thread.
You can do this by moving the line creating a web driver into the thread task function gather_summoner_info as below. I tried with this fix and it works correctly.
def gather_summoner_info(url):
    """Load *url* in a dedicated Chrome driver and collect one row per table row.

    Every element with class ``rt-tr`` except the first is split into lines;
    lines 1 and 2 are kept as-is and the leading number of line 3 (with
    thousands separators removed) is converted to ``int``.

    Args:
        url: Leaderboard page to load.

    Returns:
        A list of ``[field1, field2, number]`` rows.
    """
    # One driver per call: a driver shared between threads ends up serving
    # whichever URL was requested last to every task.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.get(url)
        driver.implicitly_wait(5)  # allow up to 5 s for elements to appear during lookups
        summoner_info = []
        content = driver.find_elements(By.CLASS_NAME, "rt-tr")
        for index, con in enumerate(content):
            if index == 0:
                # The first 'rt-tr' element is skipped (presumably the header row).
                continue
            summoner = con.text.split("\n")
            summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
        return summoner_info
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # process running.
        driver.quit()
Further Consideration
As you know, creating a new web driver instance is resource expensive. If you are just trying to scrape information, HTTP requests are enough most of the times.
For the website you are trying to scrape, I found that the job can be done using HTTP requests only. I revised the script without using Selenium and it takes less than 1 second to load all the leaderboards for all regions.
import json
import time
from concurrent.futures import ThreadPoolExecutor
import requests
def list_to_csv(summoner_info):
    """Write the rows to ``result.csv``, highest third field first.

    Args:
        summoner_info: Iterable of rows whose third field is numeric or a
            numeric string; it is the sort key (descending).

    The file is overwritten. Rows are comma-joined and newline-separated,
    with no trailing newline.
    """
    ranked = sorted(summoner_info, key=lambda row: int(row[2]), reverse=True)
    with open("result.csv", "w", encoding="utf-8") as f:
        # str() every field so that non-string values (e.g. an integer lp)
        # do not make ",".join raise TypeError.
        f.write("\n".join(",".join(str(field) for field in row) for row in ranked))
def gather_summoner_info(region: str, page: int = 1):
    """Fetch one leaderboard page for *region* from the u.gg GraphQL endpoint.

    Args:
        region: Region identifier sent as ``regionId`` (e.g. ``"na1"``).
        page: 1-based leaderboard page to request; defaults to the first page.

    Returns:
        A list of ``(summonerName, tier, lp)`` tuples, one per player in the
        response.
    """
    payload = json.dumps(
        {
            "operationName": "getRankedLeaderboard",
            "variables": {"page": page, "queueType": 420, "regionId": region},
            "query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
        }
    )
    headers = {"Content-Type": "application/json"}
    response = requests.post("https://u.gg/api", headers=headers, data=payload)
    data = response.json()
    return [
        (player["summonerName"], player["tier"], player["lp"])
        for player in data["data"]["leaderboardPage"]["players"]
    ]


def get_summoner_data(page_count, regions):
    """Fetch ``page_count`` leaderboard pages per region and save them as CSV.

    Args:
        page_count: Number of pages to request per region (pages 1..page_count).
        regions: Region identifiers to query.

    One task per (region, page) pair is submitted to a 20-worker thread
    pool; the rows from all tasks are concatenated and handed to
    ``list_to_csv``.
    """
    agg_summoner_info = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        # Honour page_count instead of always fetching page 1 only.
        future_results = {
            (region, page): executor.submit(gather_summoner_info, region, page)
            for page in range(1, page_count + 1)
            for region in regions
        }
        for future in future_results.values():
            agg_summoner_info.extend(future.result())
    list_to_csv(agg_summoner_info)
def main():
    """Fetch the first leaderboard page of every listed region."""
    regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
    page_count = 1
    get_summoner_data(page_count, regions)


if __name__ == "__main__":
    # Time the whole run and print the elapsed seconds.
    started = time.perf_counter()
    main()
    finished = time.perf_counter()
    print(finished - started)
import time
from bs4 import BeautifulSoup
import requests
from urllib.request import Request, urlopen
# Collect the title (the <img> alt text) of every listed video on each page,
# then send each title to a Telegram chat.
pages = ["movies", "series"]
printed = []
for page in pages:
    req = Request("https://www.thenetnaija.com/videos/" + page, headers={'User-Agent': 'XYZ/3.0'})
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(req, timeout=10) as webpage:
        b4 = BeautifulSoup(webpage, "html.parser")
    movie_list = b4.find_all("div", {"class": "video-files"})
    for allContainers in movie_list:
        filmName = allContainers.find('img').get('alt')
        printed.append(filmName)
print(printed)
for get in printed:
    # Pass the text through `params` so requests URL-encodes it; titles that
    # contain spaces, '&' or '#' would otherwise corrupt the query string.
    requests.get(
        "https://api.telegram.org/bot:AAEapVykIXdphGYaH5ZjXuhpFaFw7wpi5Bs/sendMessage",
        params={"chat_id": "", "text": get},
    )
I want to use a while loop to let the program run infinitely and only send the requests to my telegram chat if the data in the list has changed.
You can use this example as a basis how to check the movies/series periodically (The example is using set.difference to determine if there are changes):
import time
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
def get_movies(url):
    """Return the set of titles currently listed on *url*.

    Args:
        url: Page to fetch.

    Returns:
        A set with the stripped text of every ``<a>`` inside an ``<h2>``.
    """
    headers = {"User-Agent": "XYZ/3.0"}
    req = Request(url, headers=headers)
    # This function is polled repeatedly, so close each response explicitly
    # rather than leaving the connection to the garbage collector.
    with urlopen(req, timeout=10) as page:
        b4 = BeautifulSoup(page, "html.parser")
    return {a.get_text(strip=True) for a in b4.select("h2 a")}
url = "https://www.thenetnaija.com/videos/{}"
# Baseline: the titles seen per category when the script starts.
pages = {category: get_movies(url.format(category)) for category in ("movies", "series")}
while True:
    time.sleep(10)  # wait 10 seconds between checks
    for category, known in pages.items():
        latest = get_movies(url.format(category))
        added = latest - known
        if not added:
            print("No new {}".format(category))
            continue
        print("New {}:".format(category))
        print(added)
        # Replace the baseline so the same titles are not reported again.
        pages[category] = latest
        # do stuff here (post to telegram etc.)
        # ...
I am using Anaconda - Python 3.5.2
I have a list of 280,000 urls.
I am grabbing the data and trying to keep track of the url-to-data.
I've made about 30K requests. I am averaging 1 request per second.
# Collect one frame per successful response and concatenate once at the end:
# appending to a DataFrame inside the loop copies all accumulated rows on
# every iteration (and DataFrame.append no longer exists in pandas 2.x).
frames = []
# create the session
with requests.Session() as s:
    # loop through the list of urls
    for url in url_list:
        # call the resource
        resp = s.get(url)
        # check the response
        if resp.status_code == requests.codes.ok:
            # create a new dataframe with the response, tagged with its url
            ftest = json_normalize(resp.json())
            ftest['url'] = url
            frames.append(ftest)
        else:
            print("Something went wrong! Hide your wife! Hide the kids!")
# pd.concat raises on an empty list, so fall back to an empty frame.
response_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
response_df.to_csv(results_csv)
I ended up ditching requests, I used async and aiohttp instead. I was pulling about 1 per second with requests. The new method averages about 5 per second, and only utilizes about 20% of my system resources. I ended up using something very similar to this:
https://www.blog.pythonlibrary.org/2016/07/26/python-3-an-intro-to-asyncio/
import aiohttp
import asyncio
import async_timeout
import os
async def download_coroutine(session, url):
    """Stream *url* into a local file named after the URL's last path segment.

    The body is read in 1024-byte chunks under an ``async_timeout.timeout(10)``
    block, and the response is released once the file has been written.
    """
    with async_timeout.timeout(10):
        async with session.get(url) as response:
            target_name = os.path.basename(url)
            with open(target_name, 'wb') as sink:
                block = await response.content.read(1024)
                while block:
                    sink.write(block)
                    block = await response.content.read(1024)
            return await response.release()
async def main(loop):
    """Download the listed PDF files one after another over a single session."""
    pdf_urls = [
        "http://www.irs.gov/pub/irs-pdf/f1040.pdf",
        "http://www.irs.gov/pub/irs-pdf/f1040a.pdf",
        "http://www.irs.gov/pub/irs-pdf/f1040ez.pdf",
        "http://www.irs.gov/pub/irs-pdf/f1040es.pdf",
        "http://www.irs.gov/pub/irs-pdf/f1040sb.pdf",
    ]
    async with aiohttp.ClientSession(loop=loop) as session:
        for pdf_url in pdf_urls:
            # Each download is awaited before the next one starts.
            await download_coroutine(session, pdf_url)
if __name__ == '__main__':
    # Drive the main coroutine to completion on the default event loop.
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main(event_loop))
also, this was helpful:
https://snarky.ca/how-the-heck-does-async-await-work-in-python-3-5/
http://www.pythonsandbarracudas.com/blog/2015/11/22/developing-a-computational-pipeline-using-the-asyncio-module-in-python-3
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
def get_total_pages():
    """Build the 26 page URLs and scrape them on an 8-thread pool.

    NOTE(review): ``base_url`` is a placeholder here; it must contain a
    ``%``-format field for the page number before this can run.
    """
    base_url = 'Your group '
    tut = [base_url % page_number for page_number in range(1, 27)]
    print(tut)
    workers = ThreadPool(8)
    # pool.map calls get_data_from_page once per URL in tut.
    results = workers.map(get_data_from_page, tut)
def get_data_from_page(tut):
    """Scrape the e-mail entries from one page and append them to emails.txt.

    Args:
        tut: URL of a single page. ``pool.map`` passes one URL per call, so
            it must not be iterated (iterating a string yields single
            characters, which caused the "Invalid URL 'h'" error).

    Returns:
        The list of texts found in ``span.inviz`` elements inside
        ``div.mens``; empty when the container is missing.
    """
    link = s.get(tut).text
    soup = BeautifulSoup(link, 'lxml')
    container = soup.find('div', class_="mens")
    # A page without the expected container yields no entries instead of an
    # AttributeError on None.
    links = container.find_all('span', class_="inviz") if container is not None else []
    email = [e.text for e in links]
    # Use a context manager so the file handle is closed after every call.
    with open("emails.txt", 'a') as f:
        for emails in email:
            f.write(emails + ', ')
    print(email)
    return email
def main():
    """Entry point: scrape all pages."""
    get_total_pages()


if __name__ == '__main__':
    # Only run the scrape when executed as a script.
    main()
This results in an error saying it only works with multiprocessing, and:
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
The problem was in this loop:
for a in tut:
link = s.get(a).text
What was needed was just the request itself, without the loop (pool.map already passes a single URL per call):
link = s.get(a).text
#without for
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
#s = Session()
def get_photo_from_page():
    """Download the first four photos of the page as img1.jpg .. img4.jpg.

    The image URL is the part of each ``img.ph_img`` element's
    ``data-src_big`` attribute before the first ``|``.
    """
    r = requests.get('https://vk.com/uporols_you').text
    soup = BeautifulSoup(r, 'lxml')
    tut = [a.get('data-src_big').split('|')[0] for a in soup.find_all('img', class_="ph_img")]
    y = "img%s.jpg"
    filenames = [y % i for i in range(1, 5)]
    # zip() pairs each link with its own file name. Iterating over the bare
    # tuple `tut, filenames` tried to unpack a whole list into two names and
    # raised "too many values to unpack".
    for t, im in zip(tut, filenames):
        p = requests.get(t)
        with open(im, "wb") as out:
            out.write(p.content)
def main():
    """Entry point: download the photos."""
    get_photo_from_page()


if __name__ == '__main__':
    # Only run the download when executed as a script.
    main()
error from cmd for t, im in tut, [y % i for i in range(1,5)]:
ValueError: too many values to unpack (expected 2)
> I need a list of file names that corresponds one-to-one to the URLs, so
that each URL is requested and every image is saved under a new name.
With separate loops, it always takes the last available link and stores
it as many times as the loop runs.
import requests
from requests import Session
from bs4 import BeautifulSoup
import re
from multiprocessing.dummy import Pool as ThreadPool
#s = Session()
def get_photo_from_page():
    """Download every photo of the page as img1.jpg, img2.jpg, ...

    The image URL is the part of each ``img.ph_img`` element's
    ``data-src_big`` attribute before the first ``|``. Links collected
    before an element without that attribute is met are still downloaded.
    """
    tut = []
    r = requests.get('https://m.vk.com/uporols_you').text
    soup = BeautifulSoup(r, 'lxml')
    im = soup.find_all('img', class_="ph_img")
    try:
        for a in im:
            tut.append(a.get('data-src_big').split('|')[0])
        print(tut)
    except AttributeError:
        # a.get(...) returned None: the element has no 'data-src_big'.
        # Catch only that case instead of silencing every exception.
        print('no have any links)')
    for num, link in enumerate(tut, start=1):
        p = requests.get(link)
        # The context manager closes the file even if the write fails.
        with open("img%s.jpg" % (num), 'wb') as out:
            out.write(p.content)
def main():
    """Entry point: download the photos."""
    get_photo_from_page()


if __name__ == '__main__':
    # Only run the download when executed as a script.
    main()