How can I increase the speed of this python requests session? - python-3.x

I am using Anaconda - Python 3.5.2
I have a list of 280,000 urls.
I am grabbing the data and trying to keep track of the url-to-data.
I've made about 30K requests. I am averaging 1 request per second.
# Fetch every URL in url_list one after another through a single shared
# requests.Session and accumulate the normalized JSON of each response,
# tagged with the URL it came from, in response_df.
response_df = pd.DataFrame()
# create the session
with requests.Session() as s:
# loop through the list of urls
for url in url_list:
# call the resource
# NOTE(review): no timeout is passed, and each request only starts after the
# previous iteration has finished.
resp = s.get(url)
# check the response
# NOTE(review): the value compared against, and the `else:` that presumably
# guards the print at the end, are missing from this excerpt.
if resp.status_code ==
# create a new dataframe with the response
ftest = json_normalize(resp.json())
ftest['url'] = url
# NOTE(review): append() is called once per URL and its result is re-assigned,
# i.e. a new DataFrame is built on every iteration. Collecting the frames in a
# list and calling pd.concat once after the loop avoids the repeated copying.
response_df = response_df.append(ftest, ignore_index=True)
print("Something went wrong! Hide your wife! Hide the kids!")

I ended up ditching requests, I used async and aiohttp instead. I was pulling about 1 per second with requests. The new method averages about 5 per second, and only utilizes about 20% of my system resources. I ended up using something very similar to this:
import aiohttp
import asyncio
import async_timeout
import os
# Download `url` through the given aiohttp `session` and write the body to a
# local file named after the last path component of the URL.
async def download_coroutine(session, url):
# The whole transfer is wrapped in a 10-unit async_timeout budget
# (presumably seconds - confirm against the async_timeout version in use).
with async_timeout.timeout(10):
async with session.get(url) as response:
filename = os.path.basename(url)
# NOTE(review): open()/write on a regular file are blocking calls inside a
# coroutine.
with open(filename, 'wb') as f_handle:
while True:
# NOTE(review): the awaited read expression, the loop exit and the write of
# each chunk to f_handle are missing from this excerpt.
chunk = await
if not chunk:
return await response.release()
async def main(loop):
urls = ["",
async with aiohttp.ClientSession(loop=loop) as session:
for url in urls:
await download_coroutine(session, url)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
also, this was helpful:


How can I make web resources available offline?

There is a folder on my Linux PC that contains a website (web pages etc.). The web pages and other complementary files in the folder use CDNs to bring in resources like jquery, datatables etc.
I want to make these resources available offline. I know I can manually search all files for occurrences of the "http" keyword, download the files from these URLs, keep them in a folder and change the source file paths accordingly. But as there are too many files, this seems troublesome. Is there a better, more elegant way of doing this? Thanks in advance.
I made a python script to do the job:
import re
import os
import aiohttp
import asyncio
import pathlib
import string
import random
import chardet
# Decode byte sequence using chardet to avoid "Type error"
def decode_bytes(byte_sequence):
    """Decode *byte_sequence* to ``str`` using the encoding guessed by chardet.

    Args:
        byte_sequence: Raw ``bytes``/``bytearray`` to decode. A ``str`` is
            returned unchanged, so callers that already read the file in text
            mode do not crash inside the detector.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    if isinstance(byte_sequence, str):
        return byte_sequence
    # chardet reports ``encoding`` as None when it cannot make a guess (for
    # example on empty input); decode(None) would raise TypeError, so fall
    # back to UTF-8 in that case.
    encoding = chardet.detect(byte_sequence)['encoding'] or 'utf-8'
    return byte_sequence.decode(encoding)
VALID_URL_REGEX = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Downloader. I lazily use resp.status == 200 as the only success criterion, which is logically incomplete; you can add further checks.
# Fetch `url` through `session` and, when the server answers with HTTP 200,
# write the body to `local_path`.
async def download_file(session, url, local_path):
# NOTE(review): ssl=False is passed to the request, which per the aiohttp
# client reference turns off TLS certificate checks - confirm this is intended.
async with session.get(url, allow_redirects=True, ssl=False) as resp:
# Any status other than 200 is skipped without a message in the visible lines.
if resp.status == 200:
print("Content path is "+str(local_path))
with open(local_path, "wb") as f:
while True:
# NOTE(review): the awaited read expression, the loop exit and the f.write
# call are missing from this excerpt.
chunk = await
if not chunk:
# NOTE(review): as shown, encode() sits directly under `if not chunk:`;
# presumably it belonged to a separate "chunk is str" check - confirm.
chunk = chunk.encode("utf-8")
downloaded_urls = set()
# Read one file, find every URL matched by VALID_URL_REGEX, download each one
# into the "downloaded" directory and replace the URL in the file contents
# with the local path before the file is written back.
async def process_file(file_path, session):
print("File during Read "+str(file_path))
# NOTE(review): the file is opened in text mode, so whatever is read here is
# presumably already `str`, yet it is then handed to decode_bytes - confirm
# which mode was intended.
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
# NOTE(review): the read expression is missing from this excerpt.
contents =
contents = decode_bytes(contents)
# NOTE(review): the `try:` matching this `except` is missing from this excerpt.
except UnicodeDecodeError as e:
# To avoid Type error
print(f"Error decoding file {file_path}: {e}")
urls = re.findall(VALID_URL_REGEX, contents)
for url in urls:
# The last path segment of the URL is used as the local file name.
file_name = url.split("/")[-1]
# NOTE(review): the body of this empty-name check is missing from this excerpt.
if len(file_name)==0:
if url in downloaded_urls:
# NOTE(review): downloaded_urls is created with set() at module level, and a
# set cannot be subscripted; this lookup needs a dict mapping url -> path.
local_path = downloaded_urls[url]
# generating random strings to avoid same file name but different urls
# NOTE(review): `res` is not used by any visible line; the path below is
# built from file_name alone.
res = ''.join(random.choices(string.ascii_uppercase +string.digits, k=5))
local_path = os.path.join("downloaded", file_name)
if not os.path.exists(local_path):
await download_file(session, url, local_path)
# To avoid redownloading
# NOTE(review): no visible line records the url in downloaded_urls.
contents = contents.replace(url, local_path)
print("File during write "+str(file_path))
# NOTE(review): the write call inside this `with` is missing from this excerpt.
with open(file_path, "w", encoding="utf-8", errors="ignore") as f:
async def process_directory(directory):
    """Process every ``.md`` / ``.txt`` file below *directory* concurrently.

    Makes sure the ``downloaded`` output directory exists, opens one shared
    aiohttp session and schedules ``process_file`` for each matching regular
    file found by a recursive glob.

    Args:
        directory: Root directory to scan.
    """
    # exist_ok replaces the separate exists() check and avoids the
    # check-then-create race.
    os.makedirs("downloaded", exist_ok=True)
    conn = aiohttp.TCPConnector(limit=2200, limit_per_host=20, ttl_dns_cache=22)
    async with aiohttp.ClientSession(connector=conn) as session:
        # One loop variable throughout: the original bound `filepath` but
        # tested and passed `fp`, which raises NameError.
        tasks = [
            process_file(fp, session)
            for fp in pathlib.Path(directory).glob('**/*')
            if str(fp).endswith((".md", ".txt")) and os.path.isfile(fp)
        ]
        # Await inside the `async with` so the session stays open until every
        # file has been processed.
        await asyncio.gather(*tasks)
if __name__ == '__main__':
directory = input("Enter root directory")
I will also try "substitution" module and update answer accordingly.

Unexpected Multithreading Output when Web Scraping with Selenium (Python)

from selenium import webdriver
from import Options
from import ChromeDriverManager
from import By
from concurrent.futures import ThreadPoolExecutor
import time
# Current time is :48.77885s per Page, 4.4344 per Region
options = Options()
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
def list_to_csv(summoner_info):
summoner_info = set([tuple(summoner) for summoner in summoner_info])
with open('high_elo_summoners.csv', 'w', encoding='utf-8') as f:
for summoner in summoner_info:
# Collect one [field 1, field 2, integer value] row for every 'rt-tr' element
# currently found by the module-level `driver`.
# NOTE(review): this function is submitted to a ThreadPoolExecutor, so every
# worker thread uses the same module-level `driver` object.
def gather_summoner_info(url):
# NOTE(review): no visible line navigates to `url`; presumably a
# driver.get(url) call was lost from this excerpt.
driver.implicitly_wait(5) # Wait until the CSS Selector is available
summoner_info = []
content = driver.find_elements(By.CLASS_NAME, 'rt-tr')
for index, con in enumerate(content):
# The first matched element is skipped (presumably the table header - confirm).
if index != 0:
summoner = con.text.split('\n')
# The third kept value is the first space-separated token of summoner[3],
# with thousands separators removed, converted to int.
summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(' ')[0].replace(',', ''))])
return summoner_info
def get_summoner_data(page_count, regions):
links = [f'{region}&page={page + 1}' for page in range(page_count) for
region in regions]
# Gather all the relevant summoner information on the page
agg_summoner_info = []
with ThreadPoolExecutor(max_workers=20) as executor:
future_results = {url : executor.submit(gather_summoner_info, url) for url in links}
for url, future in future_results.items():
def main():
page_count = 1
regions = ['na1', 'euw1', 'eun1', 'kr', 'br1', 'jp1', 'ru', 'oc1', 'tr1', 'la1', 'la2']
get_summoner_data(page_count, regions)
if __name__ == '__main__':
s = time.perf_counter()
e = time.perf_counter()
print(e - s)
Issue: The code returns the same output for each iteration (the output for the first link in the links list).
The code above pulls some information from each URL in the links variable using Selenium. The issue is that when the threads execute in the get_summoner_data() function, they return the same results every time. I'm not sure where the issue is coming from, as a different link is printed from each gather_summoner_info() call.
Currently it just returns the information from the very first link. I'm not sure what is causing the issue; any help is appreciated.
Try running without --headless option. You will see what's going on.
You created only one web driver instance, and that one instance is used for all the threaded tasks. Multiple threads try to load different URLs on this single driver, and in the end it is very likely that whichever URL was requested last is the page every thread reads.
Simple fix is to create a driver instance for every thread.
You can do this by moving the line creating a web driver into the thread task function gather_summoner_info as below. I tried with this fix and it works correctly.
# Per-call variant: builds its own Chrome driver from the module-level
# `options`, then collects one [field 1, field 2, integer value] row for every
# 'rt-tr' element that driver finds.
def gather_summoner_info(url):
##### moved ######
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
# NOTE(review): no visible line navigates to `url` (presumably driver.get(url)
# was lost from this excerpt), and the driver is never quit in the visible
# lines, so each call would leave a browser process behind - confirm.
driver.implicitly_wait(5) # Wait until the CSS Selector is available
summoner_info = []
content = driver.find_elements(By.CLASS_NAME, "rt-tr")
for index, con in enumerate(content):
# The first matched element is skipped (presumably the table header - confirm).
if index != 0:
summoner = con.text.split("\n")
# The third kept value is the first space-separated token of summoner[3],
# with thousands separators removed, converted to int.
summoner_info.append([summoner[1], summoner[2], int(summoner[3].split(" ")[0].replace(",", ""))])
return summoner_info
Further Consideration
As you know, creating a new web driver instance is resource expensive. If you are just trying to scrape information, HTTP requests are enough most of the times.
For the website you are trying to scrape, I found that the job can be done using HTTP requests only. I revised the script without using Selenium and it takes less than 1 second to load all the leaderboards for all regions.
import json
import time
from concurrent.futures import ThreadPoolExecutor
import requests
def list_to_csv(summoner_info):
    """Write *summoner_info* to ``result.csv``, largest third field first.

    Args:
        summoner_info: Iterable of rows; the third field of each row must be
            convertible with ``int()`` because it is the sort key.

    The file is overwritten on every call. Rows are joined with commas and
    separated by newlines, with no trailing newline.
    """
    ranked = sorted(summoner_info, key=lambda row: int(row[2]), reverse=True)
    with open("result.csv", "w", encoding="utf-8") as f:
        # Convert every field with str(): str.join raises TypeError on
        # non-string items such as an integer score.
        f.write("\n".join(",".join(map(str, row)) for row in ranked))
def gather_summoner_info(region: str):
payload = json.dumps(
"operationName": "getRankedLeaderboard",
"variables": {"page": 1, "queueType": 420, "regionId": region},
"query": "query getRankedLeaderboard($page: Int, $queueType: Int, $regionId: String!) {\n leaderboardPage(page: $page, queueType: $queueType, regionId: $regionId) {\n totalPlayerCount\n topPlayerMostPlayedChamp\n players {\n iconId\n losses\n lp\n overallRanking\n rank\n summonerLevel\n summonerName\n tier\n wins\n __typename\n }\n __typename\n }\n}\n",
headers = {"Content-Type": "application/json"}
response ="", headers=headers, data=payload)
summoner_info = []
data = response.json()
for player in data["data"]["leaderboardPage"]["players"]:
summoner_info.append((player["summonerName"], player["tier"], player["lp"]))
return summoner_info
def get_summoner_data(page_count, regions):
agg_summoner_info = []
with ThreadPoolExecutor(max_workers=20) as executor:
future_results = {r: executor.submit(gather_summoner_info, r) for r in regions}
for _, future in future_results.items():
def main():
page_count = 1
regions = ["na1", "euw1", "eun1", "kr", "br1", "jp1", "ru", "oc1", "tr1", "la1", "la2"]
get_summoner_data(page_count, regions)
if __name__ == "__main__":
s = time.perf_counter()
e = time.perf_counter()
print(e - s)

How to compare between two requests using Thread

I have created a simple thread request code:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
def doScrape(response):
soup = BeautifulSoup(response.text, 'html.parser')
t = soup.find("div", {"class": "codesearch-results"}).find("h3")
return {
'url': response.url,
'repository_results': t.text.strip()
def doRequest(url):
response = requests.get(url)
time.sleep(random.randint(1, 3))
return response
def ourLoop():
with ThreadPoolExecutor(max_workers=2) as executor:
future_tasks = [
) for url in URLS]
for future in as_completed(future_tasks):
response = future.result()
if response.status_code == 200:
result = doScrape(response)
while True:
t = threading.Thread(target=ourLoop, )
print('Joining thread and waiting for it to finish...')
where I first start a thread with a ThreadPoolExecutor that has 2 workers. The idea is that I want to be able to monitor 24/7 and notify myself whenever there has been a change (in this case, if the repository_results differs between the previous request and the latest request) - whenever there is a change, I want to print out a message saying that there is a difference.
I wonder how I can do that using ThreadPoolExecutor, and how I can monitor a specific URL to see whether a change has happened or not?
You can do this by storing the previous results in the list itself, and passing that along with the response to doScrape
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
url_.append({'url': url, 'repository_results': None})
# Parse the response, read the text of the <h3> inside the
# "codesearch-results" div and compare it with the value stored in url_dict.
# Returns a dict with the url, the freshly scraped text and a 'change' flag.
def doScrape(response, url_dict):
# NOTE(review): the result key is spelled 'respository_results', which differs
# from the 'repository_results' key used in url_dict.
result = {'url': url_dict['url'], 'respository_results': None, 'change': False}
soup = BeautifulSoup(response.text, 'html.parser')
# NOTE(review): presumably raises AttributeError when the div is absent,
# because .find("h3") would then be called on None - confirm.
t = soup.find("div", {"class": "codesearch-results"}).find("h3")
current_response = t.text.strip()
# If prev result do not match current result, set key 'change' as True, only exception being if the
# previous result was None, i.e, this is the first time we are running this
if current_response != url_dict['repository_results'] and url_dict['repository_results'] is not None:
result['change'] = True
# NOTE(review): url_dict itself is never updated in this function, so its
# stored value does not change between runs unless a caller writes it.
result['respository_results'] = current_response
return result
def doRequest(url_dict):
response = requests.get(url_dict['url'])
time.sleep(random.randint(1, 3))
return response, url_dict
def ourLoop():
with ThreadPoolExecutor(max_workers=2) as executor:
future_tasks = [
) for url_dict in url_]
for future in as_completed(future_tasks):
response, url_dict = future.result()
if response.status_code == 200:
result = doScrape(response, url_dict)
if result['change']:
print(f'Changed for url : {result["url"]}!')
while True:
t = threading.Thread(target=ourLoop, )
print('Joining thread and waiting for it to finish...')
The only exception where this fails is if the change happened at the very first time you are running the loop, since we would not know the previous value of the scraped element.
Also, if you are planning to run this in a loop and only want to print when there is a change, make sure to update the repository_results key in the url_dict itself (inside doScrape); you can then omit the return result line as well:
import random
import threading
import time
from concurrent.futures import as_completed
from concurrent.futures.thread import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
URLS = [
# Create a list of dictionaries with urls and their previous result as None
url_ = []
for url in URLS:
url_.append({'url': url, 'repository_results': None})
def doScrape(response, url_dict):
    """Scrape the result heading and report when it differs from the last run.

    Args:
        response: HTTP response whose ``.text`` holds the page HTML.
        url_dict: Mutable record with the keys ``'url'`` and
            ``'repository_results'``; the latter is overwritten with the
            freshly scraped text so the next call compares against it.

    Returns:
        None. A message is printed when the scraped text changed.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    t = soup.find("div", {"class": "codesearch-results"}).find("h3")
    current_response = t.text.strip()
    previous_response = url_dict['repository_results']
    # A previous value of None means this is the first run for this url, so
    # there is nothing to compare against yet.
    if previous_response is not None and current_response != previous_response:
        print(f'Changed for url : {url_dict["url"]}')
    # Store under the same key that is read above. The original wrote to the
    # misspelled 'respository_results', leaving 'repository_results' at None
    # forever, so a change could never be reported.
    url_dict['repository_results'] = current_response
def doRequest(url_dict):
response = requests.get(url_dict['url'])
time.sleep(random.randint(1, 3))
return response, url_dict
def ourLoop():
with ThreadPoolExecutor(max_workers=2) as executor:
future_tasks = [
) for url_dict in url_]
for future in as_completed(future_tasks):
response, url_dict = future.result()
if response.status_code == 200:
doScrape(response, url_dict)
while True:
t = threading.Thread(target=ourLoop, )
print('Joining thread and waiting for it to finish...')

How to combine the simultaneous execution of Telegram bot on aiogram, and asynchronous parser?

I want to make a service to monitor prices from a particular site on Python.
When the price changes, there will be a notification to specific users. It is important to have one executable file for the project.
Collection of information to check for changes is asynchronous. This is done to speed up the work.
Telegram bot is developed on Aiogram. The functionality of adding and removing items for monitoring is implemented.
Adding asynchronous function to the body of the bot, which in turn calls another asynchronous function from another file leads to an error:
D:\programs\python\lib\asyncio\ RuntimeWarning: coroutine
'gather_data' was never awaited
handle = None # Needed to break cycles when an exception occurs.
RuntimeWarning: Enable tracemalloc to get the object allocation traceback
Asynchronous function code from the bot file:
async def periodic(sleep_for):
    """Poll for changes forever, sleeping *sleep_for* seconds before each poll.

    After every sleep ``fetch_liquids()`` is called. When it returns something
    truthy, ``base.json`` is loaded and a message is printed for every
    (changed element, user) pair where the element's title is listed in that
    user's preferences.

    NOTE(review): ``fetch_liquids`` is called without ``await``; if it is (or
    wraps) a coroutine function, the result here is a coroutine object that is
    never run - confirm against the parser module.
    """
    while True:
        await asyncio.sleep(sleep_for)
        changed = fetch_liquids()
        if not changed:
            # Nothing new this round; go back to sleep.
            continue
        with open('base.json', 'r', encoding='utf8') as handle:
            users = json.load(handle)
        for item_key in changed:
            for user_key in users:
                wanted = users[user_key]['preferences']
                if changed[item_key].get('title') in wanted:
                    print('there is a message')
if __name__ == '__main__':
loop = asyncio.get_event_loop()
Asynchronous function code from the parser file:
# Fetch the first page, read the number of pages from the last link of the
# pagination block, then create one get_page_data task per page.
# This is a coroutine function: calling it without awaiting it (or passing it
# to an event loop) only creates a coroutine object and runs nothing.
async def gather_data():
async with aiohttp.ClientSession() as session:
response = await session.get('url', headers=headers, cookies=cookies)
soup = BeautifulSoup(await response.text(), 'lxml')
# NOTE(review): the `try:` that pairs with the `except` below is missing from
# this excerpt.
pages = int(soup.find('div', class_='pagination__list').find_all('a')[-1].text)
# Falls back to a single page when the pagination lookup hits an AttributeError.
except AttributeError:
pages = 1
tasks = []
for page in range(1, pages + 1):
# NOTE(review): no visible line appends `task` to `tasks`; as shown, gather()
# below receives an empty list. Presumably a tasks.append(task) was lost.
task = asyncio.create_task(get_page_data(session, page))
await asyncio.gather(*tasks)
def main():
if not os.path.isfile(f'jsons/liquids.json'):
with open('jsons/liquids.json', 'w', encoding='utf8') as file:
json.dump(liquids, file, indent=4, ensure_ascii=False)
with open('jsons/liquids.json', 'r', encoding='utf8') as file:
old_liquids = json.load(file)
return collate_data(old_liquids)

Python-asyncio and subprocess deployment on IIS: returning HTTP response without running another script completely

I'm facing an issue in creating Realtime status update for merging new datasets with old one and machine learning model creation results via Web framework. The tasks are simple in following steps.
An user/ client will send a new datasets in .CSV file to the server,
On server side my windows machine will receive a file then send an acknowledge,
Merge the new dataset with the old one for new machine learning model creation and
Run another python script(that is to create a new sequential deep-learning model). After the successful completion of another python script my code have to return the response to the client!
I have deployed my python-flask application on IIS-10. To run an another python script, this main flask-api script should have to wait for completing that model creation script. On model creation python script it contains several process like loading datasets, tokenizing, oneHot Encoding, padding techniques, model training for 100 epochs and finally prediction results.
My exact goal is that this Flask API should wait until the entire process has completed. I'm sure it will take 8-9 minutes to complete the whole script mentioned above. While testing this code in development mode it works excellently without any issues! But while testing it in production mode on IIS, it does not wait for the whole process and returns the response to the client within 6-7 seconds.
For debugging purposes I included logging to record all events in both the Flask script and the machine learning model creation script! Through that I came to understand that the model creation script only ran 10%! First I tried simple methods with async def and await to run it, but that didn't make any sense! Then I included threading and get_event_loop() and then run_until_complete() to make my parent code wait until the whole process finished. But in the end I'm helpless!! I wasn't able to find a proper solution. Please let me know what I did wrong. Thank you.
Python 3.7.9
Windows server 2019 and
IIS 10.0 Express
My code:
import os
import time
import glob
import subprocess
import pandas as pd
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from datetime import datetime
import logging
import asyncio
from concurrent.futures import ThreadPoolExecutor
ALLOWED_EXTENSIONS = {'csv', 'xlsx'}
_executor = ThreadPoolExecutor(1)
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = "C:\\inetpub\\wwwroot\\iAssist_IT_support\\New_IT_support_datasets"
currentDateTime =
filenames = None
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(asctime)s:%(name)s:%(message)s')
file_handler = logging.FileHandler('model-creation-status.log')
# stream_handler = logging.StreamHandler()
# stream_handler.setFormatter(formatter)
# app.logger.addHandler(stream_handler)
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    A name that contains no dot at all is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
def home():
return jsonify("Hello, This is a file-upload API, To send the file, use")
@app.route('/file_upload/status1', methods=['POST'])
def upload_file():
    """Receive one uploaded spreadsheet and store it in the upload folder.

    Returns:
        A JSON response with a ``message`` field and one of these statuses:
        400 when the request has no ``file`` part, when no file was selected,
        or when the extension is not allowed; 201 after the file has been
        saved under ``app.config['UPLOAD_FOLDER']``.
    """
    app.logger.debug("/file_upload/status1 is execution")
    # check if the post request has the file part
    if 'file' not in request.files:
        app.logger.debug("No file part in the request")
        response = jsonify({'message': 'No file part in the request'})
        response.status_code = 400
        return response
    file = request.files['file']
    # An empty filename is what arrives when the form was submitted without
    # choosing a file.
    if file.filename == '':
        app.logger.debug("No file selected for uploading")
        response = jsonify({'message': 'No file selected for uploading'})
        response.status_code = 400
        return response
    if file and allowed_file(file.filename):
        # Sanitize the client-supplied name before joining it onto a server
        # path. The original line had this statement and the save call fused
        # into one invalid expression.
        filename = secure_filename(file.filename)
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        app.logger.debug("Spreadsheet received successfully")
        response = jsonify({'message': 'Spreadsheet uploaded successfully'})
        response.status_code = 201
        return response
    app.logger.debug("Allowed file types are csv or xlsx")
    response = jsonify({'message': 'Allowed file types are csv or xlsx'})
    response.status_code = 400
    return response
#app.route('/file_upload/status2', methods=['POST'])
def status1():
global filenames
app.logger.debug("file_upload/status2 route is executed")
if request.method == 'POST':
# Get data in json format
if request.get_json():
filenames = request.get_json()
filenames = filenames['data']
# print(filenames)
folderpath = glob.glob('C:\\inetpub\\wwwroot\\iAssist_IT_support\\New_IT_support_datasets\\*.csv')
latest_file = max(folderpath, key=os.path.getctime)
# print(latest_file)
if filenames in latest_file:
df1 = pd.read_csv("C:\\inetpub\\wwwroot\\iAssist_IT_support\\New_IT_support_datasets\\" +
filenames, names=["errors", "solutions"])
df1 = df1.drop(0)
# print(df1.head())
df2 = pd.read_csv("C:\\inetpub\\wwwroot\\iAssist_IT_support\\existing_tickets.csv",
names=["errors", "solutions"])
combined_csv = pd.concat([df2, df1])
index=False, encoding='utf-8-sig')
# return redirect('/file_upload/status2')
return jsonify('New data merged with existing datasets')
#app.route('/file_upload/status3', methods=['POST'])
def status2():
app.logger.debug("file_upload/status3 route is executed")
if request.method == 'POST':
# Get data in json format
if request.get_json():
message = request.get_json()
message = message['data']
return jsonify("New model training is in progress don't upload new file")
#app.route('/file_upload/status4', methods=['POST'])
# Handler for the model-training step: reads the 'data' field of the JSON
# body, defines a blocking model_run() helper and an async wrapper that runs
# it on the module-level _executor, then answers with a success message.
def model_creation():
app.logger.debug("file_upload/status4 route is executed")
if request.method == 'POST':
# Get data in json format
if request.get_json():
message = request.get_json()
message = message['data']
def model_run():
# NOTE(review): the next two lines are a debug call fused with the arguments
# of a lost call (presumably subprocess.run/call - confirm); the script path
# is elided in this excerpt. The command is a single string with shell=True.
app.logger.debug("model script starts to run")"python C:\\.....\\IT_support_chatbot-master\\"
"Python_files\\", shell=True)
# time.sleep(20)
app.logger.debug("script ran successfully")
async def subprocess_call():
# run blocking function in another thread,
# and wait for its result:
app.logger.debug("sub function execution starts")
await loop.run_in_executor(_executor, model_run)
loop = asyncio.get_event_loop()
# NOTE(review): no visible line runs subprocess_call() on `loop` (for example
# loop.run_until_complete(subprocess_call())); as shown, the response below is
# returned without waiting for model_run - presumably that line was lost.
return jsonify("Model created successfully for sent file %s" % filenames)
if __name__ == "__main__":
