I have an asyncio crawler that visits URLs and collects new URLs from HTML responses. I was inspired by this great tool: https://github.com/aio-libs/aiohttp/blob/master/examples/legacy/crawl.py
Here is a very simplified piece of the workflow, showing how it works:
import asyncio
import aiohttp

class Requester:
    def __init__(self):
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            return response, data

    async def run(self, urls):
        async with aiohttp.ClientSession() as client:
            for url in urls:
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())

    def http_crawl(self, _urls_list):
        loop = asyncio.get_event_loop()
        crawl_loop = asyncio.ensure_future(self.run(_urls_list))
        loop.run_until_complete(crawl_loop)

r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
What I need now is to add a very slow BeautifulSoup-based function. I need that function to not block the main loop and to work as a background process. For instance, it will handle the HTTP responses.
I read the Python docs about it and found this: https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor
I tried to add it to my code, but it does not work as it should (I use cpu_bound only for the demo):
import asyncio
import aiohttp
import concurrent.futures

def cpu_bound():
    return sum(i * i for i in range(10 ** 7))

class Requester:
    def __init__(self):
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)

            ####### Blocking operation #######
            loop = asyncio.get_running_loop()
            with concurrent.futures.ProcessPoolExecutor() as pool:
                result = await loop.run_in_executor(pool, cpu_bound)
                print('custom process pool', result)
            #################################

            return response, data

    async def run(self, urls):
        async with aiohttp.ClientSession() as client:
            for url in urls:
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())

    def http_crawl(self, _urls_list):
        loop = asyncio.get_event_loop()
        crawl_loop = asyncio.ensure_future(self.run(_urls_list))
        loop.run_until_complete(crawl_loop)

r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
For now it doesn't work as expected: it blocks the HTTP requests every time:
URL: https://www.google.com have code: 200
custom process pool 333333283333335000000
URL: https://images.google.com have code: 200
custom process pool 333333283333335000000
URL: https://maps.google.com have code: 200
custom process pool 333333283333335000000
URL: https://mail.google.com have code: 200
custom process pool 333333283333335000000
URL: https://news.google.com have code: 200
custom process pool 333333283333335000000
URL: https://video.google.com have code: 200
custom process pool 333333283333335000000
How do I correctly put the task in the background inside the main asyncio process?
Are there best practices for doing that in a simple way, or should I use Redis for task planning?
I believe that since you are setting your BoundedSemaphore to 1, it is only allowing one instance of your task to run at a time.
You can use the ratelimiter package to limit the number of concurrent requests in a certain amount of time.
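For illustration, here is a minimal sketch of that point (my own rework, not the asker's exact code): raising the BoundedSemaphore limit lets several fetches overlap, and one shared ProcessPoolExecutor, created up front instead of a fresh pool inside every fetch, keeps the CPU-bound work off the event loop without serializing the requests. The concurrency value of 5 is an assumption to tune for the target site.

import asyncio
import concurrent.futures
import aiohttp

def cpu_bound():
    return sum(i * i for i in range(10 ** 7))

class Requester:
    def __init__(self, concurrency=5):
        # A limit > 1 allows several fetches to be in flight at once.
        self.sem = asyncio.BoundedSemaphore(concurrency)
        # One shared pool, created once and reused for every CPU-bound call.
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url, client):
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            loop = asyncio.get_running_loop()
            # The heavy work runs in a worker process; the loop stays free.
            result = await loop.run_in_executor(self.pool, cpu_bound)
            print('custom process pool', result)
            return response, data

    async def run(self, urls):
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in urls:
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())
                tasks.append(task)
            # Keep the session open until every task has finished.
            await asyncio.gather(*tasks, return_exceptions=True)

# Driven the same way as before, e.g.: asyncio.run(Requester().run(_url_list))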
I will also post code that works for me. It uses two independent async queues, and one of them spawns the high-CPU-consumption work in a separate process pool:
import asyncio
import functools
import aiohttp
import concurrent.futures

def cpu_bound(num):
    return sum(i * i for i in range(10 ** num))

class Requester:
    def __init__(self):
        self.threads = 3
        self.threads2 = 10
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url):
        try:
            timeout = aiohttp.ClientTimeout(total=10)
            async with self.client.get(url, allow_redirects=False, verify_ssl=False, timeout=timeout) as response:
                data = (await response.read()).decode('utf-8', 'replace')
                print("URL:", url, " have code:", response.status)
                resp_list = {'url': str(response.real_url), 'data': str(data), 'headers': dict(response.headers)}
                return resp_list
        except Exception as err:
            print(err)
            return {}

    async def heavy_worker(self, a):
        while True:
            resp_list = await a.get()
            if resp_list.keys():
                ####### Blocking operation #######
                try:
                    loop = asyncio.get_event_loop()
                    result = await loop.run_in_executor(self.pool, functools.partial(cpu_bound, num=5))
                    print('wappalazer', result)
                except Exception as err:
                    print(err)
                #################################
                a.task_done()
            else:
                a.task_done()

    async def fetch_worker(self, q, a):
        while True:
            url = await q.get()
            resp_list = await self.fetch(url)
            q.task_done()
            await a.put(resp_list)

    async def main(self, urls):
        # Create the queues that we will use to store our "workload".
        q = asyncio.Queue()
        a = asyncio.Queue()

        # Create worker tasks to process the queues concurrently.
        workers_fetch = [asyncio.create_task(self.fetch_worker(q, a)) for _ in range(self.threads)]
        workers_heavy = [asyncio.create_task(self.heavy_worker(a)) for _ in range(self.threads2)]

        for url in urls:
            await q.put(url)

        # Wait for all tasks to be processed.
        await q.join()
        await a.join()

        # Cancel our worker tasks.
        for worker in workers_fetch:
            worker.cancel()
        await asyncio.gather(*workers_fetch, return_exceptions=True)
        for worker in workers_heavy:
            worker.cancel()
        await asyncio.gather(*workers_heavy, return_exceptions=True)

    async def run(self, _urls_list):
        async with aiohttp.ClientSession() as self.client:
            task_for_first_run = asyncio.create_task(self.main(_urls_list))
            await asyncio.sleep(1)
            await task_for_first_run
        print("All tasks completed")

    def http_crawl(self, _urls_list):
        asyncio.run(self.run(_urls_list))

r = Requester()
_url_list = ['http://aaaaaaaaaaaaaaaa.aaaaaaaaaaaaaaaaaaa.aa', 'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
             'https://news.google.com','https://video.google.com','https://books.google.com', 'https://www.google.com',
             'https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com',
             'https://video.google.com','https://books.google.com', 'https://www.google.com','https://images.google.com',
             'https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com',
             'https://books.google.com', 'https://www.google.com','https://images.google.com','https://maps.google.com',
             'https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com',
             'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
             'https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
I have 2 URLs and 60k+ requests. Basically, I need to post every request to both URLs and then compare their responses, but not wait for a response before posting another request.
I've tried to do it with aiohttp and asyncio:
import asyncio
import time
import aiohttp
import os
from aiofile import AIOFile

testURL = ""
prodURL = ""
directoryWithRequests = ''
directoryToWrite = ''
headers = {'content-type': 'application/soap+xml'}
i = 1

async def fetch(session, url, request):
    global i
    async with session.post(url=url, data=request.encode('utf-8'), headers=headers) as response:
        if response.status != 200:
            async with AIOFile(directoryToWrite + str(i) + '.xml', 'w') as afp:
                await afp.write(request)
                i += 1
        return await response.text()

async def fetch_all(session, urls, request):
    results = await asyncio.gather(*[asyncio.create_task(fetch(session, url, request)) for url in urls])
    return results

async def asynchronousRequests(requestBody):
    urls = [testURL, prodURL]
    global i
    with open(requestBody) as my_file:
        body = my_file.read()
    async with aiohttp.ClientSession() as session:
        htmls = await fetch_all(session, urls, body)
        # some conditions

async def asynchronous():
    try:
        start = time.time()
        futures = [asynchronousRequests(directoryWithRequests + i) for i in os.listdir(directoryWithRequests)]
        for future in asyncio.as_completed(futures):
            result = await future
        print("Process took: {:.2f} seconds".format(time.time() - start))
    except Exception as e:
        print(str(e))

if __name__ == '__main__':
    try:
        # AsyncronTest
        ioloop = asyncio.ProactorEventLoop()
        ioloop.run_until_complete(asynchronous())
        ioloop.close()
        if i == 1:
            print('Regress is OK')
        else:
            print('Number of requests to check = {}'.format(i))
    except Exception as e:
        print(e)
I believe that the code above works, but it creates N futures, where N equals the number of request files. This leads to a sort of DDoS, because the server can't respond to that number of requests at the same time.
I found a suitable solution. Basically it's just 2 async tasks:
tasks = [
    postRequest(testURL, client, body),
    postRequest(prodURL, client, body)
]
await asyncio.wait(tasks)
It doesn't give the same performance as the code in the question with an affordable number of requests, but at least it doesn't DDoS the server as much.
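For completeness, here is a sketch of how the two-task idea can be combined with a semaphore so only a bounded number of request files are in flight at once. The postRequest() helper, the semaphore limit of 20, and the comparison logic are my own assumptions, not part of the original answer.

import asyncio
import os
import aiohttp

testURL = ""
prodURL = ""
directoryWithRequests = ''
headers = {'content-type': 'application/soap+xml'}

async def postRequest(url, client, body):
    # Hypothetical helper: post one body to one URL and return the response text.
    async with client.post(url, data=body.encode('utf-8'), headers=headers) as response:
        return await response.text()

async def compareOne(client, sem, path):
    async with sem:  # caps how many files are processed concurrently
        with open(path) as my_file:
            body = my_file.read()
        testResp, prodResp = await asyncio.gather(
            postRequest(testURL, client, body),
            postRequest(prodURL, client, body),
        )
        return testResp == prodResp

async def main():
    sem = asyncio.Semaphore(20)  # assumed limit; tune it for the server
    async with aiohttp.ClientSession() as client:
        tasks = [compareOne(client, sem, directoryWithRequests + name)
                 for name in os.listdir(directoryWithRequests)]
        results = await asyncio.gather(*tasks)
        print('Requests that differ:', sum(1 for same in results if not same))

# asyncio.run(main())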
import random
import asyncio
import json
import aiohttp
import sys
import urllib
from lxml.html.soupparser import parse
from aiohttp import ClientSession
from threading import Thread

def ttest():
    async def fetch(url, session):
        headers = {
            'Host': 'example.com'
        }
        cookies2 = {
            'test': 'test'
        }
        data = '{"test":"test"}'
        async with session.post(url, data=data, headers=headers, cookies=cookies2) as response:
            return await response.read()

    async def bound_fetch(sem, url, session):
        async with sem:
            html = await fetch(url, session)
            print(html)

    async def run(r):
        url = "https://test.com"
        tasks = []
        sem = asyncio.Semaphore(1000)
        async with aiohttp.ClientSession() as session:
            for i in range(r):
                task = asyncio.ensure_future(bound_fetch(sem, url, session))
                tasks.append(task)
            responses = asyncio.gather(*tasks)
            await responses

    number = 1
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run(number))
    loop.run_until_complete(future)

ttest()
This is the error: TypeError: _request() got an unexpected keyword argument 'cookies'
I want to use cookies as you can see in the code, but I can't. Can anyone help me?
The feature was added in aiohttp GitHub master but has not been released yet.
Please either install aiohttp from GitHub or wait a little while for the aiohttp 3.5 release.
I hope to publish it in a few days.
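Until that release is out, one possible workaround (assuming session-wide cookies are acceptable for your case) is to pass the cookies to the ClientSession constructor instead of to the individual post() call; this is a sketch of that idea, not the new per-request API:

import asyncio
import aiohttp

async def fetch(url):
    headers = {'Host': 'example.com'}
    cookies2 = {'test': 'test'}
    data = '{"test":"test"}'
    # Cookies set on the session are sent with every request it makes.
    async with aiohttp.ClientSession(cookies=cookies2) as session:
        async with session.post(url, data=data, headers=headers) as response:
            return await response.read()

# asyncio.get_event_loop().run_until_complete(fetch("https://test.com"))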
I have an issue with asyncio that I can't really get my head around.
Take this working example (Python 3.6+ because of string interpolation):
import asyncio
import aiohttp
import async_timeout
import json

async def fetch(session, url):
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def get_bittrex_marketsummary(currency_pair):
    url = f'https://bittrex.com/api/v1.1/public/getmarketsummary?market={currency_pair}'
    async with aiohttp.ClientSession() as session:
        response = await fetch(session, url)
        return json.loads(response)

class MyCryptoCurrency:
    def __init__(self):
        self.currency = "BTC-ETH"
        self.last_price = None
        asyncio.ensure_future(self.get_last_price())

    async def get_last_price(self):
        self.last_price = await get_bittrex_marketsummary(self.currency)

async def main():
    eth = MyCryptoCurrency()
    print(eth.last_price)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
While this runs and doesn't throw any exceptions, it doesn't get the result from the API request and so ... doesn't work :P
If I try to use e.g. loop.run_until_complete(get_bittrex_marketsummary()), I get an "event loop is already running" error, which kind of makes sense.
Any hints on how to solve this properly?
Thx in advance!
OK, after talking about this in the #python channel on freenode I got the answer "don't do async I/O in __init__", so here is the working version:
import asyncio
import aiohttp
import async_timeout
import json

async def fetch(session, url):
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def get_bittrex_marketsummary(currency_pair):
    url = f'https://bittrex.com/api/v1.1/public/getmarketsummary?market={currency_pair}'
    async with aiohttp.ClientSession() as session:
        response = await fetch(session, url)
        return json.loads(response)

class MyCryptoCurrency:
    def __init__(self):
        self.currency = "BTC-ETH"
        self.last_price = None

    async def get_last_price(self):
        self.last_price = await get_bittrex_marketsummary(self.currency)

async def main():
    eth = MyCryptoCurrency()
    await eth.get_last_price()
    print(eth.last_price)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
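Another common way to express the same idea (a sketch of my own, not part of the original answer) is an async factory classmethod, so an instance is never visible before its async initialization has finished:

class MyCryptoCurrency:
    def __init__(self):
        self.currency = "BTC-ETH"
        self.last_price = None

    @classmethod
    async def create(cls):
        # Build the instance, then await the I/O before handing it out.
        self = cls()
        self.last_price = await get_bittrex_marketsummary(self.currency)
        return self

async def main():
    eth = await MyCryptoCurrency.create()
    print(eth.last_price)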
How can I load a ZIP file with a GET request?
I use asyncio and aiohttp in my Python app.
This is my code:
import asyncio
import aiohttp

async def fetch_page(session, url):
    with aiohttp.Timeout(10):
        async with session.get(url) as response:
            assert response.status == 200
            return await response.read()

loop = asyncio.get_event_loop()
links = ['http://www2.census.gov/census_2010/04-Summary_File_1/Louisiana/la2010.sf1.zip']

for link in links:
    with aiohttp.ClientSession(loop=loop) as session:
        response = loop.run_until_complete(fetch_page(session, url=link))
        print(type(response))
And then I get asyncio.TimeoutError.
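If the archive is simply too large to finish inside the 10-second window, one approach (a sketch assuming a recent aiohttp with ClientTimeout, and an arbitrary 10-minute limit) is to raise the timeout and stream the body to disk instead of reading it all into memory:

import asyncio
import aiohttp

async def fetch_zip(url, dest):
    timeout = aiohttp.ClientTimeout(total=600)  # assumed generous limit for a large file
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as response:
            assert response.status == 200
            with open(dest, 'wb') as f:
                # Stream in 64 KiB chunks so the whole archive never sits in memory.
                async for chunk in response.content.iter_chunked(64 * 1024):
                    f.write(chunk)

# asyncio.get_event_loop().run_until_complete(
#     fetch_zip(links[0], 'la2010.sf1.zip'))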