Feed ProcessPoolExecutor with results from asyncio - python-3.x

I have a bunch of online data that I want to download and process efficiently. Downloading already takes some time but cpu-bound processing takes much longer. I struggle to implement a combination of async and ProcessPoolExecutor.
import asyncio
import time
import aiohttp
from aiohttp import ClientSession
from concurrent.futures import ProcessPoolExecutor
class WebData:
    """A remote resource that is downloaded asynchronously and then processed."""

    def __init__(self, url):
        # Address the payload is fetched from.
        self.url = url
        # Raw response body; stays empty until ``download`` succeeds.
        self.binary = b''

    async def download(self, client):
        """Fetch ``self.url`` with *client* and store the body in ``self.binary``.

        Connection errors and timeouts are swallowed on purpose (best effort);
        in that case ``self.binary`` keeps its previous value.
        """
        # Pause without blocking: ``time.sleep`` inside a coroutine would
        # freeze the event loop and stall every other download.
        await asyncio.sleep(0.2)
        try:
            async with client.get(self.url, timeout=5) as resp:
                self.binary = await resp.read()
            print(f'Downloaded {self.url}')
        except (aiohttp.ClientConnectionError, asyncio.TimeoutError):
            pass

    def process(self):
        """Stand-in for the CPU-bound processing step (sleeps one second)."""
        print(f'Start processing {self.url}')
        time.sleep(1)
        print(f'Finished processing {self.url}')
async def main():
    """Download every URL, then hand each object to a process pool.

    Returns the list of futures created by ``executor.submit``. All downloads
    finish (``gather``) before the first ``process`` call is submitted.
    """
    list_urls = [f'https://www.google.com/search?q={i}'
                 for i in range(10)]
    list_obj = [WebData(url) for url in list_urls]
    with ProcessPoolExecutor() as executor:
        async with ClientSession() as session:
            tasks = [obj.download(session) for obj in list_obj]
            await asyncio.gather(*tasks)
        list_futures = [
            executor.submit(obj.process)
            for obj in list_obj]
    return list_futures


# Guard the entry point: worker processes started with the "spawn" method
# re-import this module and must not launch the whole pipeline again.
if __name__ == '__main__':
    res = asyncio.run(main())
This works as expected but it fails to accomplish what I am looking for. It first downloads all data and starts processing it only afterwards, which leaves my cores idle during download. Is there any way I can pipe the downloaded objects to the executor while other objects are still downloading?
I found this thread but it isn't what I need.

You should submit self.process to the pool as soon as the download coroutine for that object finishes. To do that, add a separate asynchronous method that awaits the download method and then submits process to the ProcessPoolExecutor.
class WebData:
    """Downloadable resource that queues its own processing once downloaded."""

    def __init__(self, url):
        """The code has not been changed"""

    async def download(self, client):
        """The code has not been changed"""

    def process(self):
        """The code has not been changed"""

    async def execute(self, session, pool):
        """Download this object, then submit ``process`` to *pool*.

        Returns the future produced by ``pool.submit`` so callers can collect
        it; without the ``return`` every caller would receive ``None``.
        """
        await self.download(session)
        return pool.submit(self.process)
async def main():
    """Run ``execute`` for ten search URLs concurrently; return what ``gather`` yields."""
    list_obj = [
        WebData(f'https://www.google.com/search?q={i}') for i in range(10)
    ]
    with ProcessPoolExecutor() as pool:
        async with ClientSession() as session:
            jobs = [obj.execute(session, pool) for obj in list_obj]
            list_futures = await asyncio.gather(*jobs)
    return list_futures

Related

Running coroutines in different thread with same event loop

I want to run a coroutine in a different thread and get the result that the coroutine returns.
class Main:
    """Holder for an I/O result that a synchronous method tries to obtain."""

    def __init__(self, result_from_io_task=None):
        # Cached result of ``io_task``; a falsy value means "not fetched yet".
        self._io_task_result = result_from_io_task

    async def io_task(self):
        """Sleep for two seconds and return a fixed message."""
        await asyncio.sleep(2)
        return "slept of 2s"

    def non_async_func(self):
        """Synchronous entry point that needs the result of ``io_task``.

        This can't be made async.
        """
        if not self._io_task_result:
            # Run io_task and get its result. The event loop will be running
            # in the main thread, so the task can be fired from here.
            # ``create_task`` needs a coroutine object, hence the call.
            task = asyncio.create_task(self.io_task())
            # The task can't be awaited in a non-async function, and this
            # function can't return until io_task's result is known.
            # NOTE: this busy-wait never yields to the event loop, so the task
            # cannot make progress and the application hangs forever.
            while not task.done():
                pass
I also tried the following, but it doesn't work:
def run_in_thread(coro, loop):
    """Schedule *coro* on *loop* from a helper thread.

    Returns a list holding the ``concurrent.futures.Future`` produced by
    ``asyncio.run_coroutine_threadsafe``. Call ``.result()`` on it only from a
    thread other than the one running *loop*.
    """
    from threading import Thread  # the original snippet never imported it

    output = []

    def run():
        fut = asyncio.run_coroutine_threadsafe(coro, loop)
        output.append(fut)

    thr = Thread(target=run)
    thr.start()
    # Scheduling is non-blocking, so the helper finishes at once; joining
    # guarantees the future is in ``output`` before the list is returned.
    thr.join()
    return output
async def main():
main_obj = Main(result_from_io_task=None)
v = main_obj.non_async_func()
How can I spawn a new thread and run the given coroutine using the event loop running in the main thread?
Unfortunately, my codebase depends on python < 3.8 and asyncio.to_thread is not available in python 3.7
Based on the example of my answer, I'm introducing another implementation of the asynchronous decorator that does not use asyncio.to_thread() function but uses ThreadPoolExecutor instead.
import asyncio
import requests
import concurrent.futures
def asynchronous(func):
    """Turn the blocking callable *func* into a coroutine function.

    Each call runs *func* in the event loop's default thread pool and awaits
    the outcome, so other tasks keep running while *func* blocks.
    """
    import functools

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        call = functools.partial(func, *args, **kwargs)
        # ``future.result()`` would block the event-loop thread; awaiting the
        # executor future suspends only this coroutine.
        return await loop.run_in_executor(None, call)

    return wrapper
@asynchronous
def request(url):
    """GET *url*; return the decoded JSON body, or the raw text if it is not JSON.

    The ``asynchronous`` decorator is required: the caller wraps the call in
    ``asyncio.create_task``, which needs an awaitable.
    """
    with requests.Session() as session:
        response = session.get(url)
        try:
            return response.json()
        except requests.JSONDecodeError:
            return response.text
async def main():
    """Start the request in the background, then wait for and print its result."""
    pending = asyncio.create_task(request("https://google.com/"))
    print("waiting for response...")
    print(await pending)


if __name__ == "__main__":
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
    event_loop.close()

Run slow background blocking task from asyncio loop

I have an asyncio crawler that visits URLs and collects new URLs from the HTML responses. I was inspired by this great tool: https://github.com/aio-libs/aiohttp/blob/master/examples/legacy/crawl.py
Here is a very simplified piece of workflow, how it works:
import asyncio
import aiohttp
class Requester:
    """Minimal crawler that fetches URLs with a bounded number in flight."""

    def __init__(self):
        # Limits how many fetches may run at the same time (here: one).
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        """GET *url* via *client*; return ``(response, decoded_body)``."""
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            return response, data

    async def run(self, urls):
        """Fetch every URL in *urls* and wait until all fetches have finished.

        The first exception raised by a fetch propagates to the caller.
        """
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in urls:
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())
                tasks.append(task)
            # Without this the session is closed while the last fetch is
            # still pending, and that URL is never processed.
            await asyncio.gather(*tasks)

    def http_crawl(self, _urls_list):
        """Blocking entry point: drive ``run`` to completion on the event loop."""
        loop = asyncio.get_event_loop()
        crawl_loop = asyncio.ensure_future(self.run(_urls_list))
        loop.run_until_complete(crawl_loop)
r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
What I need now is to add a very slow BeautifulSoup-based function. That function must not block the main loop; it should run as a background process. For instance, it will handle the HTTP responses.
I read python docs about it and found that: https://docs.python.org/3/library/asyncio-eventloop.html#asyncio.loop.run_in_executor
I tried to add it to my code, but it does not work as should (I use cpu_bound only for demo):
import asyncio
import aiohttp
import concurrent.futures
def cpu_bound():
return sum(i * i for i in range(10 ** 7))
class Requester:
    """Crawler variant that runs a CPU-bound step after each download."""

    def __init__(self):
        # Limits how many fetches may run at the same time (here: one).
        self.sem = asyncio.BoundedSemaphore(1)

    async def fetch(self, url, client):
        """GET *url*, run ``cpu_bound`` in a process pool, return ``(response, body)``."""
        async with client.get(url) as response:
            data = (await response.read()).decode('utf-8', 'replace')
            print("URL:", url, " have code:", response.status)
            ####### Blocking operation #######
            loop = asyncio.get_running_loop()
            with concurrent.futures.ProcessPoolExecutor() as pool:
                result = await loop.run_in_executor(pool, cpu_bound)
                print('custom process pool', result)
            #################################
            return response, data

    async def run(self, urls):
        """Fetch every URL in *urls* and wait until all fetches have finished.

        The first exception raised by a fetch propagates to the caller.
        """
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in urls:
                # The semaphore is released only when the whole fetch task,
                # CPU-bound step included, is done.
                await self.sem.acquire()
                task = asyncio.create_task(self.fetch(url, client))
                task.add_done_callback(lambda t: self.sem.release())
                tasks.append(task)
            # Without this the session is closed while the last fetch is
            # still pending, and that URL is never processed.
            await asyncio.gather(*tasks)

    def http_crawl(self, _urls_list):
        """Blocking entry point: drive ``run`` to completion on the event loop."""
        loop = asyncio.get_event_loop()
        crawl_loop = asyncio.ensure_future(self.run(_urls_list))
        loop.run_until_complete(crawl_loop)
r = Requester()
_url_list = ['https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)
For now, it doesn't work as expected, it blocks HTTP requests every time:
URL: https://www.google.com have code: 200
custom process pool 333333283333335000000
URL: https://images.google.com have code: 200
custom process pool 333333283333335000000
URL: https://maps.google.com have code: 200
custom process pool 333333283333335000000
URL: https://mail.google.com have code: 200
custom process pool 333333283333335000000
URL: https://news.google.com have code: 200
custom process pool 333333283333335000000
URL: https://video.google.com have code: 200
custom process pool 333333283333335000000
How to correctly put the task in the background inside the main asyncio process?
Are there best practices on how to do that in a simple way, or I should use Redis for task planning?
I believe that since you are setting your BoundedSemaphore to 1 it is only allowing one instance of your task to run at a time.
You can use the ratelimiter package to limit the number of concurrent requests in a certain amount of time.
Here is code that works for me. It uses two independent async queues; the workers on one of them run the CPU-heavy job in a separate process pool:
import asyncio
import functools
import aiohttp
import concurrent.futures
def cpu_bound(num):
    """Return the sum of squares of the integers below ``10 ** num``."""
    limit = 10 ** num
    total = 0
    for i in range(limit):
        total += i * i
    return total
class Requester:
    """Crawler with two worker stages: HTTP fetching and CPU-heavy processing.

    Fetch workers take URLs from one queue and push response dicts to a second
    queue; heavy workers take those dicts and run ``cpu_bound`` in a process
    pool via ``run_in_executor``.
    """

    def __init__(self):
        self.threads = 3    # number of fetch worker tasks
        self.threads2 = 10  # number of heavy worker tasks
        self.pool = concurrent.futures.ProcessPoolExecutor()

    async def fetch(self, url):
        """GET *url*; return a dict with url/data/headers, or ``{}`` on any error."""
        try:
            timeout = aiohttp.ClientTimeout(total=10)
            # ``ssl=False`` replaces the deprecated ``verify_ssl=False``.
            async with self.client.get(url, allow_redirects=False, ssl=False, timeout=timeout) as response:
                data = (await response.read()).decode('utf-8', 'replace')
                print("URL:", url, " have code:", response.status)
                resp_list = {'url': str(response.real_url), 'data': str(data), 'headers': dict(response.headers)}
                return resp_list
        except Exception as err:
            print(err)
            return {}

    async def heavy_worker(self, a):
        """Consume items from queue *a*; run the CPU-bound step for non-empty ones."""
        loop = asyncio.get_running_loop()
        while True:
            resp_list = await a.get()
            try:
                if resp_list:
                    ####### Blocking operation #######
                    result = await loop.run_in_executor(self.pool, functools.partial(cpu_bound, num=5))
                    print('wappalazer', result)
                    #################################
            except Exception as err:
                print(err)
            finally:
                # Always acknowledge the item, otherwise ``a.join()`` in
                # ``main`` would never return.
                a.task_done()

    async def fetch_worker(self, q, a):
        """Consume URLs from queue *q*, fetch them, and push the result to queue *a*."""
        while True:
            url = await q.get()
            resp_list = await self.fetch(url)
            # Hand the result over before acknowledging the URL so that *a*
            # already counts the item by the time ``q.join()`` can return.
            await a.put(resp_list)
            q.task_done()

    async def main(self, urls):
        """Process *urls* through both worker stages and stop the workers afterwards."""
        # Create the queues that we will use to store our "workload".
        q = asyncio.Queue()
        a = asyncio.Queue()
        # Create worker tasks to process the queues concurrently.
        workers_fetch = [asyncio.create_task(self.fetch_worker(q, a)) for _ in range(self.threads)]
        workers_heavy = [asyncio.create_task(self.heavy_worker(a)) for _ in range(self.threads2)]
        for url in urls:
            await q.put(url)
        # Wait for all items to be processed.
        await q.join()
        await a.join()
        # Cancel our worker tasks; they loop forever otherwise.
        for worker in workers_fetch:
            worker.cancel()
        await asyncio.gather(*workers_fetch, return_exceptions=True)
        for worker in workers_heavy:
            worker.cancel()
        await asyncio.gather(*workers_heavy, return_exceptions=True)

    async def run(self, _urls_list):
        """Open the shared client session and run the whole pipeline inside it."""
        async with aiohttp.ClientSession() as self.client:
            await self.main(_urls_list)
        print("All tasks completed")

    def http_crawl(self, _urls_list):
        """Blocking entry point: crawl *_urls_list* on a fresh event loop."""
        asyncio.run(self.run(_urls_list))
r = Requester()
_url_list = ['http://aaaaaaaaaaaaaaaa.aaaaaaaaaaaaaaaaaaa.aa', 'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
'https://news.google.com','https://video.google.com','https://books.google.com', 'https://www.google.com',
'https://images.google.com','https://maps.google.com','https://mail.google.com','https://news.google.com',
'https://video.google.com','https://books.google.com', 'https://www.google.com','https://images.google.com',
'https://maps.google.com','https://mail.google.com','https://news.google.com','https://video.google.com',
'https://books.google.com', 'https://www.google.com','https://images.google.com','https://maps.google.com',
'https://mail.google.com','https://news.google.com','https://video.google.com','https://books.google.com',
'https://www.google.com','https://images.google.com','https://maps.google.com','https://mail.google.com',
'https://news.google.com','https://video.google.com','https://books.google.com']
r.http_crawl(_url_list)

Python 3.7 Non-Blocking Request?

I'd like to do a non-blocking http request in Python 3.7. What I'm trying to do is described well in this SO post, but it doesn't yet have an accepted answer.
Here's my code so far:
import asyncio
from aiohttp import ClientSession
[.....]
async def call_endpoint_async(endpoint, data):
async with ClientSession() as session, session.post(url=endpoint, data=data) as result:
response = await result.read()
print(response)
return response
class CreateTestScores(APIView):
permission_classes = (IsAuthenticated,)
def post(self, request):
[.....]
asyncio.run(call_endpoint_async(url, data))
print('cp #1') # <== `async.io` BLOCKS -- PRINT STATEMENT DOESN'T RUN UNTIL `asyncio.run` RETURNS
What is the correct way to do an Ajax-style non-blocking http request in Python?
Asyncio makes it easy to make a non-blocking request if your program runs in asyncio. For example:
async def doit():
task = asyncio.create_task(call_endpoint_async(url, data))
print('cp #1')
await asyncio.sleep(1)
print('is it done?', task.done())
await task
print('now it is done')
But this requires that the "caller" be async as well. In your case you want the whole asyncio event loop to run in the background, so that the caller is not blocked. This can be achieved by running it in a separate thread, e.g.:
pool = concurrent.futures.ThreadPoolExecutor()
# ...
def post(self, request):
fut = pool.submit(asyncio.run, call_endpoint_async(url, data))
print('cp #1')
However, in that case you're not getting anything by using asyncio. Since you're using threads anyway, you could as well call a sync function such as requests.get() to begin with.

RuntimeError when running coroutine from __init__

Here's a sample code.
class Foo:
def __init__(self):
self._run_coro()
def _run_coro(self):
async def init():
bar = #some I/O op
self.bar = bar
loop = asyncio.get_event_loop()
loop.run_until_complete(init())
async def spam(self):
return await #I/O op
async def main():
foo = Foo()
await foo.spam()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
When I run this code, I get following exception:
RuntimeError: This event loop is already running
If I initialize Foo outside main, the code runs without any exception. I want to initialize Foo such that during initialization it runs a coroutine which creates a class attribute bar.
I am unable to figure how to do it correctly. How can I run a coroutine from __init__.
Any help would be highly appreciated.
class Foo:
def __init__(self):
self.session = requests.Session()
self.async_session = None
#I guess this can be done to initialize it.
s = self.init_async_session()
try:
s.send(None)
except StopIteration:
pass
finally:
s.close()
async def init_async_session(self):
#ClientSession should be created inside a coroutine.
self.async_session = aiohttp.ClientSession()
What would be the right way to initialize self.async_session
If a method uses something asynchronous, it should itself be explicitly defined as asynchronous. This is a core idea behind asyncio: it makes you write code in such a way that you always know whether an arbitrary method may do something asynchronous.
In your snippet you want to do async thing (bar I/O) inside sync method __init__ and asyncio prohibits it. You should make _run_coro async and initialize Foo asynchronously, for example, using __await__ method:
import asyncio
class Foo:
def __await__(self):
return self._run_coro().__await__()
async def _run_coro(self): # real async initializer
async def init():
await asyncio.sleep(1) # bar I/O
self.bar = 123
await init()
return self
async def spam(self):
return await asyncio.sleep(1) # I/O op
async def main():
foo = await Foo()
await foo.spam()
asyncio.run(main()) # instead of two lines in Python 3.7+
You may be interested in reading this answer to understand better how asyncio works and how to handle it.
Upd:
s = self.init_async_session()
try:
s.send(None)
Don't do such things: the generator methods are only an implementation detail of coroutines. You can't predict how a coroutine will react to a .send() call, and you can't rely on this behavior.
If you want to execute coroutine use await, if you want to start it "in background" use task or other functions from asyncio doc.
What would be the right way to initialize self.async_session
When it comes to aiohttp.ClientSession it should not only be created, but properly closed also. Best way to do it is to use async context manager as shown in aiohttp doc.
If you want to hide this operation inside Foo you can make it async manager either. Complete example:
import asyncio

import aiohttp


class Foo:
    """Async context manager that owns an ``aiohttp.ClientSession``."""

    async def __aenter__(self):
        self._session = aiohttp.ClientSession()
        await self._session.__aenter__()
        return self

    async def __aexit__(self, *args):
        await self._session.__aexit__(*args)

    async def spam(self):
        """Fetch a test URL and print the response body."""
        url = 'http://httpbin.org/delay/1'
        # ``async with`` releases the connection even if reading the body
        # raises.
        async with self._session.get(url) as resp:
            text = await resp.text()
        print(text)


async def main():
    """Open a ``Foo``, make one request, and close it again."""
    async with Foo() as foo:
        await foo.spam()


asyncio.run(main())
Upd2:
You can combine the ways of initializing and closing the object shown above to achieve the result you like. As long as you keep in mind that both operations are asynchronous and thus should be awaited, everything should be fine.
One more possible way:
import asyncio
import aiohttp
class Foo:
    """Awaitable object: ``await Foo()`` opens its session, ``close()`` releases it."""

    def __await__(self):
        return self._init().__await__()

    async def _init(self):
        """Real async initializer: create and enter the client session."""
        self._session = aiohttp.ClientSession()
        await self._session.__aenter__()
        return self

    async def close(self):
        """Exit the client session opened by ``_init``."""
        await self._session.__aexit__(None, None, None)

    async def spam(self):
        """Fetch a test URL and print the response body."""
        url = 'http://httpbin.org/delay/1'
        # ``async with`` releases the connection even if reading the body
        # raises.
        async with self._session.get(url) as resp:
            text = await resp.text()
        print(text)


async def main():
    """Create a ``Foo``, make one request, and always close it afterwards."""
    foo = await Foo()
    try:
        await foo.spam()
    finally:
        await foo.close()


asyncio.run(main())
Here's my solution.
class Session:
    """HTTP helper offering both a blocking and an asyncio request method."""

    def __init__(self, headers):
        self._headers = headers
        self._session = requests.Session()
        # Created lazily: ClientSession should be created inside a coroutine.
        self._async_session = None

    async def _init(self):
        """Create the aiohttp session using the headers given to ``__init__``."""
        self._async_session = aiohttp.ClientSession(headers=self._headers)

    async def async_request(self, url):
        """GET *url* asynchronously and return the body text.

        The aiohttp session is created on first use. ``aiohttp.ClientError``
        (including the one raised by ``raise_for_status``) propagates to the
        caller; add a retry policy here if one is needed.
        """
        if self._async_session is None:
            await self._init()
        async with self._async_session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text()

    def request(self, url):
        """GET *url* with the blocking ``requests`` session and return the body text."""
        return self._session.get(url).text

    async def close(self):
        """Close the aiohttp session if one was created."""
        if self._async_session is not None:
            await self._async_session.close()
            # Allow a later ``async_request`` to create a fresh session.
            self._async_session = None
async def main():
    """Make one asynchronous request and always close the session afterwards."""
    session = Session({})
    try:
        print(await session.async_request('https://httpstat.us/200'))
    finally:
        await session.close()


asyncio.run(main())
I can initialize the Session class and make synchronous as well as asynchronous requests. I do not have to call await session._init() explicitly to initialize self._async_session: when session.async_request is called while self._async_session is None, the session is initialized via await session._init() before the request is made.

Awaiting a coroutine inside of a Class while event_loop is already running

I have an issue with asyncio that I can't really get my head around.
take this working example (with Python 3.6+ because of string interpolation)
import asyncio
import aiohttp
import async_timeout
import json
async def fetch(session, url):
async with async_timeout.timeout(10):
async with session.get(url) as response:
return await response.text()
async def get_bittrex_marketsummary(currency_pair):
url = f'https://bittrex.com/api/v1.1/public/getmarketsummary?market={currency_pair}'
async with aiohttp.ClientSession() as session:
response = await fetch(session, url)
return json.loads(response)
class MyCryptoCurrency:
def __init__(self):
self.currency = "BTC-ETH"
self.last_price = None
asyncio.ensure_future(self.get_last_price())
async def get_last_price(self):
self.last_price = await get_bittrex_marketsummary(self.currency)
async def main():
eth = MyCryptoCurrency()
print(eth.last_price)
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
while this runs and doesn't throw any Exceptions, it doesn't get the result from the api request and so ... doesn't work :P
If I try to use f.e. loop.run_until_complete(get_bittrex_marketsummary()) I get "event loop is already running" error - which kind of makes sense.
Any hints how to solve this properly?
Thx in advance!
ok, after talking about this in #python channel on freenode I got the answer "don't do async I/O in __init__", so here is the working version:
import asyncio
import aiohttp
import async_timeout
import json
async def fetch(session, url):
async with async_timeout.timeout(10):
async with session.get(url) as response:
return await response.text()
async def get_bittrex_marketsummary(currency_pair):
url = f'https://bittrex.com/api/v1.1/public/getmarketsummary?market={currency_pair}'
async with aiohttp.ClientSession() as session:
response = await fetch(session, url)
return json.loads(response)
class MyCryptoCurrency:
def __init__(self):
self.currency = "BTC-ETH"
self.last_price = None
async def get_last_price(self):
self.last_price = await get_bittrex_marketsummary(self.currency)
async def main():
eth = MyCryptoCurrency()
await eth.get_last_price()
print(eth.last_price)
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

Resources