Process tasks in batches in asyncio - python-3.x

I have a function that generates tasks (IO-bound tasks):
def get_task():
    while True:
        new_task = _get_task()
        if new_task is not None:
            yield new_task
        else:
            sleep(1)
I am trying to write a consumer in asyncio that processes at most 10 tasks at a time; when one task finishes, it should pick up a new one.
I am not sure whether I should use semaphores, or whether there is some kind of asyncio pool executor. I started to write pseudocode with threads:
def run(self):
    while True:
        self.semaphore.acquire()  # first acquire, then get task
        t = get_task()
        self.process_task(t)

def process_task(self, task):
    try:
        self.execute_task(task)
        self.mark_as_done(task)
    except:
        self.mark_as_failed(task)
    self.semaphore.release()
Could anyone help me? I have no clue where to put the async/await keywords.

Simple task cap using asyncio.Semaphore:
async def max10(task_generator):
    semaphore = asyncio.Semaphore(10)

    async def bounded(task):
        async with semaphore:
            return await task

    async for task in task_generator:
        asyncio.ensure_future(bounded(task))
The problem with this solution is that tasks are drawn from the generator greedily. For example, if the generator reads from a large database, the program could run out of memory.
Other than that it's idiomatic and well-behaved.
A solution that uses the async generator protocol to pull new tasks on demand:
async def max10(task_generator):
    tasks = set()
    gen = task_generator.__aiter__()
    try:
        while True:
            while len(tasks) < 10:
                tasks.add(await gen.__anext__())
            _done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    except StopAsyncIteration:
        await asyncio.gather(*tasks)
It may be considered sub-optimal, because it doesn't start executing tasks until 10 are available.
And here's a concise and somewhat magical solution using the worker pattern:
async def max10(task_generator):
    async def worker():
        async for task in task_generator:
            await task

    await asyncio.gather(*[worker() for i in range(10)])
It relies on a somewhat counter-intuitive property of being able to have multiple async iterators over the same async generator, in which case each generated item is seen by only one iterator.
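Here is a tiny self-contained sketch (mine, not from the answer above) of that property: two consumers iterate the same async generator, and every yielded item is delivered to exactly one of them.
import asyncio

async def gen():
    for i in range(6):
        yield i

async def consumer(name, aiter, seen):
    async for item in aiter:
        seen.append((name, item))
        await asyncio.sleep(0)  # yield control so the other consumer gets a turn

async def main():
    g = gen()
    seen = []
    # both consumers iterate the *same* generator object
    await asyncio.gather(consumer("a", g, seen), consumer("b", g, seen))
    print(seen)  # each of 0..5 appears exactly once, split between "a" and "b"

asyncio.run(main())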
My gut tells me that none of these solutions behaves properly on cancellation.

Async isn't threads. If, for example, your tasks are file-IO bound, then write them async using aiofiles:
async with aiofiles.open('filename', mode='r') as f:
    contents = await f.read()
Then replace task with your tasks. If you want to run only 10 at a time, await asyncio.gather every 10 tasks (see the sketch after the example below).
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print(x, "is done")

async def run(loop):
    futs = []
    for x in range(50):
        futs.append(task(x))
    await asyncio.gather(*futs)

loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))
loop.close()
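For completeness, here is a minimal sketch (not part of the original answer) of the "gather every 10 tasks" idea, using the same task() coroutine: each batch of 10 is awaited before the next batch starts.
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print(x, "is done")

async def run_in_batches(n_total=50, batch_size=10):
    for start in range(0, n_total, batch_size):
        batch = [task(x) for x in range(start, min(start + batch_size, n_total))]
        await asyncio.gather(*batch)  # wait for this batch before launching the next

loop = asyncio.get_event_loop()
loop.run_until_complete(run_in_batches())
loop.close()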
If you can't write the tasks async and need threads, here is a basic example using concurrent.futures.ThreadPoolExecutor together with asyncio's run_in_executor. Note that with max_workers=5 only 5 tasks run at a time.
import time
from concurrent.futures import ThreadPoolExecutor
import asyncio

def blocking(x):
    time.sleep(1)
    print(x, "is done")

async def run(loop):
    futs = []
    executor = ThreadPoolExecutor(max_workers=5)
    for x in range(15):
        future = loop.run_in_executor(executor, blocking, x)
        futs.append(future)
    await asyncio.sleep(4)
    res = await asyncio.gather(*futs)

loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))
loop.close()

As pointed out by Dima Tismek, using semaphores to limit concurrency is vulnerable to exhausting task_generator too eagerly, since there is no backpressure between obtaining the tasks and submitting them to the event loop. A better option, also explored by the other answer, is not to spawn a task as soon as the generator has produced an item, but to create a fixed number of workers that exhaust the generator concurrently.
There are two areas where the code could be improved:
there is no need for a semaphore - it is superfluous when the number of tasks is fixed to begin with;
handling cancellation of generated tasks and of the throttling task.
Here is an implementation that tackles both issues:
async def throttle(task_generator, max_tasks):
    it = task_generator.__aiter__()
    cancelled = False

    async def worker():
        async for task in it:
            try:
                await task
            except asyncio.CancelledError:
                # If a generated task is canceled, let its worker
                # proceed with other tasks - except if it's the
                # outer coroutine that is cancelling us.
                if cancelled:
                    raise
            # other exceptions are propagated to the caller

    worker_tasks = [asyncio.create_task(worker())
                    for i in range(max_tasks)]
    try:
        await asyncio.gather(*worker_tasks)
    except:
        # In case of exception in one worker, or in case we're
        # being cancelled, cancel all workers and propagate the
        # exception.
        cancelled = True
        for t in worker_tasks:
            t.cancel()
        raise
A simple test case:
import asyncio
import random

async def mock_task(num):
    print('running', num)
    await asyncio.sleep(random.uniform(1, 5))
    print('done', num)

async def mock_gen():
    tnum = 0
    while True:
        await asyncio.sleep(.1 * random.random())
        print('generating', tnum)
        yield asyncio.create_task(mock_task(tnum))
        tnum += 1

if __name__ == '__main__':
    asyncio.run(throttle(mock_gen(), 3))

Related

coroutine was never awaited, async multithreading

async def existance(s, name):
    async with s.head(f"https://example.com/{name}") as r1:
        if r1.status == 404:
            print('wow i worked')

async def process(names):
    with ThreadPoolExecutor(max_workers=3) as executor:
        async with aiohttp.ClientSession() as s:
            loop = asyncio.get_event_loop()
            tasks = []
            for name in names:
                if len(name) >= 5 and len(name) < 16 and name.isalnum():
                    task = loop.run_in_executor(
                        executor,
                        existance,
                        *(s, name)
                    )
                    tasks.append(task)
            return await asyncio.gather(*tasks)

while True:
    start_time = time.time()
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(process(names))
    loop.run_until_complete(future)
I'm using the code above to try to split my tasks across multiple threads while checking them all asynchronously.
I'm getting this error:
RuntimeWarning: coroutine 'existance' was never awaited
future = asyncio.ensure_future(process(names))
I'm still somewhat of a python beginner and I can't really figure out what I should change here to get the result I want.
Any help is appreciated and I'm sorry if this is a duplicate question.
Since existance is async, you don't need threads at all, so you can implement process like this:
async def process(names):
    async with aiohttp.ClientSession() as s:
        tasks = []
        for name in names:
            if len(name) >= 5 and len(name) < 16 and name.isalnum():
                tasks.append(existance(s, name))
        return await asyncio.gather(*tasks)
If existance is calling some synchronous slow function which might block the event loop (such as BeautifulSoup parsing), you can fix that by running just that function through a thread pool. Introduce run_in_executor locally like this:
# instead of:
# result = slow_calculation(arg1, arg2) # sync/slow code
# use this:
loop = asyncio.get_event_loop()
result = await loop.run_in_executor(None, slow_calculation, arg1, arg2)
But don't attempt to call async functions through run_in_executor, that won't work.
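For illustration, here is a sketch of how existance itself could offload only a slow synchronous step (parse_page is a hypothetical blocking helper, e.g. BeautifulSoup parsing) while the HTTP request stays fully async:
import asyncio

def parse_page(html):
    # hypothetical CPU-bound parsing that would otherwise block the event loop
    return len(html)

async def existance(s, name):
    async with s.get(f"https://example.com/{name}") as r:
        html = await r.text()
    loop = asyncio.get_event_loop()
    # only the blocking helper runs in the thread pool; everything else stays async
    return await loop.run_in_executor(None, parse_page, html)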

How to correctly use async-await with thread pool in Python 3

I want to achieve the same effect as
# Code 1
from multiprocessing.pool import ThreadPool as Pool
from time import sleep, time

def square(a):
    print('start', a)
    sleep(a)
    print('end', a)
    return a * a

def main():
    p = Pool(2)
    queue = list(range(4))
    start = time()
    results = p.map(square, queue)
    print(results)
    print(time() - start)

if __name__ == "__main__":
    main()
with async functions like
# Code 2
from multiprocessing.pool import ThreadPool as Pool
from time import sleep, time
import asyncio

async def square(a):
    print('start', a)
    sleep(a)  # await asyncio.sleep has the same effect
    print('end', a)
    return a * a

async def main():
    p = Pool(2)
    queue = list(range(4))
    start = time()
    results = p.map_async(square, queue)
    results = results.get()
    results = [await result for result in results]
    print(results)
    print(time() - start)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
Currently Code 1 takes 4 seconds and Code 2 takes 6 seconds, which means it is not running in parallel. What is the correct and cleanest way to run multiple async functions in parallel?
It should preferably be Python 3.6 compatible. Thank you!
map_async() is not the same "async" as in async def - if it is fed an async def method, it won't actually run it but will return a coroutine instance immediately (try calling such a method without await). Then you awaited the 4 coroutines one by one, which amounts to sequential execution, hence the 6 seconds.
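A quick illustration of that point (not part of the original code): calling an async def function only creates a coroutine object; nothing runs until it is awaited or driven by the event loop.
import asyncio

async def square(a):
    return a * a

coro = square(3)
print(coro)  # <coroutine object square at 0x...> - not executed yet

loop = asyncio.get_event_loop()
print(loop.run_until_complete(coro))  # 9 - executed only now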
Please see the following example:
from time import time
import asyncio
from asyncio.locks import Semaphore

semaphore = Semaphore(2)

async def square(a):
    async with semaphore:
        print('start', a)
        await asyncio.sleep(a)
        print('end', a)
        return a * a

async def main():
    start = time()
    tasks = []
    for a in range(4):
        tasks.append(asyncio.ensure_future(square(a)))
    await asyncio.wait(tasks)
    print([t.result() for t in tasks])
    print(time() - start)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    loop.close()
The Semaphore acts similarly to the ThreadPool - it allows only 2 coroutines to enter the async with semaphore: block concurrently.

How to iterate over an asynchronous iterator with a timeout?

I think it's easier to understand in terms of code:
try:
    async for item in timeout(something(), timeout=60):
        await do_something_useful(item)
except asyncio.futures.TimeoutError:
    await refresh()
I want the async for to run at most 60 seconds.
I needed to do something like this to create a websocket (also an async iterator) which times out if it doesn't get a message after a certain duration. I settled on the following:
socket_iter = socket.__aiter__()
try:
    while True:
        message = await asyncio.wait_for(
            socket_iter.__anext__(),
            timeout=10
        )
except asyncio.futures.TimeoutError:
    # streaming is completed
    pass
AsyncTimedIterable could be the implementation of timeout() in your code:
class _AsyncTimedIterator:

    __slots__ = ('_iterator', '_timeout', '_sentinel')

    def __init__(self, iterable, timeout, sentinel):
        self._iterator = iterable.__aiter__()
        self._timeout = timeout
        self._sentinel = sentinel

    async def __anext__(self):
        try:
            return await asyncio.wait_for(self._iterator.__anext__(), self._timeout)
        except asyncio.TimeoutError:
            return self._sentinel


class AsyncTimedIterable:

    __slots__ = ('_factory', )

    def __init__(self, iterable, timeout=None, sentinel=None):
        self._factory = lambda: _AsyncTimedIterator(iterable, timeout, sentinel)

    def __aiter__(self):
        return self._factory()
(original answer)
Or use this class to replace your timeout() function:
class AsyncTimedIterable:
    def __init__(self, iterable, timeout=None, sentinel=None):
        class AsyncTimedIterator:
            def __init__(self):
                self._iterator = iterable.__aiter__()

            async def __anext__(self):
                try:
                    return await asyncio.wait_for(self._iterator.__anext__(),
                                                  timeout)
                except asyncio.TimeoutError:
                    return sentinel

        self._factory = AsyncTimedIterator

    def __aiter__(self):
        return self._factory()
A simple approach is to use an asyncio.Queue, and separate the code into two coroutines:
queue = asyncio.Queue()

async for item in something():
    await queue.put(item)
In another coroutine:
while True:
    try:
        item = await asyncio.wait_for(queue.get(), 60)
    except asyncio.TimeoutError:
        pass
    else:
        if item is None:
            break  # use None or whatever suits you to gracefully exit
        await do_something_useful(item)
    refresh()
Please note that the queue will grow if the handler do_something_useful() is slower than something() generates items. You may set a maxsize on the queue to limit the buffer size.
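A short sketch of the bounded variant (reusing something() from the question): with a maxsize, the producer simply suspends on put() whenever the consumer falls behind, so the buffer stays capped (the value 100 is arbitrary).
queue = asyncio.Queue(maxsize=100)

async def producer():
    async for item in something():
        await queue.put(item)  # suspends while the queue already holds 100 items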
The answer to your question can differ based on the nature of the refresh function. If it's a very short-running function, it can be freely called inside a coroutine. But if it's a blocking function (due to network or CPU), it should be run in an executor to avoid freezing the asyncio event loop.
The code below shows an example for the first case; changing it to run refresh in an executor is not hard (see the sketch after it).
The second thing to clarify is the nature of the asynchronous iterator. As far as I understand, you're using it to either get a result from something or None if a timeout occurred.
If I understand the logic correctly, your code can be written more clearly (similar to non-async style, as asyncio is designed to allow) using the async_timeout context manager and without using an asynchronous iterator at all:
import asyncio
from async_timeout import timeout

async def main():
    while True:
        try:
            async with timeout(60):
                res = await something()
                await do_something_useful(res)
        except asyncio.TimeoutError:
            pass
        finally:
            refresh()
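For the second case (a blocking refresh), a possible sketch is to push just that call into the default thread pool executor so it doesn't freeze the event loop (this variant is my adaptation, not part of the answer's code):
import asyncio
from async_timeout import timeout

async def main():
    loop = asyncio.get_event_loop()
    while True:
        try:
            async with timeout(60):
                res = await something()
                await do_something_useful(res)
        except asyncio.TimeoutError:
            pass
        finally:
            await loop.run_in_executor(None, refresh)  # blocking refresh runs in a worker thread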
Your question is missing a couple of details, but assuming something() is an async iterator or generator and you want item to be sentinel every time something has not yielded a value within the timeout, here is an implementation of timeout():
import asyncio
from typing import *

T = TypeVar('T')

# async generator, needs python 3.6
async def timeout(it: AsyncIterator[T], timeo: float, sentinel: T) -> AsyncGenerator[T, None]:
    try:
        nxt = asyncio.ensure_future(it.__anext__())
        while True:
            try:
                yield await asyncio.wait_for(asyncio.shield(nxt), timeo)
                nxt = asyncio.ensure_future(it.__anext__())
            except asyncio.TimeoutError:
                yield sentinel
    except StopAsyncIteration:
        pass
    finally:
        nxt.cancel()  # in case we're getting cancelled ourselves
test:
async def something():
    yield 1
    await asyncio.sleep(1.1)
    yield 2
    await asyncio.sleep(2.1)
    yield 3

async def test():
    expect = [1, None, 2, None, None, 3]
    async for item in timeout(something(), 1, None):
        print("Check", item)
        assert item == expect.pop(0)

asyncio.get_event_loop().run_until_complete(test())
When wait_for() times out it will cancel the task. Therefore, we need to wrap it.__anext__() in a task and then shield it, to be able to resume the iterator.
I want the coroutine to execute refresh at least every 60 seconds.
If you need to execute refresh every 60 seconds regardless of what happens with do_something_useful, you can arrange that with a separate coroutine:
import time

async def my_loop():
    # ensure refresh() is invoked at least once in 60 seconds
    done = False

    async def repeat_refresh():
        last_run = time.time()
        while not done:
            await refresh()
            now = time.time()
            await asyncio.sleep(max(60 - (now - last_run), 0))
            last_run = now

    # start repeat_refresh "in the background"
    refresh_task = asyncio.get_event_loop().create_task(repeat_refresh())

    try:
        async for item in something():
            if item is not None:
                await do_something_useful(item)
            await refresh()
    finally:
        done = True

Task top-up with asyncio

In my project, I have a list of tasks that I execute with:
loop.run_until_complete(tasks)
However, there is an infinite number of tasks, so at the moment, I execute them in batches. Essentially, I have this:
def get_results(tasks):
    return [result for result in loop.run_until_complete(handle_tasks(tasks))]

while True:
    tasks = get_tasks()
    results = get_results(tasks)
I get a number of tasks and launch a regular function that uses a loop to perform these tasks asynchronously and return the results.
This approach works, but I believe it can be improved.
Instead of doing batches of tasks, I would like to do some sort of task top-up.
Something like this:
while True:
    if current_tasks < max_tasks:
        new_tasks = get_tasks(max_tasks - current_tasks)
        add_tasks(new_tasks)
    current_tasks, results = stats_and_results()
I appreciate any ideas on how to approach this problem.
Thanks!
We had a similar problem and ended up writing a small "Pool" wrapper that takes jobs and runs them with a predefined concurrency.
import asyncio

class Pool:
    def __init__(self, concurrency):
        self._sem = asyncio.BoundedSemaphore(concurrency)
        self.jobs = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, *_):
        if len(self.jobs) > 0:
            await asyncio.wait(self.jobs)

    def put(self, coro):
        assert asyncio.iscoroutine(coro)

        async def wrapped():
            async with self._sem:
                await coro

        fut = asyncio.ensure_future(wrapped())
        self.jobs.append(fut)

    def __aiter__(self):
        # __aiter__ must return the async iterator directly, not a coroutine
        return self

    async def __anext__(self):
        try:
            coro = self.jobs.pop(0)
        except IndexError:
            raise StopAsyncIteration()
        else:
            return await coro
You can then use it this way:
async def main():
    pool = Pool(10)
    for task in get_tasks():
        pool.put(task)
    async for result in pool:
        print('got', result)
This will schedule all the tasks, run at most 10 of them concurrently, and return the results to the main() coroutine as they complete.

Asyncio worker that handles N jobs at a time?

I'm trying to make an asyncio worker class that will consume jobs from a job queue and process up to N jobs in parallel. Some jobs may queue additional jobs. When the job queue is empty and the worker finishes all of its current jobs, it should end.
I'm still struggling with asyncio conceptually. Here is one of my attempts, where N=3:
import asyncio, logging, random

async def do_work(id_):
    await asyncio.sleep(random.random())
    return id_

class JobQueue:
    ''' Maintains a list of all pending jobs. '''
    def __init__(self):
        self._queue = asyncio.Queue()
        self._max_id = 10
        for id_ in range(self._max_id):
            self._queue.put_nowait(id_ + 1)

    def add_job(self):
        self._max_id += 1
        self._queue.put_nowait(self._max_id)

    async def get_job(self):
        return await self._queue.get()

    def has_jobs(self):
        return self._queue.qsize() > 0

class JobWorker:
    ''' Processes up to 3 jobs at a time in parallel. '''
    def __init__(self, job_queue):
        self._current_jobs = set()
        self._job_queue = job_queue
        self._semaphore = asyncio.Semaphore(3)

    async def run(self):
        while self._job_queue.has_jobs() or len(self._current_jobs) > 0:
            print('Acquiring semaphore...')
            await self._semaphore.acquire()
            print('Getting a job...')
            job_id = await self._job_queue.get_job()
            print('Scheduling job {}'.format(job_id))
            self._current_jobs.add(job_id)
            task = asyncio.Task(do_work(job_id))
            task.add_done_callback(self.task_finished)

    def task_finished(self, task):
        job_id = task.result()
        print('Finished job {} / released semaphore'.format(job_id))
        self._current_jobs.remove(job_id)
        self._semaphore.release()
        if random.random() < 0.2:
            print('Queuing a new job')
            self._job_queue.add_job()

loop = asyncio.get_event_loop()
jw = JobWorker(JobQueue())
print('Starting event loop')
loop.run_until_complete(jw.run())
print('Event loop ended')
loop.close()
An excerpt of the output:
Starting event loop
Acquiring semaphore...
Getting a job...
Scheduling job 1
Acquiring semaphore...
Getting a job...
Scheduling job 2
Acquiring semaphore...
Getting a job...
Scheduling job 3
Acquiring semaphore...
Finished job 2 / released semaphore
Getting a job...
Scheduling job 4
...snip...
Acquiring semaphore...
Finished job 11 / released semaphore
Getting a job...
Finished job 12 / released semaphore
Finished job 13 / released semaphore
It appears to correctly process all jobs while processing no more than 3 jobs at any one time. However, the program hangs after the last job is finished. As indicated by the output, it appears to be hanging at job_id = await self._job_queue.get_job(). Once the job queue is empty, this coroutine will never resume, and the check to see if the job queue is empty (at the top of the loop) isn't reached again.
I've tried working around this in a number of ways, but conceptually something just doesn't quite fit. My current WIP is passing some futures between the queue and the worker and then using some combination of asyncio.wait(...) on all of them, but it's getting ugly and I'm wondering if there is an elegant solution that I'm overlooking.
You could take advantage of queue.task_done, which indicates that a formerly enqueued task is complete. Then you can combine queue.join and queue.get using asyncio.wait: if queue.join finishes and queue.get doesn't, this means all the jobs have been completed.
See this example:
import asyncio

class Worker:
    def __init__(self, func, n=3):
        self.func = func
        self.queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(n)

    def put(self, *args):
        self.queue.put_nowait(args)

    async def run(self):
        while True:
            args = await self._get()
            if args is None:
                return
            asyncio.ensure_future(self._target(args))

    async def _get(self):
        get_task = asyncio.ensure_future(self.queue.get())
        join_task = asyncio.ensure_future(self.queue.join())
        await asyncio.wait([get_task, join_task], return_when=asyncio.FIRST_COMPLETED)
        if get_task.done():
            return get_task.result()

    async def _target(self, args):
        try:
            async with self.semaphore:
                return await self.func(*args)
        finally:
            self.queue.task_done()
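It could be driven like this (a sketch, reusing do_work from the question; run() returns once every queued job has called task_done()):
async def main():
    worker = Worker(do_work, n=3)
    for job_id in range(1, 11):
        worker.put(job_id)
    await worker.run()

asyncio.get_event_loop().run_until_complete(main())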
You can put a timeout on get_job with a simple asyncio.wait_for, for example 1 second, and go back to the beginning of the loop on timeout.
async def run(self):
    while self._job_queue.has_jobs() or len(self._current_jobs) > 0:
        print('Acquiring semaphore...')
        await self._semaphore.acquire()
        print('Getting a job...')
        try:
            job_id = await asyncio.wait_for(self._job_queue.get_job(), 1)
        except asyncio.TimeoutError:
            self._semaphore.release()  # give back the permit taken above before retrying
            continue
        print('Scheduling job {}'.format(job_id))
        self._current_jobs.add(job_id)
        task = asyncio.Task(do_work(job_id))
        task.add_done_callback(self.task_finished)
