Task top-up with asyncio - python-3.x

In my project, I have a list of tasks that I execute with:
loop.run_until_complete(tasks)
However, there is an infinite number of tasks, so at the moment, I execute them in batches. Essentially, I have this:
def get_results(tasks):
    return [result for result in loop.run_until_complete(handle_tasks(tasks))]

while True:
    tasks = get_tasks()
    results = get_results(tasks)
I get a number of tasks, then launch a regular function that uses a loop to perform these tasks asynchronously and returns the results.
This approach works, but I believe it can be improved.
Instead of doing batches of tasks, I would like to do some sort of task top-up.
Something like this:
while True:
    if current_tasks < max_tasks:
        new_tasks = get_tasks(max_tasks - current_tasks)
        add_tasks(new_tasks)
    current_tasks, results = stats_and_results()
I appreciate any ideas on how to approach this problem.
Thanks!

We had a similar problem and ended up writing a small "Pool" wrapper that takes jobs and runs them with a predefined concurrency.
import asyncio

class Pool:
    def __init__(self, concurrency):
        self._sem = asyncio.BoundedSemaphore(concurrency)
        self.jobs = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, *_):
        # wait for any jobs that were never iterated over
        if len(self.jobs) > 0:
            await asyncio.wait(self.jobs)

    def put(self, coro):
        assert asyncio.iscoroutine(coro)

        async def wrapped():
            async with self._sem:
                return await coro

        fut = asyncio.ensure_future(wrapped())
        self.jobs.append(fut)

    def __aiter__(self):
        # note: __aiter__ must be a plain method returning self;
        # "async def __aiter__" is no longer supported on modern Python
        return self

    async def __anext__(self):
        try:
            coro = self.jobs.pop(0)
        except IndexError:
            raise StopAsyncIteration()
        else:
            return await coro
You can then use it this way:
async def main():
    pool = Pool(10)
    for task in get_tasks():
        pool.put(task)
    async for result in pool:
        print('got', result)
This will schedule all the tasks, run at most 10 of them concurrently, and return the results to the main() coroutine as they complete.
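For completeness, here is a minimal end-to-end sketch of driving this Pool; fetch() and get_tasks() are hypothetical stand-ins for the asker's task source, and asyncio.run() is the modern entry point:

import asyncio

async def fetch(i):
    # placeholder for a real I/O-bound job
    await asyncio.sleep(0.1)
    return i * 2

def get_tasks():
    # hypothetical stand-in for the question's task source
    return [fetch(i) for i in range(25)]

async def main():
    pool = Pool(10)
    for coro in get_tasks():
        pool.put(coro)
    async for result in pool:
        print('got', result)

asyncio.run(main())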

Related

How to pass async function to run_in_executor

I am trying to pass the some_async_func function to run_in_executor.
In the following example, the program executes with the following warning:
coroutine 'some_async_func' was never awaited
Would anyone of you know if it's possible to pass an async function to run_in_executor?
I am trying to write a program which will run X async coroutines across Y threads.
import asyncio
import random
from threading import Thread, get_ident

async def some_async_func(num):
    #
    # Many other async functions are called here.
    # I simplified this function to just execute 'sleep'.
    #
    ident = get_ident()
    print(f"produce: {num} in thread {ident}", flush=True)
    await asyncio.sleep(random.uniform(0, 0.5))
    return {ident: num}

async def large_func():
    loop = asyncio.get_running_loop()
    # the problematic call: run_in_executor expects a plain callable,
    # so the coroutine function is called but never awaited
    result = await loop.run_in_executor(None, some_async_func, 1)

def side_thread():
    loop = asyncio.new_event_loop()
    loop.run_until_complete(large_func())
    loop.close()

t1 = Thread(target=side_thread, args=(), daemon=True)
t1.start()
t1.join()
Finally, I implemented a sync wrapper for the async function and passed it as follows:
def async_func_wrapper():
    loop = asyncio.new_event_loop()
    results = loop.run_until_complete(some_async_func(10))
    loop.close()
    return results

[...]

with concurrent.futures.ProcessPoolExecutor() as pool:
    result = await loop.run_in_executor(pool, async_func_wrapper)
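For reference, a runnable sketch of the same wrapper idea; my assumptions here are a ThreadPoolExecutor (rather than the process pool, since thread identity is what the example prints) and asyncio.run() to drive the coroutine inside each worker:

import asyncio
import concurrent.futures
import random
from threading import get_ident

async def some_async_func(num):
    await asyncio.sleep(random.uniform(0, 0.5))
    return {get_ident(): num}

def async_func_wrapper(num):
    # each executor thread drives its own short-lived event loop
    return asyncio.run(some_async_func(num))

async def main():
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as pool:
        futs = [loop.run_in_executor(pool, async_func_wrapper, n) for n in range(9)]
        print(await asyncio.gather(*futs))

asyncio.run(main())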

Process tasks in batches in asyncio

I have got a function that generates tasks (I/O-bound tasks):
def get_task():
    while True:
        new_task = _get_task()
        if new_task is not None:
            yield new_task
        else:
            sleep(1)
And I am trying to write a consumer in asyncio that will process at most 10 tasks at a time; when one task is finished, it will take a new one.
I am not sure if I should use semaphores, or whether there is some kind of asyncio pool executor. I started to write pseudocode with threads:
def run(self):
    while True:
        self.semaphore.acquire()  # first acquire, then get task
        t = get_task()
        self.process_task(t)

def process_task(self, task):
    try:
        self.execute_task(task)
        self.mark_as_done(task)
    except:
        self.mark_as_failed(task)
    self.semaphore.release()
Could anyone help me? I have no clue where to put the async/await keywords.
Simple task cap using asyncio.Semaphore:
async def max10(task_generator):
    semaphore = asyncio.Semaphore(10)

    async def bounded(task):
        async with semaphore:
            return await task

    async for task in task_generator:
        asyncio.ensure_future(bounded(task))
The problem with this solution is that tasks are drawn from the generator greedily. For example, if the generator reads from a large database, the program could run out of memory.
Other than that it's idiomatic and well-behaved.
A solution, that uses async generator protocol to pull new tasks on demand:
async def max10(task_generator):
    tasks = set()
    gen = task_generator.__aiter__()
    try:
        while True:
            while len(tasks) < 10:
                tasks.add(await gen.__anext__())
            _done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    except StopAsyncIteration:
        await asyncio.gather(*tasks)
It may be considered sub-optimal, because it doesn't start executing tasks until 10 are available.
And here's a concise and somewhat magical solution using the worker pattern:
async def max10(task_generator):
    async def worker():
        async for task in task_generator:
            await task

    await asyncio.gather(*[worker() for i in range(10)])
It relies on a somewhat counter-intuitive property of being able to have multiple async iterators over the same async generator, in which case each generated item is seen by only one iterator.
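A tiny demonstration of that property (my own sketch): two workers iterate the same async generator object, and each yielded item goes to exactly one of them:

import asyncio

async def gen():
    for i in range(6):
        yield i

async def demo():
    g = gen()

    async def worker(name):
        async for item in g:  # both workers share the same generator object
            print(name, 'got', item)
            await asyncio.sleep(0)  # yield control so the workers interleave

    await asyncio.gather(worker('a'), worker('b'))

asyncio.run(demo())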
My gut tells me that none of these solutions behaves properly on cancellation.
Async isn't threads. If, for example, you have tasks that are file-I/O-bound, then write them async using aiofiles:
async with aiofiles.open('filename', mode='r') as f:
    contents = await f.read()
Then replace task with your tasks. If you only want to run 10 at a time, await asyncio.gather every 10 tasks.
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print(x, "is done")

async def run(loop):
    futs = []
    for x in range(50):
        futs.append(task(x))
    await asyncio.gather(*futs)

loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))
loop.close()
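Note that the snippet above gathers all 50 coroutines at once. A minimal sketch of the "gather every 10 tasks" batching mentioned above could look like this:

import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print(x, "is done")

async def run_batched():
    batch = []
    for x in range(50):
        batch.append(task(x))
        if len(batch) == 10:
            await asyncio.gather(*batch)  # finish the whole batch before continuing
            batch = []
    if batch:
        await asyncio.gather(*batch)  # flush any remainder

asyncio.run(run_batched())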
If you can't write the tasks async and need threads, this is a basic example using concurrent.futures' ThreadPoolExecutor with asyncio. Note that with max_workers=5, only 5 tasks are run at a time.
import time
from concurrent.futures import ThreadPoolExecutor
import asyncio

def blocking(x):
    time.sleep(1)
    print(x, "is done")

async def run(loop):
    futs = []
    executor = ThreadPoolExecutor(max_workers=5)
    for x in range(15):
        future = loop.run_in_executor(executor, blocking, x)
        futs.append(future)
    await asyncio.sleep(4)
    res = await asyncio.gather(*futs)

loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))
loop.close()
As pointed out by Dima Tismek, using semaphores to limit concurrency is vulnerable to exhausting task_generator too eagerly, since there is no backpressure between obtaining the tasks and submitting them to the event loop. A better option, also explored by the other answer, is not to spawn a task as soon as the generator has produced an item, but to create a fixed number of workers that exhaust the generator concurrently.
There are two areas where the code could be improved:
- there is no need for a semaphore - it is superfluous when the number of workers is fixed to begin with;
- handling cancellation of generated tasks and of the throttling task.
Here is an implementation that tackles both issues:
async def throttle(task_generator, max_tasks):
    it = task_generator.__aiter__()
    cancelled = False

    async def worker():
        async for task in it:
            try:
                await task
            except asyncio.CancelledError:
                # If a generated task is canceled, let its worker
                # proceed with other tasks - except if it's the
                # outer coroutine that is cancelling us.
                if cancelled:
                    raise
            # other exceptions are propagated to the caller

    worker_tasks = [asyncio.create_task(worker())
                    for i in range(max_tasks)]
    try:
        await asyncio.gather(*worker_tasks)
    except:
        # In case of exception in one worker, or in case we're
        # being cancelled, cancel all workers and propagate the
        # exception.
        cancelled = True
        for t in worker_tasks:
            t.cancel()
        raise
A simple test case:
import asyncio
import random

async def mock_task(num):
    print('running', num)
    await asyncio.sleep(random.uniform(1, 5))
    print('done', num)

async def mock_gen():
    tnum = 0
    while True:
        await asyncio.sleep(.1 * random.random())
        print('generating', tnum)
        yield asyncio.create_task(mock_task(tnum))
        tnum += 1

if __name__ == '__main__':
    asyncio.run(throttle(mock_gen(), 3))

Appending to merged async generators in Python

I'm trying to merge a bunch of asynchronous generators in Python 3.7 while still adding new async generators on iteration. I'm currently using aiostream to merge my generators:
from asyncio import sleep, run
from aiostream.stream import merge

async def go():
    yield 0
    await sleep(1)
    yield 50
    await sleep(1)
    yield 100

async def main():
    tasks = merge(go(), go(), go())
    async for v in tasks:
        print(v)

if __name__ == '__main__':
    run(main())
However, I need to be able to continue to add to the running tasks once the loop has begun. Something like this:
from asyncio import sleep, run
from aiostream.stream import merge

async def go():
    yield 0
    await sleep(1)
    yield 50
    await sleep(1)
    yield 100

async def main():
    tasks = merge(go(), go(), go())
    async for v in tasks:
        if v == 50:
            tasks.merge(go())
        print(v)

if __name__ == '__main__':
    run(main())
The closest I've gotten to this is using the aiostream library, but maybe this can also be written fairly neatly with just the native asyncio standard library.
Here is an implementation that should work efficiently even with a large number of async iterators:
import asyncio

class merge:
    def __init__(self, *iterables):
        self._iterables = list(iterables)
        self._wakeup = asyncio.Event()

    def _add_iters(self, next_futs, on_done):
        for it in self._iterables:
            it = it.__aiter__()
            nfut = asyncio.ensure_future(it.__anext__())
            nfut.add_done_callback(on_done)
            next_futs[nfut] = it
        del self._iterables[:]
        return next_futs

    async def __aiter__(self):
        done = {}
        next_futs = {}

        def on_done(nfut):
            done[nfut] = next_futs.pop(nfut)
            self._wakeup.set()

        self._add_iters(next_futs, on_done)
        try:
            while next_futs:
                await self._wakeup.wait()
                self._wakeup.clear()
                for nfut, it in done.items():
                    try:
                        ret = nfut.result()
                    except StopAsyncIteration:
                        continue
                    self._iterables.append(it)
                    yield ret
                done.clear()
                if self._iterables:
                    self._add_iters(next_futs, on_done)
        finally:
            # if the generator exits with an exception, or if the caller stops
            # iterating, make sure our callbacks are removed
            for nfut in next_futs:
                nfut.remove_done_callback(on_done)

    def append_iter(self, new_iter):
        self._iterables.append(new_iter)
        self._wakeup.set()
The only change required for your sample code is that the method is named append_iter, not merge.
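For example, adapting the question's second snippet to this class (with a hypothetical one-shot flag added so the example terminates; without it, every appended go() would itself yield 50 and trigger yet another append):

from asyncio import sleep, run

async def go():
    yield 0
    await sleep(1)
    yield 50
    await sleep(1)
    yield 100

async def main():
    tasks = merge(go(), go(), go())
    added = False
    async for v in tasks:
        if v == 50 and not added:
            tasks.append_iter(go())
            added = True
        print(v)

run(main())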
This can be done using stream.flatten with an asyncio queue to store the new generators.
import asyncio
from aiostream import stream, pipe

async def main():
    queue = asyncio.Queue()
    await queue.put(go())  # go() as defined in the question
    await queue.put(go())
    await queue.put(go())

    xs = stream.call(queue.get)
    ys = stream.cycle(xs)
    zs = stream.flatten(ys, task_limit=5)

    async with zs.stream() as streamer:
        async for item in streamer:
            if item == 50:
                await queue.put(go())
            print(item)
Notice that you may tune the number of tasks that can run at the same time using the task_limit argument. Also note that zs can be elegantly defined using the pipe syntax:
zs = stream.call(queue.get) | pipe.cycle() | pipe.flatten(task_limit=5)
Disclaimer: I am the project maintainer.

How to iterate over an asynchronous iterator with a timeout?

I think it's easier to understand in terms of code:
try:
async for item in timeout(something(), timeout=60):
await do_something_useful(item)
except asyncio.futures.TimeoutError:
await refresh()
I want the async for to run at most 60 seconds.
I needed to do something like this to create a websocket (also an async iterator) which times out if it doesn't get a message after a certain duration. I settled on the following:
socket_iter = socket.__aiter__()
try:
    while True:
        message = await asyncio.wait_for(
            socket_iter.__anext__(),
            timeout=10
        )
except asyncio.futures.TimeoutError:
    # streaming is completed
    pass
AsyncTimedIterable could be the implementation of timeout() in your code:
class _AsyncTimedIterator:
    __slots__ = ('_iterator', '_timeout', '_sentinel')

    def __init__(self, iterable, timeout, sentinel):
        self._iterator = iterable.__aiter__()
        self._timeout = timeout
        self._sentinel = sentinel

    async def __anext__(self):
        try:
            return await asyncio.wait_for(self._iterator.__anext__(), self._timeout)
        except asyncio.TimeoutError:
            return self._sentinel

class AsyncTimedIterable:
    __slots__ = ('_factory', )

    def __init__(self, iterable, timeout=None, sentinel=None):
        self._factory = lambda: _AsyncTimedIterator(iterable, timeout, sentinel)

    def __aiter__(self):
        return self._factory()
(original answer)
Or use this class to replace your timeout() function:
class AsyncTimedIterable:
    def __init__(self, iterable, timeout=None, sentinel=None):
        class AsyncTimedIterator:
            def __init__(self):
                self._iterator = iterable.__aiter__()

            async def __anext__(self):
                try:
                    return await asyncio.wait_for(self._iterator.__anext__(),
                                                  timeout)
                except asyncio.TimeoutError:
                    return sentinel

        self._factory = AsyncTimedIterator

    def __aiter__(self):
        return self._factory()
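A runnable usage sketch of the class above (the generator here is my own mock, and asyncio must be imported alongside the class). One caveat, spelled out in a later answer below: when wait_for() times out it cancels the pending __anext__(), which terminates a plain async generator, so the second item is never delivered and the loop ends after the sentinel:

import asyncio

async def something():
    yield 'fast'
    await asyncio.sleep(2)  # longer than the timeout below
    yield 'slow'            # never delivered: the timeout cancels the generator

async def consume():
    # sentinel=None signals "no item within the timeout"
    async for item in AsyncTimedIterable(something(), timeout=1, sentinel=None):
        if item is None:
            print('timed out, refreshing')  # the question would call refresh() here
        else:
            print('got', item)

asyncio.run(consume())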
A simple approach is to use an asyncio.Queue, and separate the code into two coroutines:
queue = asyncio.Queue()

async for item in something():
    await queue.put(item)
In another coroutine:
while True:
    try:
        item = await asyncio.wait_for(queue.get(), 60)
    except asyncio.TimeoutError:
        pass
    else:
        if item is None:
            break  # use None or whatever suits you to gracefully exit
        await do_something_useful(item)
    refresh()
Please note, the queue will grow if the handler do_something_useful() is slower than something() generates items. You may set a maxsize on the queue to limit the buffer size.
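For instance (the value is arbitrary):

queue = asyncio.Queue(maxsize=100)  # put() now blocks once 100 items are buffered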
The answer to your question can differ based on the nature of the refresh function. If it's a very short-running function, it can be freely called inside a coroutine. But if it's a blocking function (due to network or CPU), it should be run in an executor to avoid freezing the asyncio event loop.
The code below shows an example for the first case; changing it to run refresh in an executor is not hard.
The second thing to be clarified is the nature of the asynchronous iterator. As far as I understand, you're using it to either get a result from something or None if a timeout occurred.
If I understand the logic correctly, your code can be written more clearly (in the near-synchronous style that asyncio is designed to allow) using the async_timeout context manager and without an asynchronous iterator at all:
import asyncio
from async_timeout import timeout

async def main():
    while True:
        try:
            async with timeout(60):
                res = await something()
                await do_something_useful(res)  # "res", not "item", is what was awaited above
        except asyncio.TimeoutError:
            pass
        finally:
            refresh()
Your question is missing a couple of details, but assuming something() is an async iterator or generator and you want item to be the sentinel every time something() has not yielded a value within the timeout, here is an implementation of timeout():
import asyncio
from typing import *

T = TypeVar('T')

# async generator, needs python 3.6
async def timeout(it: AsyncIterator[T], timeo: float, sentinel: T) -> AsyncGenerator[T, None]:
    try:
        nxt = asyncio.ensure_future(it.__anext__())
        while True:
            try:
                yield await asyncio.wait_for(asyncio.shield(nxt), timeo)
                nxt = asyncio.ensure_future(it.__anext__())
            except asyncio.TimeoutError:
                yield sentinel
    except StopAsyncIteration:
        pass
    finally:
        nxt.cancel()  # in case we're getting cancelled ourselves
test:
async def something():
    yield 1
    await asyncio.sleep(1.1)
    yield 2
    await asyncio.sleep(2.1)
    yield 3

async def test():
    expect = [1, None, 2, None, None, 3]
    async for item in timeout(something(), 1, None):
        print("Check", item)
        assert item == expect.pop(0)

asyncio.get_event_loop().run_until_complete(test())
When wait_for() times out it will cancel the task. Therefore, we need to wrap it.__anext__() in a task and then shield it, to be able to resume the iterator.
I want the coroutine to execute refresh at least every 60 seconds.
If you need to execute refresh every 60 seconds regardless of what happens with do_something_useful, you can arrange that with a separate coroutine:
import time

async def my_loop():
    # ensure refresh() is invoked at least once in 60 seconds
    done = False

    async def repeat_refresh():
        last_run = time.time()
        while not done:
            await refresh()
            now = time.time()
            await asyncio.sleep(max(60 - (now - last_run), 0))
            last_run = now

    # start repeat_refresh "in the background"
    refresh_task = asyncio.get_event_loop().create_task(repeat_refresh())
    try:
        async for item in something():
            if item is not None:
                await do_something_useful(item)
                await refresh()
    finally:
        done = True

How to exchange values between async functions which are running forever in Python 3.5?

I'm trying to learn Python's asyncio module, and I have searched everywhere on the Internet, including PyCon videos on YouTube, but I could not find a way to get variables from one async function (running forever) and pass them to another async function (running forever).
demo code:
async def one():
    while True:
        ltp += random.uniform(-1, 1)
        return ltp

async def printer(ltp):
    while True:
        print(ltp)
As with any other Python code, the two coroutines can communicate using an object they both share, most typically self:
import asyncio
import random

class Demo:
    def __init__(self):
        self.ltp = 0

    async def one(self):
        while True:
            self.ltp += random.uniform(-1, 1)
            await asyncio.sleep(0)

    async def two(self):
        while True:
            print(self.ltp)
            await asyncio.sleep(0)

loop = asyncio.get_event_loop()
d = Demo()
loop.create_task(d.one())
loop.create_task(d.two())
loop.run_forever()
The issue with the above code is that one() keeps producing values regardless of whether anyone is reading them. Also, there is no guarantee that two() is not running faster than one(), in which case it will see the same value more than once. The solution to both problems is to communicate via a bounded queue:
class Demo:
    def __init__(self):
        self.queue = asyncio.Queue(1)

    async def one(self):
        ltp = 0
        while True:
            ltp += random.uniform(-1, 1)
            await self.queue.put(ltp)

    async def two(self):
        while True:
            ltp = await self.queue.get()
            print(ltp)
            await asyncio.sleep(0)
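A minimal driver for this queue-based version (my sketch; both coroutines run forever, so stop it with Ctrl+C):

import asyncio
import random

async def main():
    d = Demo()
    await asyncio.gather(d.one(), d.two())

asyncio.run(main())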
