Chunk size or timeout with asyncio in Python - python-3.x

I'm trying to figure out a simple way to drain a queue in chunks, specifying a chunk_size and a timeout.
For instance, I want the get_chunks function to return a list of chunk_size items if it takes less than timeout to collect them, otherwise a shorter list of between 0 and chunk_size items.
Here is the code so far:
import asyncio


async def populate(queue):
    for i in range(0, 100):
        await queue.put(i)


async def _get_chunks(queue, chunk_size):
    items = []
    for i in range(0, chunk_size):
        items.append(await queue.get())
        await asyncio.sleep(0.2)
    return items


async def get_chunks(queue, chunk_size, timeout):
    while True:
        yield _get_chunks(queue, chunk_size)


async def listen():
    queue = asyncio.Queue()
    await populate(queue)
    print(f'{queue.qsize()} items in queue')
    async for chunk in get_chunks(queue, 10, 1):
        print(await chunk)


def main():
    loop = asyncio.get_event_loop()
    loop.run_until_complete(listen())


if __name__ == '__main__':
    main()
I think there is a way to do it using asyncio.wait such that:
done, not_done = await asyncio.wait([_get_chunks(queue, size),
                                     asyncio.sleep(timeout)],
                                    return_when=asyncio.FIRST_COMPLETED)
items = done.pop().result()
but I cannot manage to get the result when asyncio.sleep returns first.

You cannot get the result because _get_chunks is not yet finished. A simple workaround is to have some shared state between _get_chunks and its caller:
async def _get_chunks(queue, chunk_size, out):
    for i in range(0, chunk_size):
        out.append(await queue.get())
        await asyncio.sleep(0.2)
Then you can implement the timeout using wait_for, which will automatically cancel the timed-out coroutine:
items = []
try:
    await asyncio.wait_for(_get_chunks(queue, size, items), timeout)
except asyncio.TimeoutError:
    pass
# items now contains the elements _get_chunks managed to extract
# from the queue within the allotted time
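For completeness, here is a minimal sketch (not part of the original answer) of how the shared-list workaround could be wired back into the asker's get_chunks async generator; the names mirror the question's code, and an empty result is used here to stop iterating once the queue stays empty for a whole timeout:

async def get_chunks(queue, chunk_size, timeout):
    # yield lists of up to chunk_size items, giving up after `timeout` seconds
    while True:
        items = []
        try:
            await asyncio.wait_for(_get_chunks(queue, chunk_size, items), timeout)
        except asyncio.TimeoutError:
            pass
        if not items:
            break  # queue stayed empty for a full timeout; stop iterating
        yield items


async def listen():
    queue = asyncio.Queue()
    await populate(queue)
    async for chunk in get_chunks(queue, 10, 1):
        print(chunk)  # chunk is already a list, no extra await needed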

Related

How to make nested asynchronous API calls with ensure_future and gather in Python asyncio?

I'm trying to make nested asynchronous API calls with asyncio, using ensure_future() and gather().
The API itself is written with aiohttp and works fine.
I have tried two approaches, here named get_rows_working() and get_rows_not_working(); one works and one does not.
A single row always performs its API calls in parallel to increase speed.
Now what I'm trying to do is pull all rows in parallel as well.
async def get_single_row(api):
    tasks = []
    tasks.append(asyncio.ensure_future(api.get_some_data()))
    tasks.append(asyncio.ensure_future(api.get_some_data2()))
    resp = await asyncio.gather(*tasks)
    data = resp[0]
    data2 = resp[1]
    extra_data = data['some_key']
    extra_data2 = data2['some_key2']
    return (extra_data, extra_data2)


async def get_rows_working(rows):
    data = []
    for r in rows:
        api = API(r)
        data.append(await get_single_row(api))
    return data


async def get_rows_not_working(rows):
    tasks = []
    for r in rows:
        api = API(r)
        tasks.append(asyncio.ensure_future(get_single_row(api)))
    data = await asyncio.gather(*tasks)
    return data
loop = asyncio.get_event_loop()
loop.run_until_complete(get_rows_working())
loop.run_until_complete(get_rows_not_working())
What happens if you start nesting these?
I'm starting to get KeyErrors on these lines (which I don't get with get_rows_working()):
extra_data = data['some_key']
extra_data2 = data2['some_key2']
This makes me believe the internal order of operations gets wonky when these calls are nested.
I'm not sure how to describe it better, sorry.
Is this even the correct way to achieve this?
Thanks for any answers.
I don't think the KeyError exceptions are related to the way your code is structured.
In order to reproduce your results, I mocked your API calls using asyncio.sleep():
import asyncio


class API:
    def __init__(self, r):
        self.r = r

    async def get_some_data(self, i):
        await asyncio.sleep(3)
        return {'key_{}'.format(i): 'Data_{}___Row_{}'.format(i, self.r)}


async def get_single_row(api):
    tasks = []
    tasks.append(asyncio.ensure_future(api.get_some_data(0)))
    tasks.append(asyncio.ensure_future(api.get_some_data(1)))
    resp = await asyncio.gather(*tasks)
    data_0 = resp[0]
    data_1 = resp[1]
    extra_data_0 = data_0['key_0']
    extra_data_1 = data_1['key_1']
    return (extra_data_0, extra_data_1)


async def get_rows_working(rows):
    data = []
    for r in rows:
        api = API(r)
        data.append(await get_single_row(api))
    return data


async def get_rows_not_working(rows):
    tasks = []
    for r in rows:
        api = API(r)
        tasks.append(asyncio.ensure_future(get_single_row(api)))
    data = await asyncio.gather(*tasks)
    return data
Then I added a timer and ran both functions to understand what happens:
import time


class Timer:
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *args):
        self.end = time.perf_counter()
        self.interval = self.end - self.start


loop = asyncio.get_event_loop()
rows = range(10)

with Timer() as t:
    res = loop.run_until_complete(get_rows_working(rows))
print("get_rows_working() result : {}".format(res))
print('get_rows_working() API call took %.03f sec.\n' % t.interval)

with Timer() as t:
    res = loop.run_until_complete(get_rows_not_working(rows))
print("get_rows_not_working() result : {}".format(res))
print('get_rows_not_working() API call took %.03f sec.' % t.interval)
Output:
get_rows_working() result : [('Data_0___Row_0', 'Data_1___Row_0'), ('Data_0___Row_1', 'Data_1___Row_1'), ('Data_0___Row_2', 'Data_1___Row_2'), ('Data_0___Row_3', 'Data_1___Row_3'), ('Data_0___Row_4', 'Data_1___Row_4'), ('Data_0___Row_5', 'Data_1___Row_5'), ('Data_0___Row_6', 'Data_1___Row_6'), ('Data_0___Row_7', 'Data_1___Row_7'), ('Data_0___Row_8', 'Data_1___Row_8'), ('Data_0___Row_9', 'Data_1___Row_9')]
get_rows_working() API call took 30.034 sec.
get_rows_not_working() result : [('Data_0___Row_0', 'Data_1___Row_0'), ('Data_0___Row_1', 'Data_1___Row_1'), ('Data_0___Row_2', 'Data_1___Row_2'), ('Data_0___Row_3', 'Data_1___Row_3'), ('Data_0___Row_4', 'Data_1___Row_4'), ('Data_0___Row_5', 'Data_1___Row_5'), ('Data_0___Row_6', 'Data_1___Row_6'), ('Data_0___Row_7', 'Data_1___Row_7'), ('Data_0___Row_8', 'Data_1___Row_8'), ('Data_0___Row_9', 'Data_1___Row_9')]
get_rows_not_working() API call took 3.008 sec.
This means that the second function, get_rows_not_working(), actually works as expected and calls the API concurrently.
Is it possible that you are getting KeyError exceptions because the API returns empty data when you exceed the request rate limit? For example, if the API is implemented as:
MAX_CONCUR_ROWS = 5


class API:
    connections = 0

    def __init__(self, r):
        self.r = r

    async def get_some_data(self, i):
        API.connections += 1
        await asyncio.sleep(3)
        if API.connections > MAX_CONCUR_ROWS * 2:
            res = {}
        else:
            res = {'key_{}'.format(i): 'Data_{}___Row_{}'.format(i, self.r)}
        API.connections -= 1
        return res
Then get_rows_not_working() would raise KeyError: 'key_0', while get_rows_working() works fine.
If that's the case, you would want to throttle your requests, either by batching them or by using an asyncio.Semaphore:
async def get_single_row(api, semaphore):
    # Using tasks instead of plain coroutines won't work here, because
    # asyncio.ensure_future() starts the coroutine immediately, so the
    # semaphore would have no effect.
    coros = []
    coros.append(api.get_some_data(0))
    coros.append(api.get_some_data(1))
    async with semaphore:
        resp = await asyncio.gather(*coros)
    data_0 = resp[0]
    data_1 = resp[1]
    extra_data_0 = data_0['key_0']
    extra_data_1 = data_1['key_1']
    return (extra_data_0, extra_data_1)


async def get_rows_not_working(rows):
    semaphore = asyncio.Semaphore(MAX_CONCUR_ROWS)
    tasks = []
    for r in rows:
        api = API(r)
        tasks.append(asyncio.ensure_future(get_single_row(api, semaphore)))
    data = await asyncio.gather(*tasks)
    return data
The above code runs no more than 5 rows concurrently and returns the expected output (notice that it now takes 6 seconds instead of 3):
get_rows_not_working() result : [('Data_0___Row_0', 'Data_1___Row_0'), ('Data_0___Row_1', 'Data_1___Row_1'), ('Data_0___Row_2', 'Data_1___Row_2'), ('Data_0___Row_3', 'Data_1___Row_3'), ('Data_0___Row_4', 'Data_1___Row_4'), ('Data_0___Row_5', 'Data_1___Row_5'), ('Data_0___Row_6', 'Data_1___Row_6'), ('Data_0___Row_7', 'Data_1___Row_7'), ('Data_0___Row_8', 'Data_1___Row_8'), ('Data_0___Row_9', 'Data_1___Row_9')]
get_rows_not_working() API call took 6.013 sec.
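The batching alternative mentioned above can be sketched briefly as well; this is an illustration rather than part of the original answer, and it reuses the question's original get_single_row(api) (without the semaphore), simply gathering the rows in slices:

async def get_rows_batched(rows, batch_size=MAX_CONCUR_ROWS):
    data = []
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        # gather one slice at a time, so at most batch_size rows run concurrently
        tasks = [asyncio.ensure_future(get_single_row(API(r))) for r in batch]
        data.extend(await asyncio.gather(*tasks))
    return data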

Appending to merged async generators in Python

I'm trying to merge a bunch of asynchronous generators in Python 3.7 while still adding new async generators on iteration. I'm currently using aiostream to merge my generators:
from asyncio import sleep, run
from aiostream.stream import merge


async def go():
    yield 0
    await sleep(1)
    yield 50
    await sleep(1)
    yield 100


async def main():
    tasks = merge(go(), go(), go())
    async for v in tasks:
        print(v)


if __name__ == '__main__':
    run(main())
However, I need to be able to continue to add to the running tasks once the loop has begun. Something like:
from asyncio import sleep, run
from aiostream.stream import merge


async def go():
    yield 0
    await sleep(1)
    yield 50
    await sleep(1)
    yield 100


async def main():
    tasks = merge(go(), go(), go())
    async for v in tasks:
        if v == 50:
            tasks.merge(go())
        print(v)


if __name__ == '__main__':
    run(main())
The closest I've got is using the aiostream library, but maybe this can also be written fairly neatly with just the asyncio standard library.
Here is an implementation that should work efficiently even with a large number of async iterators:
class merge:
    def __init__(self, *iterables):
        self._iterables = list(iterables)
        self._wakeup = asyncio.Event()

    def _add_iters(self, next_futs, on_done):
        for it in self._iterables:
            it = it.__aiter__()
            nfut = asyncio.ensure_future(it.__anext__())
            nfut.add_done_callback(on_done)
            next_futs[nfut] = it
        del self._iterables[:]
        return next_futs

    async def __aiter__(self):
        done = {}
        next_futs = {}

        def on_done(nfut):
            done[nfut] = next_futs.pop(nfut)
            self._wakeup.set()

        self._add_iters(next_futs, on_done)
        try:
            while next_futs:
                await self._wakeup.wait()
                self._wakeup.clear()
                for nfut, it in done.items():
                    try:
                        ret = nfut.result()
                    except StopAsyncIteration:
                        continue
                    self._iterables.append(it)
                    yield ret
                done.clear()
                if self._iterables:
                    self._add_iters(next_futs, on_done)
        finally:
            # if the generator exits with an exception, or if the caller stops
            # iterating, make sure our callbacks are removed
            for nfut in next_futs:
                nfut.remove_done_callback(on_done)

    def append_iter(self, new_iter):
        self._iterables.append(new_iter)
        self._wakeup.set()
The only change required for your sample code is that the method is named append_iter, not merge.
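A minimal usage sketch based on the question's sample code (and assuming import asyncio for the merge class itself) would be:

async def main():
    tasks = merge(go(), go(), go())
    async for v in tasks:
        if v == 50:
            tasks.append_iter(go())  # top up the merged stream mid-iteration
        print(v)


if __name__ == '__main__':
    run(main())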
This can be done using stream.flatten with an asyncio queue to store the new generators.
import asyncio
from aiostream import stream, pipe


async def main():
    queue = asyncio.Queue()
    await queue.put(go())
    await queue.put(go())
    await queue.put(go())

    xs = stream.call(queue.get)
    ys = stream.cycle(xs)
    zs = stream.flatten(ys, task_limit=5)

    async with zs.stream() as streamer:
        async for item in streamer:
            if item == 50:
                await queue.put(go())
            print(item)
Notice that you may tune the number of tasks that can run at the same time using the task_limit argument. Also note that zs can be elegantly defined using the pipe syntax:
zs = stream.call(queue.get) | pipe.cycle() | pipe.flatten(task_limit=5)
Disclaimer: I am the project maintainer.

How to iterate over an asynchronous iterator with a timeout?

I think it's easier to understand in terms of code:
try:
    async for item in timeout(something(), timeout=60):
        await do_something_useful(item)
except asyncio.futures.TimeoutError:
    await refresh()
I want the async for to run for at most 60 seconds.
I needed to do something like this to create a websocket (also an async iterator) that times out if it doesn't get a message within a certain duration. I settled on the following:
socket_iter = socket.__aiter__()
try:
    while True:
        message = await asyncio.wait_for(
            socket_iter.__anext__(),
            timeout=10
        )
except asyncio.futures.TimeoutError:
    # streaming is completed
    pass
AsyncTimedIterable could be the implementation of timeout() in your code:
class _AsyncTimedIterator:
    __slots__ = ('_iterator', '_timeout', '_sentinel')

    def __init__(self, iterable, timeout, sentinel):
        self._iterator = iterable.__aiter__()
        self._timeout = timeout
        self._sentinel = sentinel

    async def __anext__(self):
        try:
            return await asyncio.wait_for(self._iterator.__anext__(), self._timeout)
        except asyncio.TimeoutError:
            return self._sentinel


class AsyncTimedIterable:
    __slots__ = ('_factory', )

    def __init__(self, iterable, timeout=None, sentinel=None):
        self._factory = lambda: _AsyncTimedIterator(iterable, timeout, sentinel)

    def __aiter__(self):
        return self._factory()
(original answer)
Or use this class to replace your timeout() function:
class AsyncTimedIterable:
    def __init__(self, iterable, timeout=None, sentinel=None):
        class AsyncTimedIterator:
            def __init__(self):
                self._iterator = iterable.__aiter__()

            async def __anext__(self):
                try:
                    return await asyncio.wait_for(self._iterator.__anext__(),
                                                  timeout)
                except asyncio.TimeoutError:
                    return sentinel

        self._factory = AsyncTimedIterator

    def __aiter__(self):
        return self._factory()
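Neither variant shows the call site; a minimal usage sketch, reusing something(), do_something_useful() and refresh() from the question, might look like:

async def consume():
    # each __anext__ on the wrapped iterator gets at most 60 seconds
    async for item in AsyncTimedIterable(something(), timeout=60):
        if item is None:  # the default sentinel signals a timeout
            await refresh()
            continue
        await do_something_useful(item)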
A simple approach is to use an asyncio.Queue, and separate the code into two coroutines:
queue = asyncio.Queue()

async for item in something():
    await queue.put(item)
In another coroutine:
while True:
    try:
        item = await asyncio.wait_for(queue.get(), 60)
    except asyncio.TimeoutError:
        pass
    else:
        if item is None:
            break  # use None or whatever suits you to gracefully exit
        await do_something_useful(item)
    refresh()
Please note that the queue will grow if the handler do_something_useful() is slower than something() produces items. You may set maxsize on the queue to limit the buffer size.
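A minimal sketch of how the two pieces could be wired together (the producer/consumer names and the None sentinel are illustrative assumptions, not part of the original answer):

async def producer(queue):
    async for item in something():
        await queue.put(item)
    await queue.put(None)  # sentinel: the source is exhausted


async def consumer(queue):
    while True:
        try:
            item = await asyncio.wait_for(queue.get(), 60)
        except asyncio.TimeoutError:
            await refresh()
            continue
        if item is None:
            break
        await do_something_useful(item)


async def main():
    queue = asyncio.Queue(maxsize=100)  # bound the buffer
    await asyncio.gather(producer(queue), consumer(queue))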
The answer to your question can differ depending on the nature of the refresh function. If it's a very short-running function, it can be freely called inside a coroutine. But if it's a blocking function (due to network or CPU), it should be run in an executor to avoid freezing the asyncio event loop.
The code below shows an example for the first case; changing it to run refresh in an executor is not hard (see the sketch after the code).
The second thing to clarify is the nature of the asynchronous iterator. As far as I understand, you're using it either to get a result from something, or None if a timeout occurred.
If I understand the logic correctly, your code can be written more clearly (in the near-synchronous style asyncio is designed to allow) using the async_timeout context manager, without using an asynchronous iterator at all:
import asyncio
from async_timeout import timeout


async def main():
    while True:
        try:
            async with timeout(60):
                res = await something()
                await do_something_useful(res)
        except asyncio.TimeoutError:
            pass
        finally:
            refresh()
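For the second case (a blocking refresh), a minimal sketch of offloading it to the default executor could look like this; it assumes refresh is a plain blocking function, as described above:

async def main():
    loop = asyncio.get_event_loop()
    while True:
        try:
            async with timeout(60):
                res = await something()
                await do_something_useful(res)
        except asyncio.TimeoutError:
            pass
        finally:
            # run the blocking refresh in a worker thread so the event loop stays responsive
            await loop.run_in_executor(None, refresh)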
Your question is missing a couple of details, but assuming something() is an async iterator or generator and you want item to be the sentinel every time something has not yielded a value within the timeout, here is an implementation of timeout():
import asyncio
from typing import *

T = TypeVar('T')


# async generator, needs Python 3.6+
async def timeout(it: AsyncIterator[T], timeo: float, sentinel: T) -> AsyncGenerator[T, None]:
    try:
        nxt = asyncio.ensure_future(it.__anext__())
        while True:
            try:
                yield await asyncio.wait_for(asyncio.shield(nxt), timeo)
                nxt = asyncio.ensure_future(it.__anext__())
            except asyncio.TimeoutError:
                yield sentinel
    except StopAsyncIteration:
        pass
    finally:
        nxt.cancel()  # in case we're getting cancelled ourselves
test:
async def something():
    yield 1
    await asyncio.sleep(1.1)
    yield 2
    await asyncio.sleep(2.1)
    yield 3


async def test():
    expect = [1, None, 2, None, None, 3]
    async for item in timeout(something(), 1, None):
        print("Check", item)
        assert item == expect.pop(0)


asyncio.get_event_loop().run_until_complete(test())
When wait_for() times out it will cancel the task. Therefore, we need to wrap it.__anext__() in a task and then shield it, to be able to resume the iterator.
I want the coroutine to execute refresh at least every 60 seconds.
If you need to execute refresh every 60 seconds regardless of what happens with do_something_useful, you can arrange that with a separate coroutine:
import time


async def my_loop():
    # ensure refresh() is invoked at least once in 60 seconds
    done = False

    async def repeat_refresh():
        last_run = time.time()
        while not done:
            await refresh()
            now = time.time()
            await asyncio.sleep(max(60 - (now - last_run), 0))
            last_run = now

    # start repeat_refresh "in the background"
    refresh_task = asyncio.get_event_loop().create_task(repeat_refresh())
    try:
        async for item in something():
            if item is not None:
                await do_something_useful(item)
            await refresh()
    finally:
        done = True

Merging async iterables in python3

Is there a good way, or a well-supported library, for merging async iterators in python3?
The desired behavior is basically the same as that of merging observables in reactivex.
That is, in the normal case, if I'm merging two async iterators, I want the resulting async iterator to yield results chronologically. An error in one of the iterators should derail the merged iterator.
(Source: http://reactivex.io/documentation/operators/merge.html)
This is my best attempt, but it seems like something for which there might already be a standard solution:
import asyncio
from collections import namedtuple


async def drain(stream, q, sentinal=None):
    try:
        async for item in stream:
            await q.put(item)
        if sentinal:
            await q.put(sentinal)
    except BaseException as e:
        await q.put(e)


async def merge(*streams):
    q = asyncio.Queue()
    sentinal = namedtuple("QueueClosed", ["truthy"])(True)
    futures = {
        asyncio.ensure_future(drain(stream, q, sentinal)) for stream in streams
    }
    remaining = len(streams)
    while remaining > 0:
        result = await q.get()
        if result is sentinal:
            remaining -= 1
            continue
        if isinstance(result, BaseException):
            raise result
        yield result


if __name__ == "__main__":
    # Example: Should print:
    # 1
    # 2
    # 3
    # 4
    loop = asyncio.get_event_loop()

    async def gen():
        yield 1
        await asyncio.sleep(1.5)
        yield 3

    async def gen2():
        await asyncio.sleep(1)
        yield 2
        await asyncio.sleep(1)
        yield 4

    async def go():
        async for x in merge(gen(), gen2()):
            print(x)

    loop.run_until_complete(go())
You can use aiostream.stream.merge:
from aiostream import stream


async def go():
    async for x in stream.merge(gen(), gen2()):
        print(x)
More examples in the documentation and this answer.
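Combined with the question's generators and a plain asyncio entry point, a complete runnable sketch (nothing assumed beyond the code already shown) would be:

import asyncio
from aiostream import stream


async def gen():
    yield 1
    await asyncio.sleep(1.5)
    yield 3


async def gen2():
    await asyncio.sleep(1)
    yield 2
    await asyncio.sleep(1)
    yield 4


async def go():
    # merge() yields items from both generators in chronological order
    async for x in stream.merge(gen(), gen2()):
        print(x)  # prints 1, 2, 3, 4


asyncio.get_event_loop().run_until_complete(go())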

Task top-up with asyncio

In my project, I have a list of tasks that I execute with:
loop.run_until_complete(tasks)
However, there is an infinite number of tasks, so at the moment, I execute them in batches. Essentially, I have this:
def get_results(tasks):
    return [result for result in loop.run_until_complete(handle_tasks(tasks))]


while True:
    tasks = get_tasks()
    results = get_results(tasks)
I get a number of tasks and launch a regular function that uses the event loop to run them asynchronously and return the results.
This approach works, but I believe it can be improved.
Instead of doing batches of tasks, I would like to do some sort of task top-up.
Something like this:
while True:
    if current_tasks < max_tasks:
        new_tasks = get_tasks(max_tasks - current_tasks)
        add_tasks(new_tasks)
    current_tasks, results = stats_and_results()
I appreciate any ideas on how to approach this problem.
Thanks!
We had a similar problem and ended up writing a small "Pool" wrapper that takes jobs and runs them with a predefined concurrency.
import asyncio
import sys


class Pool:
    def __init__(self, concurrency):
        self._sem = asyncio.BoundedSemaphore(concurrency)
        self.jobs = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, *_):
        if len(self.jobs) > 0:
            await asyncio.wait(self.jobs)

    def put(self, coro):
        assert asyncio.iscoroutine(coro)

        async def wrapped():
            async with self._sem:
                return await coro

        fut = asyncio.ensure_future(wrapped())
        self.jobs.append(fut)

    def __aiter__(self):
        return self

    async def __anext__(self):
        try:
            coro = self.jobs.pop(0)
        except IndexError:
            raise StopAsyncIteration()
        else:
            return await coro
You can then use it this way:
async def main():
    pool = Pool(10)
    for task in get_tasks():
        pool.put(task)

    async for result in pool:
        print('got', result)
This will schedule all the tasks, run at most 10 of them concurrently, and return the results to the main() coroutine as they complete.
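Since the question describes an endless supply of tasks, a hedged sketch of a continuous "top-up" on top of this Pool could look like the following; the get_tasks(n) signature returning coroutines is an assumption borrowed from the question's pseudocode:

async def main():
    max_tasks = 10
    pool = Pool(max_tasks)
    for task in get_tasks(max_tasks):
        pool.put(task)

    async for result in pool:
        print('got', result)
        # keep the pool topped up: request one new task for each finished one
        for task in get_tasks(1):
            pool.put(task)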
