Store future state on application object - python-3.x

Is it OK to store the state of a future directly on the application object? Example below:
import asyncio
from functools import partial

async def background():
    await asyncio.sleep(1)
    print('Doing something useful in the background')
    await asyncio.sleep(1)

# @aiohttp_jinja2.template('loading.html')
async def loading(request):
    app = request.app
    task = getattr(app, 'task_obj', None)
    if task is None:
        task = asyncio.ensure_future(background())
        callback = partial(done_refresh, app)
        task.add_done_callback(callback)
        app.task_obj = task

def done_refresh(app, future):
    if hasattr(app, 'task_obj'):
        # Nice! Task is done
        del app.task_obj
    exc = future.exception()
    if exc is not None:
        # Task has some exception
        print('Failed to update: %s', exc)
Usually, I store some marker like in_progress in Redis and then check for that value from whatever function I want, but that way I lose the Task object itself and cannot access useful data like exception info.
What is the common approach to handle such cases?

Your approach makes perfect sense, except that the task should be stored in the aiohttp app context instead of being set as an attribute (app['task_obj'] = ... instead of app.task_obj = ...).
See also https://docs.aiohttp.org/en/stable/web_advanced.html#data-sharing-aka-no-singletons-please
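A minimal sketch of that change, reusing background() from the question (the web.Response return is only a placeholder):
import asyncio
from functools import partial

from aiohttp import web

def done_refresh(app, future):
    if 'task_obj' in app:
        # task finished, drop the stored reference
        del app['task_obj']
    exc = future.exception()
    if exc is not None:
        print('Failed to update: %s' % exc)

async def loading(request):
    app = request.app
    task = app.get('task_obj')
    if task is None:
        task = asyncio.ensure_future(background())
        task.add_done_callback(partial(done_refresh, app))
        app['task_obj'] = task  # shared state lives in the app context, not as an attribute
    return web.Response(text='refresh started')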

Related

Using asyncio for doing a/b testing in Python

Let's say there's an API already running in production, and you have created another API that you want to A/B test using the incoming requests hitting the production API. (I am aware that people do traffic splits between two deployed API versions for A/B testing.)
The idea: as soon as an incoming request hits your production API, you fire off an async request to the new API, carry on with the rest of the production code path, and then, just before returning the final response to the caller, you check whether the result of that async task is already available. If it is, you return it instead of the production API's response.
What's the best way to do something like this? Should it be a decorator, or something else? I am a bit worried about the many edge cases that can appear once async is involved. Does anyone have pointers on making the code, or the whole approach, better?
Thanks for your time!
Some pseudo-code for the approach above:
import asyncio

def call_old_api():
    pass

async def call_new_api():
    pass

async def main():
    task = asyncio.Task(call_new_api())
    oldResp = call_old_api()
    resp = await task
    if task.done():
        return resp
    else:
        task.cancel()  # maybe
        return oldResp

asyncio.run(main())
You can't just execute the blocking call_old_api() inside an asyncio coroutine. There's a detailed explanation of why here. Please make sure you understand it, because depending on how your server works you may not be able to do what you want (for example, running an async API on a sync server defeats the point of writing async code).
If you understand what you're doing and you have an async server, you can call the old sync API in a thread and use a task to run the new API:
task = asyncio.Task(call_new_api())
oldResp = await in_thread(call_old_api)
if task.done():
    # note: task.result() will re-raise the exception if the new API request
    # failed, but that's probably ok for you
    return task.result()
else:
    # cancelling needs some care, see https://stackoverflow.com/a/43810272/1113207
    task.cancel()
    return oldResp
I think you can go even further: instead of always waiting for the old API to complete, you can run both APIs concurrently and return whichever finishes first (in case the new API is faster than the old one). With all the checks and suggestions above, it should look something like this:
import asyncio
import random
import time
from contextlib import suppress

def call_old_api():
    time.sleep(random.randint(0, 2))
    return "OLD"

async def call_new_api():
    await asyncio.sleep(random.randint(0, 2))
    return "NEW"

async def in_thread(func):
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, func)

async def ensure_cancelled(task):
    task.cancel()
    with suppress(asyncio.CancelledError):
        await task

async def main():
    old_api_task = asyncio.Task(in_thread(call_old_api))
    new_api_task = asyncio.Task(call_new_api())
    done, pending = await asyncio.wait(
        [old_api_task, new_api_task], return_when=asyncio.FIRST_COMPLETED
    )
    if pending:
        for task in pending:
            await ensure_cancelled(task)
    finished_task = done.pop()
    res = finished_task.result()
    print(res)

asyncio.run(main())

How to manage sessions with aiohttp?

I'm using aiohttp with asyncio to make a batch of requests. My first approach was to create a session inside the fetch() function (which starts an asyncio.gather job), and then pass the session object to the functions that perform the POST requests (get_info):
import asyncio
import aiohttp

loop = asyncio.get_event_loop()

def batch_starter(item_list):
    return_value = loop.run_until_complete(fetch(item_list))
    return return_value

async def fetch(item_list):
    async with aiohttp.ClientSession() as session:  # <- session started here
        results = await asyncio.gather(*[asyncio.ensure_future(get_info(session, item)) for item in item_list])
        return results

async def get_info(session, item):  # <- session passed to the function
    async with session.post("some_url", data={"id": item}) as resp:
        html = await resp.json()
        some_info = html.get('info')
        return some_info
But thanks to my confusion, I am now leaning towards instantiating the session right away when the script is imported, like below, at the top of the file:
import asyncio
import aiohttp
import json

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
session = aiohttp.ClientSession()  # <- session started at top of file

def batch_starter(item_list):
    return_value = loop.run_until_complete(fetch(item_list))
    return return_value

async def fetch(item_list):
    results = await asyncio.gather(*[asyncio.ensure_future(get_info(item)) for item in item_list])
    return results

async def get_info(item):
    async with session.post("some_url", data={"id": item}) as resp:  # <- session from outer scope is used
        html = await resp.json()
        some_info = html.get('info')
        return some_info
The docs explain that opening a session with every request is a "very bad" idea (obviously). But this is stated right after an example that apparently does exactly that (my first approach)? Which of these is correct, and how will the session behave when it is used as in the second approach, at the top of the file? Wouldn't the session just stay open forever if I use the second approach?
The batch_starter() function is not going to be called often, but each call comes with 9000+ items in item_list. I assumed this already reduced the number of sessions to one (per gather job), but apparently this is the "bad idea" example and needs to be corrected? The docs are a bit unclear about this...

Python async CancelledError() with no details

The following code fails and I'm not able to see the actual error; I just get numerous CancelledError messages:
import aiobotocore, asyncio

async def replicate_to_region(chunks, region):
    session = aiobotocore.get_session()
    client = session.create_client('dynamodb', region_name=region)
    start = 0
    while True:
        chunk = chunks[start]
        item = {'my_table': chunk}
        response = await client.batch_write_item(RequestItems=item)

async def main():
    asyncio.gather(*(replicate_to_region(payloads, region) for region in regions))

asyncio.run(main())
I get the following errors:
client_session: <aiohttp.client.ClientSession object at 0x7f6fb65a34a8>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f6fb64c82b0>
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
I've tried quite a number of variations of the replicate_to_region function but they all fail with the same error above. It would be useful just to be able to see what the actual error is.
async def main():
    asyncio.gather(...)
asyncio.gather() is an awaitable itself:
awaitable asyncio.gather(*aws, loop=None, return_exceptions=False)
It means you should use await when dealing with it:
async def main():
    await asyncio.gather(*(replicate_to_region(payloads, region) for region in regions))
Off-topic:
I haven't worked with aiobotocore and am not sure if it's important here, but it's better to do as the documentation says. In particular, you should probably use async with when creating a client, as the example shows.
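For illustration, here is a rough sketch of what that could look like in replicate_to_region() (the exact get_session import path depends on your aiobotocore version, so treat the details as an assumption):
import asyncio
import aiobotocore

async def replicate_to_region(chunks, region):
    session = aiobotocore.get_session()
    # create_client is used as an async context manager, as the aiobotocore docs show,
    # so the underlying HTTP session is closed when the block exits
    async with session.create_client('dynamodb', region_name=region) as client:
        for chunk in chunks:
            await client.batch_write_item(RequestItems={'my_table': chunk})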

Handling ensure_future and its missing tasks

I have a streaming application that almost continuously takes data as input, sends an HTTP request using each value, and does something with the returned value.
Obviously, to speed things up I've used the asyncio and aiohttp libraries in Python 3.7 to get the best performance, but it becomes hard to debug given how fast the data moves.
This is what my code looks like:
'''
Gets the final requests
'''
async def apiRequest(info, url, session, reqType, post_data=''):
    if reqType:
        async with session.post(url, data=post_data) as response:
            info['response'] = await response.text()
    else:
        async with session.get(url + post_data) as response:
            info['response'] = await response.text()
    logger.debug(info)
    return info

'''
Loops through the batches and sends them for requests
'''
async def main(data, listOfData):
    tasks = []
    async with ClientSession() as session:
        for reqData in listOfData:
            try:
                task = asyncio.ensure_future(apiRequest(**reqData))
                tasks.append(task)
            except Exception as e:
                print(e)
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
        responses = await asyncio.gather(*tasks)
        return responses  # list of API responses

'''
Streams data in and prepares batches to send for requests
'''
async def Kconsumer(data, loop, batchsize=100):
    consumer = AIOKafkaConsumer(**KafkaConfigs)
    await consumer.start()
    dataPoints = []
    async for msg in consumer:
        try:
            sys.stdout.flush()
            consumedMsg = loads(msg.value.decode('utf-8'))
            if consumedMsg['tid']:
                dataPoints.append(loads(msg.value.decode('utf-8')))
            if len(dataPoints) == batchsize or time.time() - startTime > 5:
                # 1: The task below sends HTTP GET requests in bulk using aiohttp
                task = asyncio.ensure_future(getRequests(data, dataPoints))
                res = await asyncio.gather(*[task])
                if task.done():
                    outputs = []
                    # 2: Does some ETL on the returned values
                    ids = await asyncio.gather(*[doSomething(**{'tid': x['tid'],
                                                                'cid': x['cid'], 'tn': x['tn'],
                                                                'id': x['id'], 'ix': x['ix'],
                                                                'ac': x['ac'],
                                                                'output': to_dict(xmltodict.parse(x['response'], encoding='utf-8')),
                                                                'loop': loop, 'option': 1}) for x in res[0]])
                    simplySaveDataIntoDataBase(ids)  # This is where I see some missing data in the database
                dataPoints = []
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            logger.error(str(exc_type) + ' ' + str(fname) + ' ' + str(exc_tb.tb_lineno))

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(Kconsumer(data, loop, batchsize=100))
    loop.run_forever()
Does the ensure_future need to be awaited?
How does aiohttp handle requests that come a little later than the others? Shouldn't it hold the whole batch back instead of forgetting about it altogether?
Does the ensure_future need to be awaited?
Yes, and your code is doing that already. await asyncio.gather(*tasks) awaits the provided tasks and returns their results in the same order.
Note that await asyncio.gather(*[task]) doesn't make sense, because it is equivalent to await asyncio.gather(task), which is again equivalent to await task. In other words, when you need the result of getRequests(data, dataPoints), you can write res = await getRequests(data, dataPoints) without the ceremony of first calling ensure_future() and then calling gather().
In fact, you almost never need to call ensure_future yourself:
If you need to await multiple tasks, you can pass coroutine objects directly to gather, e.g. gather(coroutine1(), coroutine2()).
If you need to spawn a background task, you can call asyncio.create_task(coroutine(...)). A short sketch of both follows below.
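A minimal sketch of both patterns, with a made-up fetch_one() coroutine standing in for a real request:
import asyncio

async def fetch_one(i):
    # hypothetical stand-in for a single aiohttp request
    await asyncio.sleep(0.1)
    return i

async def main():
    # 1. awaiting several coroutines at once: pass them straight to gather()
    results = await asyncio.gather(fetch_one(1), fetch_one(2), fetch_one(3))
    print(results)  # [1, 2, 3], in the order the coroutines were passed

    # 2. spawning a background task that runs while other work continues
    background = asyncio.create_task(fetch_one(99))
    # ... do other work here ...
    print(await background)  # 99

asyncio.run(main())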
How does aiohttp handle requests that come a little later than the others? Shouldn't it hold the whole batch back instead of forgetting about it altogether?
If you use gather, all requests must finish before any of the results are returned. (That is not aiohttp policy, it's how gather works.) If you need to implement a timeout, you can use asyncio.wait_for or similar.
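For example, here is a small sketch of capping individual requests with asyncio.wait_for (the with_timeout helper and the timeout value are made up for illustration):
import asyncio

async def with_timeout(coro, seconds=5):
    # wait_for cancels the wrapped coroutine and raises TimeoutError if it overruns
    try:
        return await asyncio.wait_for(coro, timeout=seconds)
    except asyncio.TimeoutError:
        return None  # or log and re-raise, depending on your needs

async def main():
    results = await asyncio.gather(
        *(with_timeout(asyncio.sleep(t, result=t), seconds=1.5) for t in (1, 2))
    )
    print(results)  # [1, None]: the 2-second sleep was cut off by the timeout

asyncio.run(main())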

Wait for db future to complete?

I have written code for a Sanic application, with RethinkDB as the backend database. I want the RethinkDB connection function to finish initialising before the other functions run, as they depend on the RethinkDB connection.
My RethinkDB connection initialization function is:
async def open_connections(app):
    logger.warning('opening database connection')
    r.set_loop_type('asyncio')
    connection = await r.connect(
        port=app.config.DATABASE["port"],
        host=app.config.DATABASE["ip"],
        db=app.config.DATABASE["dbname"],
        user=app.config.DATABASE["user"],
        password=app.config.DATABASE["password"])
    print(f"connection established {connection}")
    return connection
The callback function that will be executed after the future resolves is:
def db_callback(future):
    exc = future.exception()
    if exc:
        # Handle wonderful empty TimeoutError exception
        logger.error(f"From mnemonic api isnt working with error {exc}")
        sys.exit(1)
    result = future.result()
    return result
The Sanic app:
def main():
    app = Sanic(__name__)
    load_config(app)
    zmq = ZMQEventLoop()
    asyncio.set_event_loop(zmq)
    server = app.create_server(
        host=app.config.HOST, port=app.config.PORT, debug=app.config.DEBUG, access_log=True)
    loop = asyncio.get_event_loop()
    # does not wait for the server to start; this returns a future object
    asyncio.ensure_future(server)
    # does not wait for the rethinkdb connection to initialize; this returns
    # a future object
    future = asyncio.ensure_future(open_connections(app))
    result = future.add_done_callback(db_callback)
    logger.debug(result)
    future = asyncio.ensure_future(insert_mstr_account(app))
    future.add_done_callback(insert_mstr_acc_callback)
    future = asyncio.ensure_future(check_master_accounts(app))
    future.add_done_callback(callbk_check_master_accounts)
    signal(SIGINT, lambda s, f: loop.close())
    try:
        loop.run_forever()
    except KeyboardInterrupt:
        close_connections(app)
        loop.stop()
When I start this app, the print statement in the open_connections function executes last.
future = asyncio.ensure_future(open_connections(app))
result = future.add_done_callback(db_callback)
ensure_future schedules coroutines to run concurrently; it does not wait for them to complete.
add_done_callback does not wait for the completion of the future either; it simply schedules a function call for after the future has completed. You can see it here.
So you should explicitly await the open_connections future before performing other functions:
future = asyncio.ensure_future(open_connections(app))
future.add_done_callback(db_callback)
result = await future
EDITED: the answer above applies only inside a coroutine.
In this case we want to wait for the completion of the future in a plain (non-async) function body. To do that we should use loop.run_until_complete:
def main():
    ...
    future = asyncio.ensure_future(open_connections(app))
    future.add_done_callback(db_callback)
    result = loop.run_until_complete(future)
