Run asyncio loop in a separate thread - python-3.x

I have a component of an application that needs to run an IOLoop in a separate thread. I try to achieve that by creating a new IOLoop in a background thread and starting the loop. My original use case is to keep scheduling a bunch of async tasks periodically.
To achieve this, I:
Create an event loop in a background thread.
Start the thread and call asyncio.run_coroutine_threadsafe(self._start, self._loop)
import asyncio
from contextlib import suppress
from threading import Thread

class AsyncScheduler(object):
    """
    Async Schedule Class.
    This class:
    - Will run on a separate event loop on a separate thread.
    - Will periodically (every minute) schedule tasks for Requester.
    """

    def __init__(self, batch_manager, requester):
        self._requester = requester
        self._is_started = False
        self._tasks = []
        self._loop = None
        self.start()

    def start(self):
        """
        Start a new event loop in a thread.
        call eventloop.run(self._start)
        :return:
        """
        print("STARTING")
        self._loop = asyncio.new_event_loop()
        # start new loop in thread.
        Thread(target=self._loop.run_forever).start()
        asyncio.run_coroutine_threadsafe(self._start, self._loop)

    def stop(self):
        if self._loop:
            # cancel tasks
            self._loop.call_soon_threadsafe(self._stop)
            # stop the loop.
            self._loop.stop()

    async def _start(self):
        """
        Create three tasks for 3 API versions.
        Schedule Each tasks on the event loop using
        asyncio.gather.
        :return:
        """
        versions = [1, 2, 3]
        print("ASYNC START")
        if not self._is_started:
            self._is_started = True
            for version in versions:
                self._tasks.append(
                    self.create_task(60, version)
                )
            await asyncio.gather(*self._tasks)

    async def _stop(self):
        for task in self._tasks:
            task.cancel()
            with suppress(asyncio.CancelledError):
                await task

    async def execute(self, api_version):
        """
        This method gets the batch to be executed and
        tells the requester to run it.
        :param api_version:
        :return:
        """
        await self._requester.run()

    async def create_task(self, sleep_time, api_version):
        """
        Calls the tasks in infinite loop.
        :param sleep_time:
        :param api_version:
        :return:
        """
        while True:
            print("EVER CALLED")
            await self.execute(api_version)
            await asyncio.sleep(sleep_time)
Steps done in the code:
Call start from __init__.
In start, create an event loop within a new thread and start the loop with an awaitable.
I thought this was the way to use an event loop inside a separate thread. Alas, my awaitable self._start is never called and I get an error [A coroutine object is required].
Any ideas what I am messing up here?
Thanks & Regards & Happy Thanksgiving to folks who celebrate.
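For reference, the error message points at the first argument of run_coroutine_threadsafe: it requires a coroutine object, while self._start (without parentheses) is only a coroutine function. A minimal sketch of the corrected start method (untested against the rest of the class):
def start(self):
    # Start a new event loop in a background thread and schedule self._start on it.
    print("STARTING")
    self._loop = asyncio.new_event_loop()
    # daemon=True is optional; it keeps the loop thread from blocking interpreter exit
    Thread(target=self._loop.run_forever, daemon=True).start()
    # note the parentheses: a coroutine *object* is submitted, not the function itself
    asyncio.run_coroutine_threadsafe(self._start(), self._loop)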

Related

Process tasks in batches in asyncio

I have got a function that generates tasks (IO-bound tasks):
def get_task():
    while True:
        new_task = _get_task()
        if new_task is not None:
            yield new_task
        else:
            sleep(1)
And I am trying to write a consumer in asyncio that will process at most 10 tasks at a time; when one task is finished it will take a new one.
I am not sure if I should use semaphores, or whether there is some kind of asyncio pool executor. I started to write pseudocode with threads:
def run(self):
    while True:
        self.semaphore.acquire()  # first acquire, then get task
        t = get_task()
        self.process_task(t)

def process_task(self, task):
    try:
        self.execute_task(task)
        self.mark_as_done(task)
    except:
        self.mark_as_failed(task)
    self.semaphore.release()
Could anyone help me? I have no clue where to put the async/await keywords.
Simple task cap using asyncio.Semaphore:
async def max10(task_generator):
    semaphore = asyncio.Semaphore(10)

    async def bounded(task):
        async with semaphore:
            return await task

    async for task in task_generator:
        asyncio.ensure_future(bounded(task))
The problem with this solution is that tasks are being drawn from the generator greedily. For example, if the generator reads from a large database, the program could run out of memory.
Other than that it's idiomatic and well-behaved.
A solution, that uses async generator protocol to pull new tasks on demand:
async def max10(task_generator):
    tasks = set()
    gen = task_generator.__aiter__()
    try:
        while True:
            while len(tasks) < 10:
                tasks.add(await gen.__anext__())
            _done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    except StopAsyncIteration:
        await asyncio.gather(*tasks)
It may be considered sub-optimal, because it doesn't start executing tasks until 10 are available.
And here's a concise and magic solution using the worker pattern:
async def max10(task_generator):
    async def worker():
        async for task in task_generator:
            await task

    await asyncio.gather(*[worker() for i in range(10)])
It relies on a somewhat counter-intuitive property of being able to have multiple async iterators over the same async generator, in which case each generated item is seen by only one iterator.
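A toy illustration of that property (a sketch, not part of the original answer): two consumers iterate the same async generator object, and each generated item is delivered to exactly one of them.
import asyncio

async def gen():
    for i in range(6):
        yield i

async def consumer(name, g):
    # both consumers share the same async generator object
    async for item in g:
        print(name, "got", item)
        await asyncio.sleep(0)  # yield control so the other consumer gets a turn

async def main():
    g = gen()
    await asyncio.gather(consumer("a", g), consumer("b", g))

asyncio.get_event_loop().run_until_complete(main())
Each of the six items is printed exactly once, by whichever consumer happened to request it.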
My gut tells me that none of these solutions behaves properly on cancellation.
Async isn't threads. If, for example, you have tasks that are file-IO bound, then write them async using aiofiles:
async with aiofiles.open('filename', mode='r') as f:
    contents = await f.read()
Then replace task with your tasks. If you want to run only 10 at a time, await asyncio.gather every 10 tasks; a sketch of that follows the example below.
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print(x, "is done")

async def run(loop):
    futs = []
    for x in range(50):
        futs.append(task(x))
    await asyncio.gather(*futs)

loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))
loop.close()
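Here is what "gather every 10 tasks" could look like as a rough sketch (the run_in_batches name and batch_size parameter are only illustrative). Note that each batch waits for its slowest member before the next batch starts, which is coarser than the semaphore or worker approaches above.
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print(x, "is done")

async def run_in_batches(batch_size=10):
    coros = [task(x) for x in range(50)]
    for i in range(0, len(coros), batch_size):
        # wait for this batch to finish completely before starting the next one
        await asyncio.gather(*coros[i:i + batch_size])

loop = asyncio.get_event_loop()
loop.run_until_complete(run_in_batches())
loop.close()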
If you can't write the tasks async and need threads, this is a basic example using concurrent.futures.ThreadPoolExecutor with asyncio's run_in_executor. Note that with max_workers=5 only 5 tasks run at a time.
import time
from concurrent.futures import ThreadPoolExecutor
import asyncio

def blocking(x):
    time.sleep(1)
    print(x, "is done")

async def run(loop):
    futs = []
    executor = ThreadPoolExecutor(max_workers=5)
    for x in range(15):
        future = loop.run_in_executor(executor, blocking, x)
        futs.append(future)
    await asyncio.sleep(4)
    res = await asyncio.gather(*futs)

loop = asyncio.get_event_loop()
loop.run_until_complete(run(loop))
loop.close()
As pointed out by Dima Tismek, using semaphores to limit concurrency is vulnerable to exhausting task_generator too eagerly, since there is no backpressure between obtaining the tasks and submitting them to the event loop. A better option, also explored by the other answer, is not to spawn a task as soon as the generator has produced an item, but to create a fixed number of workers that exhaust the generator concurrently.
There are two areas where the code could be improved:
there is no need for a semaphore - it is superfluous when the number of tasks is fixed to begin with;
handling cancellation of generated tasks and of the throttling task.
Here is an implementation that tackles both issues:
async def throttle(task_generator, max_tasks):
    it = task_generator.__aiter__()
    cancelled = False

    async def worker():
        async for task in it:
            try:
                await task
            except asyncio.CancelledError:
                # If a generated task is canceled, let its worker
                # proceed with other tasks - except if it's the
                # outer coroutine that is cancelling us.
                if cancelled:
                    raise
            # other exceptions are propagated to the caller

    worker_tasks = [asyncio.create_task(worker())
                    for i in range(max_tasks)]
    try:
        await asyncio.gather(*worker_tasks)
    except:
        # In case of exception in one worker, or in case we're
        # being cancelled, cancel all workers and propagate the
        # exception.
        cancelled = True
        for t in worker_tasks:
            t.cancel()
        raise
A simple test case:
import asyncio
import random

async def mock_task(num):
    print('running', num)
    await asyncio.sleep(random.uniform(1, 5))
    print('done', num)

async def mock_gen():
    tnum = 0
    while True:
        await asyncio.sleep(.1 * random.random())
        print('generating', tnum)
        yield asyncio.create_task(mock_task(tnum))
        tnum += 1

if __name__ == '__main__':
    asyncio.run(throttle(mock_gen(), 3))

How to name thread for logging with concurrent.futures?

I am creating a web scraper that scrapes multiple domains in different threads. As there are many different domains, I would like to be able to search the logged info per thread.
UPDATE: solution implemented in code. Follow # SOLUTION lines
The script has been set up as follows:
import logging
from queue import Queue, Empty
from threading import current_thread  # SOLUTION
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(
    format='%(threadName)s %(levelname)s: %(message)s',
    level=logging.INFO
)

class Scraper:
    def __init__(self, max_workers):
        self.pool = ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix='T')
        self.to_crawl = Queue()
        for task in self.setup_tasks(tasks=max_workers):
            logging.info('Putting task to queue:\n{}'.format(task))
            self.to_crawl.put(task)
        logging.info('Queue size after init: {}'.format(self.to_crawl.qsize()))

    def setup_tasks(self, cur, tasks=1):
        # Prepare tasks for the queue
        ...

    def run_task(self, task):
        # Function for executing the task
        current_thread().name = task['id']  # SOLUTION
        logging.info('Executing task:\n{}'.format(task))
        id = task['id']  # I want the task id to be reflected in the logging output when run_task runs

    def run_scraper(self):
        while True:
            logging.info('Launching new thread, queue size is {}'.format(self.to_crawl.qsize()))
            try:
                task = self.to_crawl.get()
                self.pool.submit(self.run_task, task)
            except Empty:
                break

if __name__ == '__main__':
    s = Scraper(max_workers=3)
    s.run_scraper()
I would like to add the task['id'] to the logging format configuration instead of the given %(threadName)s, without doing it manually each time the script logs something in run_task.
Is there a way to assign task['id'] to the thread's %(threadName)s when the thread takes the task in run_scraper?
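A stripped-down illustration of the approach marked # SOLUTION above (a sketch with made-up task ids): renaming the current thread at the start of the task makes %(threadName)s show the task id for every record logged from that task.
import logging
from threading import current_thread
from concurrent.futures import ThreadPoolExecutor

logging.basicConfig(format='%(threadName)s %(levelname)s: %(message)s', level=logging.INFO)

def run_task(task):
    current_thread().name = task['id']  # every later record from this thread shows the task id
    logging.info('started')
    logging.info('finished')

with ThreadPoolExecutor(max_workers=3, thread_name_prefix='T') as pool:
    for task in [{'id': 'domain-a'}, {'id': 'domain-b'}, {'id': 'domain-c'}]:
        pool.submit(run_task, task)
Because pool threads are reused, the name assigned by one task sticks until the next task reassigns it, which is why the rename is done at the top of run_task.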

python Websockets (asyncio ver) force close connection

I am coding for python >3.5.
I am using Websockets 6.0 library that is here:
https://github.com/aaugustin/websockets
I have been calling them the asyncio websockets since they are based on asyncio.
In my searching I found a lot about "lost connections", but I am looking at how to cancel a current ws.recv().
A call to .start() creates a helper thread that starts the asyncio event loop. Then the receive function starts, calls the connect function, and the websocket ws is instantiated; the receive function then waits for incoming messages. When I am ready to stop, .stop() is called. I was expecting the stop function to stop the awaited ws.recv(): with the keep_running flag set to false and ws.close() called, I would expect the ws.recv() to end and then the keep_running loop to end. That is not what is happening. I see all three stops, but never the receive stop.
command is: stop
Do command is stopped
Stop 1
Stop 2
Stop 3
^CException ignored in: <module 'threading' from '/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py'>
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 1294, in _shutdown
t.join()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 1056, in join
self._wait_for_tstate_lock()
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 1072, in _wait_for_tstate_lock
elif lock.acquire(block, timeout):
KeyboardInterrupt
(pyalmondplus) Pauls-MBP:pyalmondplus paulenright$
Code for reference:
import threading
import asyncio
import websockets
import json

class PyAlmondPlus:
    def __init__(self, api_url, event_callback=None):
        self.api_url = api_url
        self.ws = None
        self.loop = asyncio.get_event_loop()
        self.receive_task = None
        self.event_callback = event_callback
        self.keep_running = False

    async def connect(self):
        print("connecting")
        if self.ws is None:
            print("opening socket")
            self.ws = await websockets.connect(self.api_url)
            print(self.ws)

    async def disconnect(self):
        pass

    async def send(self, message):
        pass

    async def receive(self):
        print("receive started")
        while self.keep_running:
            if self.ws is None:
                await self.connect()
            recv_data = await self.ws.recv()
            print(recv_data)
        print("receive ended")

    def start(self):
        self.keep_running = True
        print("Start 1")
        print("Start 2")
        t = threading.Thread(target=self.start_loop, args=())
        print("Start 3")
        t.start()
        print("Receiver running")

    def start_loop(self):
        print("Loop helper 1")
        policy = asyncio.get_event_loop_policy()
        policy.set_event_loop(policy.new_event_loop())
        self.loop = asyncio.get_event_loop()
        self.loop.set_debug(True)
        asyncio.set_event_loop(self.loop)
        self.loop.run_until_complete(self.receive())
        print("Loop helper 2")

    def stop(self):
        print("Stop 1")
        self.keep_running = False
        print("Stop 2")
        self.ws.close()
        print("Stop 3")
I am looking at how to cancel a current ws.recv() [...] I see all three stops, but never the receive stop.
Your receive coroutine is likely suspended waiting for some data to arrive, so it's not in a position to check the keep_running flag.
The easy and robust way to stop a running coroutine is to cancel the asyncio Task that drives it. That will immediately un-suspend the coroutine and make whatever it was waiting for raise a CancelledError. When using cancel you don't need a keep_running flag at all, the exception will terminate the loop automatically.
A call to the .start() creates a helper thread to start the asyncio event loop.
This works, but you don't really need a new thread and a whole new event loop for each instance of PyAlmondPlus. Asyncio is designed to run inside a single thread, so one event loop instance can host any number of coroutines.
Here is a possible design that implements both ideas (not tested with actual web sockets):
import asyncio
import threading
import websockets

# pre-start a single thread that runs the asyncio event loop
bgloop = asyncio.new_event_loop()
_thread = threading.Thread(target=bgloop.run_forever)
_thread.daemon = True
_thread.start()

class PyAlmondPlus:
    def __init__(self, api_url):
        self.api_url = api_url
        self.ws = None

    async def connect(self):
        if self.ws is None:
            self.ws = await websockets.connect(self.api_url)

    async def receive(self):
        # keep_running is not needed - cancel the task instead
        while True:
            if self.ws is None:
                await self.connect()
            recv_data = await self.ws.recv()

    async def init_receive_task(self):
        self.receive_task = bgloop.create_task(self.receive())

    def start(self):
        # use run_coroutine_threadsafe to safely submit a coroutine
        # to the event loop running in a different thread
        init_done = asyncio.run_coroutine_threadsafe(
            self.init_receive_task(), bgloop)
        # wait for the init coroutine to actually finish
        init_done.result()

    def stop(self):
        # Cancel the running task. Since the event loop is in a
        # background thread, request cancellation with
        # call_soon_threadsafe.
        bgloop.call_soon_threadsafe(self.receive_task.cancel)
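Usage from the main thread would then look roughly like this (untested; the URL is only a placeholder):
import time

client = PyAlmondPlus("wss://example.invalid/api")
client.start()   # returns once the receive task exists on the background loop
time.sleep(30)   # ... the rest of the application runs ...
client.stop()    # cancels the receive task from the main thread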

Asyncio worker that handles N jobs at a time?

I'm trying to make an asyncio worker class that will consume jobs from a job queue and process up to N jobs in parallel. Some jobs may queue additional jobs. When the job queue is empty and the worker finishes all of its current jobs, it should end.
I'm still struggling with asyncio conceptually. Here is one of my attempts, where N=3:
import asyncio, logging, random

async def do_work(id_):
    await asyncio.sleep(random.random())
    return id_

class JobQueue:
    ''' Maintains a list of all pending jobs. '''
    def __init__(self):
        self._queue = asyncio.Queue()
        self._max_id = 10
        for id_ in range(self._max_id):
            self._queue.put_nowait(id_ + 1)

    def add_job(self):
        self._max_id += 1
        self._queue.put_nowait(self._max_id)

    async def get_job(self):
        return await self._queue.get()

    def has_jobs(self):
        return self._queue.qsize() > 0

class JobWorker:
    ''' Processes up to 3 jobs at a time in parallel. '''
    def __init__(self, job_queue):
        self._current_jobs = set()
        self._job_queue = job_queue
        self._semaphore = asyncio.Semaphore(3)

    async def run(self):
        while self._job_queue.has_jobs() or len(self._current_jobs) > 0:
            print('Acquiring semaphore...')
            await self._semaphore.acquire()
            print('Getting a job...')
            job_id = await self._job_queue.get_job()
            print('Scheduling job {}'.format(job_id))
            self._current_jobs.add(job_id)
            task = asyncio.Task(do_work(job_id))
            task.add_done_callback(self.task_finished)

    def task_finished(self, task):
        job_id = task.result()
        print('Finished job {} / released semaphore'.format(job_id))
        self._current_jobs.remove(job_id)
        self._semaphore.release()
        if random.random() < 0.2:
            print('Queuing a new job')
            self._job_queue.add_job()

loop = asyncio.get_event_loop()
jw = JobWorker(JobQueue())
print('Starting event loop')
loop.run_until_complete(jw.run())
print('Event loop ended')
loop.close()
An excerpt of the output:
Starting event loop
Acquiring semaphore...
Getting a job...
Scheduling job 1
Acquiring semaphore...
Getting a job...
Scheduling job 2
Acquiring semaphore...
Getting a job...
Scheduling job 3
Acquiring semaphore...
Finished job 2 / released semaphore
Getting a job...
Scheduling job 4
...snip...
Acquiring semaphore...
Finished job 11 / released semaphore
Getting a job...
Finished job 12 / released semaphore
Finished job 13 / released semaphore
It appears to correctly process all jobs while processing no more than 3 jobs at any one time. However, the program hangs after the last job is finished. As indicated by the output, it appears to be hanging at job_id = await self._job_queue.get_job(). Once the job queue is empty, this coroutine will never resume, and the check to see if the job queue is empty (at the top of the loop) isn't reached again.
I've tried working around this in a number of ways, but conceptually something just doesn't quite fit. My current WIP is passing some futures between the queue and the worker and then using some combination of asyncio.wait(...) on all of them, but it's getting ugly and I'm wondering if there is an elegant solution that I'm overlooking.
You could take advantage of queue.task_done that indicates that a formerly enqueued task is complete. Then you can combine queue.join and queue.get using asyncio.wait: if queue.join finishes and queue.get doesn't, this means all the jobs have been completed.
See this example:
class Worker:
    def __init__(self, func, n=3):
        self.func = func
        self.queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(n)

    def put(self, *args):
        self.queue.put_nowait(args)

    async def run(self):
        while True:
            args = await self._get()
            if args is None:
                return
            asyncio.ensure_future(self._target(args))

    async def _get(self):
        get_task = asyncio.ensure_future(self.queue.get())
        join_task = asyncio.ensure_future(self.queue.join())
        await asyncio.wait([get_task, join_task], return_when='FIRST_COMPLETED')
        if get_task.done():
            return get_task.result()

    async def _target(self, args):
        try:
            async with self.semaphore:
                return await self.func(*args)
        finally:
            self.queue.task_done()
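Driving the Worker could look roughly like this (a sketch, not from the original answer; do_work and the job count are made up): enqueue the jobs, then await run(), which returns once queue.join() reports that every enqueued job has been marked done.
import asyncio
import random

async def do_work(id_):
    await asyncio.sleep(random.random())
    print('job', id_, 'done')

async def main():
    worker = Worker(do_work, n=3)
    for id_ in range(10):
        worker.put(id_)
    await worker.run()  # returns when the queue has been joined

asyncio.get_event_loop().run_until_complete(main())
(The leftover queue.get() future from the final _get call is never awaited, so a "Task was destroyed but it is pending" warning at shutdown is possible.)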
You can time out get_job with a simple asyncio.wait_for, for example with a 1 second timeout, and get back to the beginning of the loop on timeout.
async def run(self):
    while self._job_queue.has_jobs() or len(self._current_jobs) > 0:
        print('Acquiring semaphore...')
        await self._semaphore.acquire()
        print('Getting a job...')
        try:
            job_id = await asyncio.wait_for(self._job_queue.get_job(), 1)
        except asyncio.TimeoutError:
            # release the slot acquired above before retrying the loop
            self._semaphore.release()
            continue
        print('Scheduling job {}'.format(job_id))
        self._current_jobs.add(job_id)
        task = asyncio.Task(do_work(job_id))
        task.add_done_callback(self.task_finished)

Multi-threaded asyncio in Python

I'm currently doing my first steps with asyncio in Python 3.5 and there is one problem that's bugging me. Obviously I haven't fully understood coroutines...
Here is a simplified version of what I'm doing.
In my class I have an open() method that creates a new thread. Within that thread I create a new event loop and a socket connection to some host. Then I let the loop run forever.
def open(self):
    # create thread
    self.thread = threading.Thread(target=self._thread)
    self.thread.start()
    # wait for connection
    while self.protocol is None:
        time.sleep(0.1)

def _thread(self):
    # create loop, connection and run forever
    self.loop = asyncio.new_event_loop()
    coro = self.loop.create_connection(lambda: MyProtocol(self.loop),
                                       'somehost.com', 1234)
    self.loop.run_until_complete(coro)
    self.loop.run_forever()
Stopping the connection is now quite simple, I just stop the loop from the main thread:
loop.call_soon_threadsafe(loop.stop)
Unfortunately I need to do some cleanup, especially I need to empty a queue before disconnecting from the server. So I tried something like this stop() method in MyProtocol:
class MyProtocol(asyncio.Protocol):
    def __init__(self, loop):
        self._loop = loop
        self._queue = []

    async def stop(self):
        # wait for all queues to empty
        while self._queue:
            await asyncio.sleep(0.1)
        # disconnect
        self.close()
        self._loop.stop()
The queue gets emptied from within the protocol's data_received() method, so I just want to wait for that to happen using the while loop with the asyncio.sleep() call. Afterwards I close the connection and stop the loop.
But how do I call this method from the main thread and wait for it?
I tried the following, but none of them seem to work (protocol is the currently used instance of MyProtocol):
loop.call_soon_threadsafe(protocol.stop)
loop.call_soon_threadsafe(functools.partial(asyncio.ensure_future, protocol.stop(), loop=loop))
asyncio.ensure_future(protocol.stop(), loop=loop)
Can anyone please help me here? Thanks!
Basically you want to schedule a coroutine on the loop of a different thread. You could use run_coroutine_threadsafe:
future = asyncio.run_coroutine_threadsafe(protocol.stop(), loop=loop)
future.result()  # wait for the result
Or the old-style approach, as in https://stackoverflow.com/a/32084907/681044:
import asyncio
from threading import Thread

loop = asyncio.new_event_loop()

def f(loop):
    asyncio.set_event_loop(loop)
    loop.run_forever()

t = Thread(target=f, args=(loop,))
t.start()

@asyncio.coroutine
def g():
    yield from asyncio.sleep(1)
    print('Hello, world!')

# asyncio.ensure_future replaces the old asyncio.async, which is unusable on Python 3.7+
loop.call_soon_threadsafe(asyncio.ensure_future, g())
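For reference, roughly the same thing in modern syntax (a sketch; async def replaces @asyncio.coroutine, and run_coroutine_threadsafe also hands back a future the calling thread can wait on):
import asyncio
from threading import Thread

loop = asyncio.new_event_loop()
Thread(target=loop.run_forever, daemon=True).start()

async def g():
    await asyncio.sleep(1)
    print('Hello, world!')

# schedule g() on the background loop and block until it finishes
asyncio.run_coroutine_threadsafe(g(), loop).result()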
