Asyncio worker that handles N jobs at a time? - python-3.5

I'm trying to make an asyncio worker class that will consume jobs from a job queue and process up to N jobs in parallel. Some jobs may queue additional jobs. When the job queue is empty and the worker finishes all of its current jobs, it should end.
I'm still struggling with asyncio conceptually. Here is one of my attempts, where N=3:
import asyncio, logging, random

async def do_work(id_):
    await asyncio.sleep(random.random())
    return id_

class JobQueue:
    ''' Maintains a list of all pending jobs. '''
    def __init__(self):
        self._queue = asyncio.Queue()
        self._max_id = 10
        for id_ in range(self._max_id):
            self._queue.put_nowait(id_ + 1)

    def add_job(self):
        self._max_id += 1
        self._queue.put_nowait(self._max_id)

    async def get_job(self):
        return await self._queue.get()

    def has_jobs(self):
        return self._queue.qsize() > 0

class JobWorker:
    ''' Processes up to 3 jobs at a time in parallel. '''
    def __init__(self, job_queue):
        self._current_jobs = set()
        self._job_queue = job_queue
        self._semaphore = asyncio.Semaphore(3)

    async def run(self):
        while self._job_queue.has_jobs() or len(self._current_jobs) > 0:
            print('Acquiring semaphore...')
            await self._semaphore.acquire()
            print('Getting a job...')
            job_id = await self._job_queue.get_job()
            print('Scheduling job {}'.format(job_id))
            self._current_jobs.add(job_id)
            task = asyncio.Task(do_work(job_id))
            task.add_done_callback(self.task_finished)

    def task_finished(self, task):
        job_id = task.result()
        print('Finished job {} / released semaphore'.format(job_id))
        self._current_jobs.remove(job_id)
        self._semaphore.release()
        if random.random() < 0.2:
            print('Queuing a new job')
            self._job_queue.add_job()

loop = asyncio.get_event_loop()
jw = JobWorker(JobQueue())
print('Starting event loop')
loop.run_until_complete(jw.run())
print('Event loop ended')
loop.close()
An excerpt of the output:
Starting event loop
Acquiring semaphore...
Getting a job...
Scheduling job 1
Acquiring semaphore...
Getting a job...
Scheduling job 2
Acquiring semaphore...
Getting a job...
Scheduling job 3
Acquiring semaphore...
Finished job 2 / released semaphore
Getting a job...
Scheduling job 4
...snip...
Acquiring semaphore...
Finished job 11 / released semaphore
Getting a job...
Finished job 12 / released semaphore
Finished job 13 / released semaphore
It appears to correctly process all jobs while processing no more than 3 jobs at any one time. However, the program hangs after the last job is finished. As indicated by the output, it appears to be hanging at job_id = await self._job_queue.get_job(). Once the job queue is empty, this coroutine will never resume, and the check to see if the job queue is empty (at the top of the loop) isn't reached again.
I've tried working around this in a number of ways, but conceptually something just doesn't quite fit. My current WIP passes some futures between the queue and the worker and then uses some combination of asyncio.wait(...) on all of them, but it's getting ugly and I'm wondering if there is an elegant solution that I'm overlooking.

You could take advantage of queue.task_done, which indicates that a formerly enqueued task is complete. Then you can combine queue.join and queue.get using asyncio.wait: if queue.join finishes and queue.get doesn't, it means all the jobs have been completed.
See this example:
import asyncio

class Worker:
    def __init__(self, func, n=3):
        self.func = func
        self.queue = asyncio.Queue()
        self.semaphore = asyncio.Semaphore(n)

    def put(self, *args):
        self.queue.put_nowait(args)

    async def run(self):
        while True:
            args = await self._get()
            if args is None:
                return
            asyncio.ensure_future(self._target(args))

    async def _get(self):
        get_task = asyncio.ensure_future(self.queue.get())
        join_task = asyncio.ensure_future(self.queue.join())
        await asyncio.wait([get_task, join_task], return_when=asyncio.FIRST_COMPLETED)
        if get_task.done():
            return get_task.result()

    async def _target(self, args):
        try:
            async with self.semaphore:
                return await self.func(*args)
        finally:
            self.queue.task_done()
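For context, here is a minimal usage sketch (hedged: do_work and the ten initial job ids mirror the question; the driver code around it is illustrative):
import asyncio, random

async def do_work(id_):
    await asyncio.sleep(random.random())
    return id_

async def main():
    worker = Worker(do_work, n=3)
    for id_ in range(1, 11):
        worker.put(id_)      # enqueue the initial jobs
    await worker.run()       # returns once queue.join() completes

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
loop.close()
Note that the final queue.get() future is still pending when run() returns, so this sketch may log a "Task was destroyed but it is pending" warning at shutdown.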

You can time out get_job with a simple asyncio.wait_for, for example with 1 s, and go back to the beginning of the loop on timeout.
async def run(self):
    while self._job_queue.has_jobs() or len(self._current_jobs) > 0:
        print('Acquiring semaphore...')
        await self._semaphore.acquire()
        print('Getting a job...')
        try:
            job_id = await asyncio.wait_for(self._job_queue.get_job(), 1)
        except asyncio.TimeoutError:
            self._semaphore.release()  # give the slot back before re-checking the loop condition
            continue
        print('Scheduling job {}'.format(job_id))
        self._current_jobs.add(job_id)
        task = asyncio.Task(do_work(job_id))
        task.add_done_callback(self.task_finished)

Related

Run asyncio loop in a separate thread

I have a component of an application that needs to run an IOLoop in a separate thread. I try to achieve that by creating a new IOLoop in a background Thread and starting the loop. My original use case is to keep scheduling a bunch of async tasks periodically.
To achieve this, I:
Create an event loop in a background thread.
Start the thread and call asyncio.run_coroutine_threadsafe(self._start, self._loop)
import asyncio
from contextlib import suppress
from threading import Thread

class AsyncScheduler(object):
    """
    Async Schedule Class.
    This class:
    - Will run on a separate event loop on a separate thread.
    - Will periodically(every minute) schedule tasks for Requester.
    """
    def __init__(self, batch_manager, requester):
        self._requester = requester
        self._is_started = False
        self._tasks = []
        self._loop = None
        self.start()

    def start(self):
        """
        Start a new event loop in a thread.
        call eventloop.run(self._start)
        :return:
        """
        print("STARTING")
        self._loop = asyncio.new_event_loop()
        # start new loop in thread.
        Thread(target=self._loop.run_forever).start()
        asyncio.run_coroutine_threadsafe(self._start, self._loop)

    def stop(self):
        if self._loop:
            # cancel tasks
            self._loop.call_soon_threadsafe(self._stop)
            # stop the loop.
            self._loop.stop()

    async def _start(self):
        """
        Create three tasks for 3 API versions.
        Schedule each task on the event loop using
        asyncio.gather.
        :return:
        """
        versions = [1, 2, 3]
        print("ASYNC START")
        if not self._is_started:
            self._is_started = True
            for version in versions:
                self._tasks.append(
                    self.create_task(60, version)
                )
            await asyncio.gather(*self._tasks)

    async def _stop(self):
        for task in self._tasks:
            task.cancel()
            with suppress(asyncio.CancelledError):
                await task

    async def execute(self, api_version):
        """
        This method gets the batch to be executed and
        tells the requester to run it.
        :param api_version:
        :return:
        """
        await self._requester.run()

    async def create_task(self, sleep_time, api_version):
        """
        Calls the tasks in infinite loop.
        :param sleep_time:
        :param api_version:
        :return:
        """
        while True:
            print("EVER CALLED")
            await self.execute(api_version)
            await asyncio.sleep(sleep_time)
Steps done in the code:
Call start from __init__.
In start, create an event loop within a new thread and start the loop with an awaitable.
I thought this was the way to use an event loop inside a separate thread. Alas, my awaitable self._start is never called and I get the error [A coroutine object is required].
Any ideas, what am I messing up here?
Thanks & Regards & Happy Thanksgiving to folks who celebrate.
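Regarding the [A coroutine object is required] error: asyncio.run_coroutine_threadsafe expects a coroutine object, not the coroutine function, so the call most likely needs to be asyncio.run_coroutine_threadsafe(self._start(), self._loop), with parentheses. A minimal, self-contained sketch of the pattern (names here are illustrative, not from the code above):
import asyncio
from threading import Thread

async def periodic():
    # Stand-in for a scheduling coroutine.
    for _ in range(3):
        print("tick")
        await asyncio.sleep(0.1)

loop = asyncio.new_event_loop()
Thread(target=loop.run_forever, daemon=True).start()

# run_coroutine_threadsafe needs a coroutine *object*, i.e. periodic(), not periodic.
future = asyncio.run_coroutine_threadsafe(periodic(), loop)
future.result()                     # blocks the calling thread until the coroutine finishes
loop.call_soon_threadsafe(loop.stop)
The same trap likely applies to stop(): call_soon_threadsafe expects a plain callable, so passing the async _stop there would only create a coroutine that is never awaited.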

Why python asyncio code stucks on the first concurrent task?

While learning and testing asyncio, I wrote the code below with 3 concurrent tasks.
import asyncio
from time import time

tasks_to_schedule = []
task_queue = []

class Test():
    def __init__(self, task_name, repeat_every):
        self.name = task_name
        self.repeat_every = repeat_every
        self.scheduled = 0

    def schedule(self, t_now):
        self.scheduled = t_now

    async def run(self):
        print(f'It is {self.name}')
        print(f'{self.name} running...')
        await asyncio.sleep(2)
        print(f'{self.name} finished')

    def check_result(self):
        pass

    async def report(self):
        print(f'{self.name} report DONE')
        await asyncio.sleep(1)

def prepare_tasks():
    task_a = Test('Task A', 2)
    task_b = Test('Task B', 4)
    tasks_to_schedule.append(task_a)
    tasks_to_schedule.append(task_b)

async def scheduler():
    turn = 0
    while turn < 5:
        if tasks_to_schedule:
            print(f'***\t Turn {turn} \t***')
            task = tasks_to_schedule.pop(0)
            if task.scheduled < time():
                task_queue.append(task)
                print(f'adding task {task.name} to queue,\n queue size = {len(task_queue)}')
                turn += 1
            else:
                tasks_to_schedule.append(task)
        await asyncio.sleep(1)

async def worker(name):
    while True:
        if task_queue:
            task = task_queue.pop(0)
            print(f'Worker {name} - took task {task.name}')
            await task.run()
            await task.report()
            print(f'Worker {name} - task {task.name} completed, reschedule it')
            task.schedule(time())
            tasks_to_schedule.append(task)
        # await asyncio.sleep(1)  # Process stuck without this line

async def main():
    task_scheduler = asyncio.create_task(scheduler())
    worker1 = asyncio.create_task(worker(1))
    worker2 = asyncio.create_task(worker(2))
    await asyncio.gather(task_scheduler, worker1, worker2)

if __name__ == '__main__':
    prepare_tasks()
    asyncio.run(main())
The problem is that the process gets stuck after "Task A running..."; the only output is:
*** Turn 0 ***
adding task Task A to queue,
queue size = 1
Worker 1 - took task Task A
It is Task A
Task A running...
After several tries I noticed that, with an additional await asyncio.sleep(1) line at the end of the loop inside the worker func, the process runs correctly without getting stuck.
I wonder what the reason is.
Could someone please explain why this additional line changes everything?
Platform: Python 3.9.4, Windows 10 x64, inside venv.
Update: I've added an additional debugging line to the worker:
async def worker(name):
    while True:
        print(f'{strftime("%X")}: worker loop')  # this line
and I can see an endless worker loop in the output...
Now I see it: the worker can't find a task...
Solved :)
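For anyone else hitting this: asyncio is cooperative, so a coroutine only hands control back to the event loop when it awaits. Here the idle worker finds task_queue empty and spins through its while True loop without ever awaiting, which starves the loop, so Task A's asyncio.sleep(2) can never complete and the scheduler never runs again. Any await on the empty path fixes it; a minimal sketch (asyncio.sleep(0) just yields control once):
async def worker(name):
    while True:
        if not task_queue:
            # Nothing to do: yield to the event loop so the scheduler
            # and the other worker's pending awaits can make progress.
            await asyncio.sleep(0)
            continue
        task = task_queue.pop(0)
        print(f'Worker {name} - took task {task.name}')
        await task.run()
        await task.report()
        print(f'Worker {name} - task {task.name} completed, reschedule it')
        task.schedule(time())
        tasks_to_schedule.append(task)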

Getting returning value from multithreading in python 3

I'm trying to get one or several return values from a thread in a multithreading process. The code I show gets stuck in a loop, with no way to interrupt it with Ctrl-C or Ctrl-D.
import queue as Queue
import threading

class myThread (threading.Thread):
    def __init__(self, threadID, name, region):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.region = region

    def run(self):
        GetSales(self.region)

def GetSales(strReg):
    print("Thread-" + strReg)
    return "Returning-" + strReg

def Main():
    RegionList = []
    RegionList.append("EMEA")
    RegionList.append("AP")
    RegionList.append("AM")
    # Create threads
    threads = []
    x = 0
    for region in RegionList:
        x += 1
        rthread = myThread(x, "Thread-" + region, region)  # Create new thread
        rthread.start()                                    # Start new thread
        threads.append(rthread)                            # Add new thread to threads list
    que = Queue.Queue()
    # Wait for all threads to complete
    for t in threads:
        t.join()
        result = que.get()
        print(t.name + " -> Done")

Main()
If I comment out the line result = que.get(), the program runs with no issues.
What you are looking for is futures and async management.
Firstly, your program loops indefinitely because of the line que.get(): since nothing is ever put into the queue, it waits for something that will never happen. You never actually use the queue for anything.
What you want to do is run an async task and get its result:
import asyncio

async def yourExpensiveTask():
    # some long calculation
    return 42

async def main():
    tasks = []
    tasks += [asyncio.create_task(yourExpensiveTask())]
    tasks += [asyncio.create_task(yourExpensiveTask())]
    for task in tasks:
        result = await task
        print(result)

asyncio.run(main())
See also https://docs.python.org/3/library/asyncio-task.html
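If you do want to stay with plain threads, a minimal sketch of the queue-based fix (hedged: names mirror the question; the key change is handing the queue to each thread and having GetSales put its return value on it):
import queue
import threading

def GetSales(region, results):
    print("Thread-" + region)
    results.put("Returning-" + region)   # hand the return value back via the queue

def Main():
    results = queue.Queue()
    threads = []
    for region in ("EMEA", "AP", "AM"):
        t = threading.Thread(target=GetSales, args=(region, results))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    while not results.empty():
        print(results.get())

Main()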

Process tasks in batchs in asyncio

I have got a function that generates tasks (IO-bound tasks):
def get_task():
    while True:
        new_task = _get_task()
        if new_task is not None:
            yield new_task
        else:
            sleep(1)
And I am trying to write a consumer in asyncio that processes at most 10 tasks at a time; when one task finishes, it takes a new one.
I am not sure if I should use semaphores, or whether there is some kind of asyncio pool executor. I started to write pseudocode with threads:
def run(self):
    while True:
        self.semaphore.acquire()  # first acquire, then get task
        t = get_task()
        self.process_task(t)

def process_task(self, task):
    try:
        self.execute_task(task)
        self.mark_as_done(task)
    except:
        self.mark_as_failed(task)
    self.semaphore.release()
Could anyone help me? I have no clue where to put the async/await keywords.
A simple task cap using asyncio.Semaphore:
async def max10(task_generator):
    semaphore = asyncio.Semaphore(10)

    async def bounded(task):
        async with semaphore:
            return await task

    async for task in task_generator:
        asyncio.ensure_future(bounded(task))
The problem with this solution is that tasks are drawn from the generator greedily. For example, if the generator reads from a large database, the program could run out of memory.
Other than that it's idiomatic and well-behaved.
A solution that uses the async generator protocol to pull new tasks on demand:
async def max10(task_generator):
    tasks = set()
    gen = task_generator.__aiter__()
    try:
        while True:
            while len(tasks) < 10:
                tasks.add(await gen.__anext__())
            _done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
    except StopAsyncIteration:
        await asyncio.gather(*tasks)
It may be considered sub-optimal, because it doesn't start executing tasks until 10 are available.
And here's a concise and somewhat magical solution using the worker pattern:
async def max10(task_generator):
    async def worker():
        async for task in task_generator:
            await task

    await asyncio.gather(*[worker() for i in range(10)])
It relies on a somewhat counter-intuitive property of being able to have multiple async iterators over the same async generator, in which case each generated item is seen by only one iterator.
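A tiny sketch of that sharing property (hedged: the generator here is deliberately trivial and does no awaiting of its own, so the two consumers never overlap inside it):
import asyncio

async def gen():
    for i in range(6):
        yield i

async def consume(name, g):
    async for item in g:
        print(name, 'got', item)
        await asyncio.sleep(0)   # give the other consumer a turn

async def main():
    g = gen()
    # Both consumers iterate over the *same* async generator object;
    # each yielded item is seen by exactly one of them.
    await asyncio.gather(consume('A', g), consume('B', g))

asyncio.run(main())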
My gut tells me that none of these solutions behaves properly on cancellation.
Async isn't threads. If, for example, you have tasks that are file-IO bound, then write them async using aiofiles:
async with aiofiles.open('filename', mode='r') as f:
    contents = await f.read()
Then replace task with your tasks. If you want to run only 10 at a time, await asyncio.gather every 10 tasks (see the batched sketch after the example below).
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print( x, "is done" )

async def run(loop):
    futs = []
    for x in range(50):
        futs.append( task(x) )
    await asyncio.gather( *futs )

loop = asyncio.get_event_loop()
loop.run_until_complete( run(loop) )
loop.close()
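The example above gathers all 50 coroutines at once; here is a hedged sketch of the "gather every 10 tasks" batching mentioned above (the batch size and names are illustrative):
import asyncio

async def task(x):
    await asyncio.sleep(0.5)
    print( x, "is done" )

async def run_batched(batch_size=10):
    coros = [task(x) for x in range(50)]
    for i in range(0, len(coros), batch_size):
        # Each gather call runs at most batch_size tasks concurrently
        # and only returns once the whole batch has finished.
        await asyncio.gather(*coros[i:i + batch_size])

loop = asyncio.get_event_loop()
loop.run_until_complete(run_batched())
loop.close()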
If you can't write the tasks async and need threads, this is a basic example using concurrent.futures.ThreadPoolExecutor together with loop.run_in_executor. Note that with max_workers=5 only 5 tasks run at a time.
import time
from concurrent.futures import ThreadPoolExecutor
import asyncio

def blocking(x):
    time.sleep(1)
    print( x, "is done" )

async def run(loop):
    futs = []
    executor = ThreadPoolExecutor(max_workers=5)
    for x in range(15):
        future = loop.run_in_executor(executor, blocking, x)
        futs.append( future )
    await asyncio.sleep(4)
    res = await asyncio.gather( *futs )

loop = asyncio.get_event_loop()
loop.run_until_complete( run(loop) )
loop.close()
As pointed out by Dima Tismek, using semaphores to limit concurrency is vulnerable to exhausting task_generator too eagerly, since there is no backpressure between obtaining the tasks and submitting them to the event loop. A better option, also explored by the other answer, is not to spawn a task as soon as the generator has produced an item, but to create a fixed number of workers that exhaust the generator concurrently.
There are two areas where the code could be improved:
there is no need for a semaphore - it is superfluous when the number of tasks is fixed to begin with;
handling cancellation of generated tasks and of the throttling task.
Here is an implementation that tackles both issues:
async def throttle(task_generator, max_tasks):
    it = task_generator.__aiter__()
    cancelled = False

    async def worker():
        async for task in it:
            try:
                await task
            except asyncio.CancelledError:
                # If a generated task is canceled, let its worker
                # proceed with other tasks - except if it's the
                # outer coroutine that is cancelling us.
                if cancelled:
                    raise
            # other exceptions are propagated to the caller

    worker_tasks = [asyncio.create_task(worker())
                    for i in range(max_tasks)]
    try:
        await asyncio.gather(*worker_tasks)
    except:
        # In case of exception in one worker, or in case we're
        # being cancelled, cancel all workers and propagate the
        # exception.
        cancelled = True
        for t in worker_tasks:
            t.cancel()
        raise
A simple test case:
import asyncio
import random

async def mock_task(num):
    print('running', num)
    await asyncio.sleep(random.uniform(1, 5))
    print('done', num)

async def mock_gen():
    tnum = 0
    while True:
        await asyncio.sleep(.1 * random.random())
        print('generating', tnum)
        yield asyncio.create_task(mock_task(tnum))
        tnum += 1

if __name__ == '__main__':
    asyncio.run(throttle(mock_gen(), 3))

Printing a Response To A Partially Completed Python Async Event Loop While Still Completing The Task After The Response

I'm working with Sanic, but I'm a bit stuck. I'm calling 3 different APIs, each with its own response time.
I want to create a timeout function that allots an acceptable time for each task to return. But if a task isn't complete within the acceptable time, I want to return partial data, as I don't need a complete data set and speed is more of a focus.
However, I want to keep the unfinished task working until completion (i.e. requesting the API data and inserting it into a Postgres DB).
I'm wondering if we can do this without using some kind of scheduler to keep the task running in the background.
But if a task isn't complete within the acceptable time, I want to return partial data, as I don't need a complete data set and speed is more of a focus.
However, I want to keep the unfinished task working until completion
So the other tasks are independent of the timed-out task's state, right? If I understood you correctly, you just want to run 3 asyncio tasks with their own timeouts and aggregate their results at the end.
The only possible problem I see is "want to return partial data", since that can vary a lot depending on how things are organized, but we can probably just pass this "partial data" along with the cancellation exception raised inside the task on timeout.
Here's a little prototype:
import asyncio

class PartialData(Exception):
    def __init__(self, data):
        super().__init__()
        self.data = data

async def api_job(i):
    data = 'job {i}:'.format(i=i)
    try:
        await asyncio.sleep(1)
        data += ' step 1,'
        await asyncio.sleep(2)
        data += ' step 2,'
        await asyncio.sleep(2)
        data += ' step 3.'
    except asyncio.CancelledError as exc:
        raise PartialData(data)  # Pass partial data to outer code with our exception.
    else:
        return data

async def api_task(i, timeout):
    """Wrapper for api_job to run it with timeout and retrieve its partial data on timeout."""
    t = asyncio.ensure_future(api_job(i))
    try:
        await asyncio.wait_for(t, timeout)
    except asyncio.TimeoutError:
        try:
            await t
        except PartialData as exc:
            return exc.data  # retrieve partial data on timeout and return it.
    else:
        return t.result()

async def main():
    # Run 3 jobs with different timeouts:
    results = await asyncio.gather(
        api_task(1, timeout=2),
        api_task(2, timeout=4),
        api_task(3, timeout=6),
    )
    # Print results including "partial data":
    for res in results:
        print(res)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
Output:
job 1: step 1,
job 2: step 1, step 2,
job 3: step 1, step 2, step 3.
(as you can see, the first two jobs hit their timeouts and only part of their data was retrieved)
Update:
A more complex example, containing possible solutions to different situations:
import asyncio
from contextlib import suppress

async def stock1(_):
    await asyncio.sleep(1)
    return 'stock1 res'

async def stock2(exception_in_2):
    await asyncio.sleep(1)
    if exception_in_2:
        raise ValueError('Exception in stock2!')
    await asyncio.sleep(1)
    return 'stock2 res'

async def stock3(_):
    await asyncio.sleep(3)
    return 'stock3 res'

async def main():
    # Vary these values to see different situations:
    timeout = 2.5
    exception_in_2 = False

    # To run all three stocks just create tasks for them:
    tasks = [
        asyncio.ensure_future(s(exception_in_2))
        for s
        in (stock1, stock2, stock3)
    ]

    # Now we just wait until one of these possible situations happens:
    # 1) Everything done
    # 2) Exception occurred in one of the tasks
    # 3) Timeout occurred and at least two tasks are ready
    # 4) Timeout occurred and less than two tasks are ready
    # ( https://docs.python.org/3/library/asyncio-task.html#asyncio.wait )
    await asyncio.wait(
        tasks,
        timeout=timeout,
        return_when=asyncio.FIRST_EXCEPTION
    )

    is_success = all(t.done() and not t.exception() for t in tasks)
    is_exception = any(t.done() and t.exception() for t in tasks)
    is_good_timeout = \
        not is_success and \
        not is_exception and \
        sum(t.done() for t in tasks) >= 2
    is_bad_timeout = \
        not is_success and \
        not is_exception and \
        sum(t.done() for t in tasks) < 2

    # If success, just print all results:
    if is_success:
        print('All done before timeout:')
        for t in tasks:
            print(t.result())
    # If timeout, but at least two done,
    # print them, leaving the pending task executing.
    # But note two important things:
    # 1) You should guarantee the pending task is done before the loop is closed
    # 2) What if the pending task finishes with an error, is that ok?
    elif is_good_timeout:
        print('Timeout, but enough tasks done:')
        for t in tasks:
            if t.done():
                print(t.result())
    # Timeout and not enough tasks done,
    # let's just cancel all hanging tasks:
    elif is_bad_timeout:
        await cancel_and_retrieve(tasks)
        raise RuntimeError('Timeout and not enough tasks done')  # You probably want to indicate failure
    # If any of the tasks finished with an exception,
    # we should probably cancel unfinished tasks,
    # await all tasks done and retrieve all exceptions to prevent warnings
    # ( https://docs.python.org/3/library/asyncio-dev.html#detect-exceptions-never-consumed )
    elif is_exception:
        await cancel_and_retrieve(tasks)
        raise RuntimeError('Exception in one of tasks')  # You probably want to indicate failure

async def cancel_and_retrieve(tasks):
    """
    Cancel all pending tasks, retrieve all exceptions
    ( https://docs.python.org/3/library/asyncio-dev.html#detect-exceptions-never-consumed )
    It's a cleanup function if we don't want the tasks to continue.
    """
    for t in tasks:
        if not t.done():
            t.cancel()
    await asyncio.wait(
        tasks,
        return_when=asyncio.ALL_COMPLETED
    )
    for t in tasks:
        with suppress(Exception):
            await t

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        # If some tasks are still pending (is_good_timeout case),
        # let's kill them:
        loop.run_until_complete(
            cancel_and_retrieve(asyncio.Task.all_tasks())
        )
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
