How to decorate an asyncio.coroutine to retain its __name__? - python-3.x

I've tried to write a decorator function which wraps an asyncio.coroutine and returns the time it took to get done. The recipe below contains the code which is working as I expected. My only problem with it that somehow I loose the name of the decorated function despite the use of #functools.wraps. How to retain the name of the original coroutine? I checked the source of asyncio.
import asyncio
import functools
import random
import time
MULTIPLIER = 5
def time_resulted(coro):
#functools.wraps(coro)
#asyncio.coroutine
def wrapper(*args, **kargs):
time_before = time.time()
result = yield from coro(*args, **kargs)
if result is not None:
raise TypeError('time resulted coroutine can '
'only return None')
return time_before, time.time()
print('= wrapper.__name__: {!r} ='.format(wrapper.__name__))
return wrapper
#time_resulted
#asyncio.coroutine
def random_sleep():
sleep_time = random.random() * MULTIPLIER
print('{} -> {}'.format(time.time(), sleep_time))
yield from asyncio.sleep(sleep_time)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
tasks = [asyncio.Task(random_sleep()) for i in range(5)]
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
for task in tasks:
print(task, task.result()[1] - task.result()[0])
print('= random_sleep.__name__: {!r} ='.format(
random_sleep.__name__))
print('= random_sleep().__name__: {!r} ='.format(
random_sleep().__name__))
The result:
= wrapper.__name__: 'random_sleep' =
1397226479.00875 -> 4.261069174838891
1397226479.00875 -> 0.6596335046471768
1397226479.00875 -> 3.83421163259601
1397226479.00875 -> 2.5514027672929713
1397226479.00875 -> 4.497471439365472
Task(<wrapper>)<result=(1397226479.00875, 1397226483.274884)> 4.266134023666382
Task(<wrapper>)<result=(1397226479.00875, 1397226479.6697)> 0.6609499454498291
Task(<wrapper>)<result=(1397226479.00875, 1397226482.844265)> 3.835515022277832
Task(<wrapper>)<result=(1397226479.00875, 1397226481.562422)> 2.5536720752716064
Task(<wrapper>)<result=(1397226479.00875, 1397226483.51523)> 4.506479978561401
= random_sleep.__name__: 'random_sleep' =
= random_sleep().__name__: 'wrapper' =
As you can see random_sleep() returns a generator object with different name. I would like to retain the name of the decorated coroutine. I am not aware if this is problem is specific to asyncio.coroutines or not. I also tried the code with different decorator orders, but all has the same result. If I comment #functools.wraps(coro) then even random_sleep.__name__ becomes wrapper as I expected.
EDIT: I've posted this issue to Python Issue Tracker and received the following answer by R. David Murray: "I think this is a specific case of a more general need to improve 'wraps' that was discussed on python-dev not too long ago."

The issue is that functools.wraps changes only wrapper.__name__ and wrapper().__name__ stays wrapper. __name__ is a readonly generator attribute. You could use exec to set appropriate name:
import asyncio
import functools
import uuid
from textwrap import dedent
def wrap_coroutine(coro, name_prefix='__' + uuid.uuid4().hex):
"""Like functools.wraps but preserves coroutine names."""
# attribute __name__ is not writable for a generator, set it dynamically
namespace = {
# use name_prefix to avoid an accidental name conflict
name_prefix + 'coro': coro,
name_prefix + 'functools': functools,
name_prefix + 'asyncio': asyncio,
}
exec(dedent('''
def {0}decorator({0}wrapper_coro):
#{0}functools.wraps({0}coro)
#{0}asyncio.coroutine
def {wrapper_name}(*{0}args, **{0}kwargs):
{0}result = yield from {0}wrapper_coro(*{0}args, **{0}kwargs)
return {0}result
return {wrapper_name}
''').format(name_prefix, wrapper_name=coro.__name__), namespace)
return namespace[name_prefix + 'decorator']
Usage:
def time_resulted(coro):
#wrap_coroutine(coro)
def wrapper(*args, **kargs):
# ...
return wrapper
It works but there is probably a better way than using exec().

In the time since this question was asked, it became possible to change the name of a coroutine. It is done by setting __qualname__ (not __name__):
async def my_coro(): pass
c = my_coro()
print(repr(c))
# <coroutine object my_coro at 0x7ff8a7d52bc0>
c.__qualname__ = 'flimflam'
print(repr(c))
# <coroutine object flimflam at 0x7ff8a7d52bc0>
import asyncio
print(repr(asyncio.ensure_future(c)))
# <Task pending name='Task-737' coro=<flimflam() running at <ipython-input>:1>>
The usage of __qualname__ in a coroutine object's __repr__ is defined in the CPython source

Related

How to resolve pickle error caused by passing instance method to connurent.futures.ProcessPoolExecutor,submit() (multiprocessing in Python 3)?

I am really new to multiprocessing!
What I was trying to do:
Run a particular instance method i.e. ( wait_n_secs() which was slow!) as a separate process so that other processes can run on the side.
Once instance method is done processing we retrieve its output and use it using shared array provided by multiprocessing module.
Here is the code I was trying to run.
import cv2
import time
from multiprocessing import Array
import concurrent.futures
import copyreg as copy_reg
import types
def _pickle_method(m):
if m.im_self is None:
return getattr, (m.im_class, m.im_func.func_name)
else:
return getattr, (m.im_self, m.im_func.func_name)
copy_reg.pickle(types.MethodType, _pickle_method)
class Testing():
def __init__(self):
self.executor = concurrent.futures.ProcessPoolExecutor()
self.futures = None
self.shared_array = Array('i', 4)
def wait_n_secs(self,n):
print(f"I wait for {n} sec")
cv2.waitKey(n*1000)
wait_array = (n,n,n,n)
return wait_array
def function(waittime):
bbox = Testing().wait_n_secs(waittime)
return bbox
if __name__ =="__main__":
testing = Testing()
waittime = 5
# Not working!
testing.futures = testing.executor.submit(testing.wait_n_secs,waittime)
# Working!
#testing.futures = testing.executor.submit(function,waittime)
stime = time.time()
while 1:
if not testing.futures.running():
print("Checking for results")
testing.shared_array = testing.futures.result()
print("Shared_array received = ",testing.shared_array)
break
time_elapsed = time.time()-stime
if (( time_elapsed % 1 ) < 0.001):
print(f"Time elapsed since some time = {time_elapsed:.2f} sec")
Problems I faced:
1) Error on Python 3.6:
Traceback (most recent call last):
File "C:\Users\haide\AppData\Local\Programs\Python\Python36\lib\multiprocessing\queues.py", line 234, in _feed
obj = _ForkingPickler.dumps(obj)
File "C:\Users\haide\AppData\Local\Programs\Python\Python36\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
File "C:\Users\haide\AppData\Local\Programs\Python\Python36\lib\multiprocessing\queues.py", line 58, in __getstate__
context.assert_spawning(self)
File "C:\Users\haide\AppData\Local\Programs\Python\Python36\lib\multiprocessing\context.py", line 356, in assert_spawning
' through inheritance' % type(obj).__name__
RuntimeError: Queue objects should only be shared between processes through inheritance
2) Error on Python 3.8:
testing.shared_array = testing.futures.result()
File "C:\Users\haide\AppData\Local\Programs\Python\Python38\lib\concurrent\futures\_base.py", line 437, in result
return self.__get_result()
File "C:\Users\haide\AppData\Local\Programs\Python\Python38\lib\concurrent\futures\_base.py", line 389, in __get_result
raise self._exception
File "C:\Users\haide\AppData\Local\Programs\Python\Python38\lib\multiprocessing\queues.py", line 239, in _feed
obj = _ForkingPickler.dumps(obj)
File "C:\Users\haide\AppData\Local\Programs\Python\Python38\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot pickle 'weakref' object
As others like Amby, falviussn has previously asked.
Problem:
We get a pickling error specifically for instance methods in multiprocessing as they are unpickable.
Solution I tried (Partially):
The solution most mentioned is to use copy_reg to pickle the instance method.
I don't fully understand copy_reg. I have tried adding the lines of code to the top of mp.py provided by Nabeel. But I haven't got it to work.
(Important consideration): I am on Python 3 using copyreg and solutions seem to be using python 2 as they imported copy_reg (Python 2)
(I haven't tried):
Using dill because they were either not the case of multiprocessing or even if they were. They were not using concurrent.futures module.
Workaround:
Passing function that calls the instance method ( instead of the instance method directly ) to submit() method.
testing.futures = testing.executor.submit(function,waittime)
This does work. But does not seem like an elegant solution.
What I want:
Please guide me on how to correctly use copyreg as I clearly don't understand its workings.
Or
If it's a python3 issue, Suggest another solution where i can pass instance methods to conccurent.futurs.ProcessPoolExecutor.submit() for multi-processing. :)
Update #1:
#Aaron Can you share an example code of your solution? "passing a module level function that takes instance as an argument"
or
Correct my mistake here:
This was my attempt. :(
Passing the instance to the module level function along with the arguments
inp_args = [waittime]
testing.futures = testing.executor.submit(wrapper_func,testing,inp_args)
And this was the module wrapper function I created,
def wrapper_func(ins,*args):
ins.wait_n_secs(args)
This got me back to...
TypeError: cannot pickle 'weakref' object
We get a pickling error specifically for instance methods in multiprocessing as they are unpickable.
This is not true, instance methods are very much picklable in python 3 (unless they contain local attributes, like factory functions). You get the error because some other instance attributes (specific to your code) are not picklable.
Please guide me on how to correctly use copyreg as I clearly don't understand its workings.
It's not required here
If it's a python3 issue, Suggest another solution where i can pass instance methods to conccurent.futurs.ProcessPoolExecutor.submit() for multi-processing. :)
It's not really a python issue, it's to do with what data your sending to be pickled. Specifically, all three attributes (after they are populated), self.executor, self.futures and self.shared_array cannot be put on a multiprocessing.Queue (which ProcessPoolExecutor internally uses) and pickled.
So, the problem happens because you are passing an instance method as the target function, which means that all instance attributes are also implicitly pickled and sent to the other process. Since, some of these attributes are not picklable, this error is raised. This is also the reason why your workaround works, since the instance attributes are not pickled there as the target function is not an instance method. There are a couple of things you can do, the best way depends on if there are other attributes that you need to send as well.
Method #1
Judging from the sample code, your wait_n_secs function is not really using any instance attributes. Therefore, you can convert it into a staticmethod and pass that as the target function directly instead:
import time
from multiprocessing import Array
import concurrent.futures
class Testing():
def __init__(self):
self.executor = concurrent.futures.ProcessPoolExecutor()
self.futures = None
self.shared_array = Array('i', 4)
#staticmethod
def wait_n_secs(n):
print(f"I wait for {n} sec")
# Have your own implementation here
time.sleep(n)
wait_array = (n, n, n, n)
return wait_array
if __name__ == "__main__":
testing = Testing()
waittime = 5
testing.futures = testing.executor.submit(type(testing).wait_n_secs, waittime) # Notice the type(testing)
stime = time.time()
while 1:
if not testing.futures.running():
print("Checking for results")
testing.shared_array = testing.futures.result()
print("Shared_array received = ", testing.shared_array)
break
time_elapsed = time.time() - stime
if ((time_elapsed % 1) < 0.001):
print(f"Time elapsed since some time = {time_elapsed:.2f} sec")
Method #2
If your instance contains attributes which would be used by the target functions (so they can't be converted to staticmethods), then you can also explicitly not pass the unpicklable attributes of the instance when pickling using the __getstate__ method. This would mean that the instance recreated inside other processes would not have all these attributes either (since we did not pass them), so do keep that in mind:
import time
from multiprocessing import Array
import concurrent.futures
class Testing():
def __init__(self):
self.executor = concurrent.futures.ProcessPoolExecutor()
self.futures = None
self.shared_array = Array('i', 4)
def wait_n_secs(self, n):
print(f"I wait for {n} sec")
# Have your own implementation here
time.sleep(n)
wait_array = (n, n, n, n)
return wait_array
def __getstate__(self):
d = self.__dict__.copy()
# Delete all unpicklable attributes.
del d['executor']
del d['futures']
del d['shared_array']
return d
if __name__ == "__main__":
testing = Testing()
waittime = 5
testing.futures = testing.executor.submit(testing.wait_n_secs, waittime)
stime = time.time()
while 1:
if not testing.futures.running():
print("Checking for results")
testing.shared_array = testing.futures.result()
print("Shared_array received = ", testing.shared_array)
break
time_elapsed = time.time() - stime
if ((time_elapsed % 1) < 0.001):
print(f"Time elapsed since some time = {time_elapsed:.2f} sec")

Why serial code is faster than concurrent.futures in this case?

I am using the following code to process some pictures for my ML project and I would like to parallelize it.
import multiprocessing as mp
import concurrent.futures
def track_ids(seq):
'''The func is so big I can not put it here'''
ood = {}
for i in seq:
# I load around 500 images and process them
ood[i] = some Value
return ood
seqs = []
for seq in range(1, 10):# len(seqs)+1):
seq = txt+str(seq)
seqs.append(seq)
# serial call of the function
track_ids(seq)
#parallel call of the function
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
ood_id = ex.map(track_ids, seqs)
if I run the code serially it takes 3.0 minutes but for parallel with concurrent, it takes 3.5 minutes.
can someone please explain why is that? and present a way to solve the problem.
btw, I have 12 cores.
Thanks
Here's a brief example of how one might go about profiling multiprocessing code vs serial execution:
from multiprocessing import Pool
from cProfile import Profile
from pstats import Stats
import concurrent.futures
def track_ids(seq):
'''The func is so big I can not put it here'''
ood = {}
for i in seq:
# I load around 500 images and process them
ood[i] = some Value
return ood
def profile_seq():
p = Profile() #one and only profiler instance
p.enable()
seqs = []
for seq in range(1, 10):# len(seqs)+1):
seq = txt+str(seq)
seqs.append(seq)
# serial call of the function
track_ids(seq)
p.disable()
return Stats(p), seqs
def track_ids_pr(seq):
p = Profile() #profile the child tasks
p.enable()
retval = track_ids(seq)
p.disable()
return (Stats(p, stream="dummy"), retval)
def profile_parallel():
p = Profile() #profile stuff in the main process
p.enable()
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
retvals = ex.map(track_ids_pr, seqs)
p.disable()
s = Stats(p)
out = []
for ret in retvals:
s.add(ret[0])
out.append(ret[1])
return s, out
if __name__ == "__main__":
stat, retval = profile_parallel()
stat.print_stats()
EDIT: Unfortunately I found out that pstat.Stats objects cannot be used normally with multiprocessing.Queue because it is not pickleable (which is needed for the operation of concurrent.futures). Evidently it normally will store a reference to a file for the purpose of writing statistics to that file, and if none is given, it will by default grab a reference to sys.stdout. We don't actually need that reference however until we actually want to print out the statistics, so we can just give it a temporary value to prevent the pickle error, and then restore an appropriate value later. The following example should be copy-paste-able and run just fine rather than the pseudocode-ish example above.
from multiprocessing import Queue, Process
from cProfile import Profile
from pstats import Stats
import sys
def isprime(x):
for d in range(2, int(x**.5)):
if x % d == 0:
return False
return True
def foo(retq):
p = Profile()
p.enable()
primes = []
max_n = 2**20
for n in range(3, max_n):
if isprime(n):
primes.append(n)
p.disable()
retq.put(Stats(p, stream="dummy")) #Dirty hack: set `stream` to something picklable then override later
if __name__ == "__main__":
q = Queue()
p1 = Process(target=foo, args=(q,))
p1.start()
p2 = Process(target=foo, args=(q,))
p2.start()
s1 = q.get()
s1.stream = sys.stdout #restore original file
s2 = q.get()
# s2.stream #if we are just adding this `Stats` object to another the `stream` just gets thrown away anyway.
s1.add(s2) #add up the stats from both child processes.
s1.print_stats() #s1.stream gets used here, but not before. If you provide a file to write to instead of sys.stdout, it will write to that file)
p1.join()
p2.join()

How to use Multiprocessing pool in Databricks with pandas [duplicate]

I am sorry that I can't reproduce the error with a simpler example, and my code is too complicated to post. If I run the program in IPython shell instead of the regular Python, things work out well.
I looked up some previous notes on this problem. They were all caused by using pool to call function defined within a class function. But this is not the case for me.
Exception in thread Thread-3:
Traceback (most recent call last):
File "/usr/lib64/python2.7/threading.py", line 552, in __bootstrap_inner
self.run()
File "/usr/lib64/python2.7/threading.py", line 505, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/lib64/python2.7/multiprocessing/pool.py", line 313, in _handle_tasks
put(task)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
I would appreciate any help.
Update: The function I pickle is defined at the top level of the module. Though it calls a function that contains a nested function. i.e, f() calls g() calls h() which has a nested function i(), and I am calling pool.apply_async(f). f(), g(), h() are all defined at the top level. I tried simpler example with this pattern and it works though.
Here is a list of what can be pickled. In particular, functions are only picklable if they are defined at the top-level of a module.
This piece of code:
import multiprocessing as mp
class Foo():
#staticmethod
def work(self):
pass
if __name__ == '__main__':
pool = mp.Pool()
foo = Foo()
pool.apply_async(foo.work)
pool.close()
pool.join()
yields an error almost identical to the one you posted:
Exception in thread Thread-2:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 552, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 505, in run
self.__target(*self.__args, **self.__kwargs)
File "/usr/lib/python2.7/multiprocessing/pool.py", line 315, in _handle_tasks
put(task)
PicklingError: Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
The problem is that the pool methods all use a mp.SimpleQueue to pass tasks to the worker processes. Everything that goes through the mp.SimpleQueue must be pickable, and foo.work is not picklable since it is not defined at the top level of the module.
It can be fixed by defining a function at the top level, which calls foo.work():
def work(foo):
foo.work()
pool.apply_async(work,args=(foo,))
Notice that foo is pickable, since Foo is defined at the top level and foo.__dict__ is picklable.
I'd use pathos.multiprocesssing, instead of multiprocessing. pathos.multiprocessing is a fork of multiprocessing that uses dill. dill can serialize almost anything in python, so you are able to send a lot more around in parallel. The pathos fork also has the ability to work directly with multiple argument functions, as you need for class methods.
>>> from pathos.multiprocessing import ProcessingPool as Pool
>>> p = Pool(4)
>>> class Test(object):
... def plus(self, x, y):
... return x+y
...
>>> t = Test()
>>> p.map(t.plus, x, y)
[4, 6, 8, 10]
>>>
>>> class Foo(object):
... #staticmethod
... def work(self, x):
... return x+1
...
>>> f = Foo()
>>> p.apipe(f.work, f, 100)
<processing.pool.ApplyResult object at 0x10504f8d0>
>>> res = _
>>> res.get()
101
Get pathos (and if you like, dill) here:
https://github.com/uqfoundation
When this problem comes up with multiprocessing a simple solution is to switch from Pool to ThreadPool. This can be done with no change of code other than the import-
from multiprocessing.pool import ThreadPool as Pool
This works because ThreadPool shares memory with the main thread, rather than creating a new process- this means that pickling is not required.
The downside to this method is that python isn't the greatest language with handling threads- it uses something called the Global Interpreter Lock to stay thread safe, which can slow down some use cases here. However, if you're primarily interacting with other systems (running HTTP commands, talking with a database, writing to filesystems) then your code is likely not bound by CPU and won't take much of a hit. In fact I've found when writing HTTP/HTTPS benchmarks that the threaded model used here has less overhead and delays, as the overhead from creating new processes is much higher than the overhead for creating new threads and the program was otherwise just waiting for HTTP responses.
So if you're processing a ton of stuff in python userspace this might not be the best method.
As others have said multiprocessing can only transfer Python objects to worker processes which can be pickled. If you cannot reorganize your code as described by unutbu, you can use dills extended pickling/unpickling capabilities for transferring data (especially code data) as I show below.
This solution requires only the installation of dill and no other libraries as pathos:
import os
from multiprocessing import Pool
import dill
def run_dill_encoded(payload):
fun, args = dill.loads(payload)
return fun(*args)
def apply_async(pool, fun, args):
payload = dill.dumps((fun, args))
return pool.apply_async(run_dill_encoded, (payload,))
if __name__ == "__main__":
pool = Pool(processes=5)
# asyn execution of lambda
jobs = []
for i in range(10):
job = apply_async(pool, lambda a, b: (a, b, a * b), (i, i + 1))
jobs.append(job)
for job in jobs:
print job.get()
print
# async execution of static method
class O(object):
#staticmethod
def calc():
return os.getpid()
jobs = []
for i in range(10):
job = apply_async(pool, O.calc, ())
jobs.append(job)
for job in jobs:
print job.get()
I have found that I can also generate exactly that error output on a perfectly working piece of code by attempting to use the profiler on it.
Note that this was on Windows (where the forking is a bit less elegant).
I was running:
python -m profile -o output.pstats <script>
And found that removing the profiling removed the error and placing the profiling restored it. Was driving me batty too because I knew the code used to work. I was checking to see if something had updated pool.py... then had a sinking feeling and eliminated the profiling and that was it.
Posting here for the archives in case anybody else runs into it.
Can't pickle <type 'function'>: attribute lookup __builtin__.function failed
This error will also come if you have any inbuilt function inside the model object that was passed to the async job.
So make sure to check the model objects that are passed doesn't have inbuilt functions. (In our case we were using FieldTracker() function of django-model-utils inside the model to track a certain field). Here is the link to relevant GitHub issue.
This solution requires only the installation of dill and no other libraries as pathos
def apply_packed_function_for_map((dumped_function, item, args, kwargs),):
"""
Unpack dumped function as target function and call it with arguments.
:param (dumped_function, item, args, kwargs):
a tuple of dumped function and its arguments
:return:
result of target function
"""
target_function = dill.loads(dumped_function)
res = target_function(item, *args, **kwargs)
return res
def pack_function_for_map(target_function, items, *args, **kwargs):
"""
Pack function and arguments to object that can be sent from one
multiprocessing.Process to another. The main problem is:
«multiprocessing.Pool.map*» or «apply*»
cannot use class methods or closures.
It solves this problem with «dill».
It works with target function as argument, dumps it («with dill»)
and returns dumped function with arguments of target function.
For more performance we dump only target function itself
and don't dump its arguments.
How to use (pseudo-code):
~>>> import multiprocessing
~>>> images = [...]
~>>> pool = multiprocessing.Pool(100500)
~>>> features = pool.map(
~... *pack_function_for_map(
~... super(Extractor, self).extract_features,
~... images,
~... type='png'
~... **options,
~... )
~... )
~>>>
:param target_function:
function, that you want to execute like target_function(item, *args, **kwargs).
:param items:
list of items for map
:param args:
positional arguments for target_function(item, *args, **kwargs)
:param kwargs:
named arguments for target_function(item, *args, **kwargs)
:return: tuple(function_wrapper, dumped_items)
It returs a tuple with
* function wrapper, that unpack and call target function;
* list of packed target function and its' arguments.
"""
dumped_function = dill.dumps(target_function)
dumped_items = [(dumped_function, item, args, kwargs) for item in items]
return apply_packed_function_for_map, dumped_items
It also works for numpy arrays.
A quick fix is to make the function global
from multiprocessing import Pool
class Test:
def __init__(self, x):
self.x = x
#staticmethod
def test(x):
return x**2
def test_apply(self, list_):
global r
def r(x):
return Test.test(x + self.x)
with Pool() as p:
l = p.map(r, list_)
return l
if __name__ == '__main__':
o = Test(2)
print(o.test_apply(range(10)))
Building on #rocksportrocker solution,
It would make sense to dill when sending and RECVing the results.
import dill
import itertools
def run_dill_encoded(payload):
fun, args = dill.loads(payload)
res = fun(*args)
res = dill.dumps(res)
return res
def dill_map_async(pool, fun, args_list,
as_tuple=True,
**kw):
if as_tuple:
args_list = ((x,) for x in args_list)
it = itertools.izip(
itertools.cycle([fun]),
args_list)
it = itertools.imap(dill.dumps, it)
return pool.map_async(run_dill_encoded, it, **kw)
if __name__ == '__main__':
import multiprocessing as mp
import sys,os
p = mp.Pool(4)
res = dill_map_async(p, lambda x:[sys.stdout.write('%s\n'%os.getpid()),x][-1],
[lambda x:x+1]*10,)
res = res.get(timeout=100)
res = map(dill.loads,res)
print(res)
As #penky Suresh has suggested in this answer, don't use built-in keywords.
Apparently args is a built-in keyword when dealing with multiprocessing
class TTS:
def __init__(self):
pass
def process_and_render_items(self):
multiprocessing_args = [{"a": "b", "c": "d"}, {"e": "f", "g": "h"}]
with ProcessPoolExecutor(max_workers=10) as executor:
# Using args here is fine.
future_processes = {
executor.submit(TTS.process_and_render_item, args)
for args in multiprocessing_args
}
for future in as_completed(future_processes):
try:
data = future.result()
except Exception as exc:
print(f"Generated an exception: {exc}")
else:
print(f"Generated data for comment process: {future}")
# Dont use 'args' here. It seems to be a built-in keyword.
# Changing 'args' to 'arg' worked for me.
def process_and_render_item(arg):
print(arg)
# This will print {"a": "b", "c": "d"} for the first process
# and {"e": "f", "g": "h"} for the second process.
PS: The tabs/spaces maybe a bit off.

python apply_async does not call method

I have a method which needs to process through a large database, that would take hours/days to dig through
The arguments are stored in a (long) list of which max X should be processed in one batch. The method does not need to return anything, yet i return "True" for "fun"...
The function is working perfectly when I'm iterating through it linearly (generating/appending the results in other tables not seen here), yet I am unable to get apply_async or map_async work. (it worked before in other projects)
Any hint of what might I be doing wrong would be appreciated, thanks in advance!
See code below:
import multiprocessing as mp
class mainClass:
#loads of stuff
def main():
multiprocess = True
batchSize = 35
mC = mainClass()
while True:
toCheck = [key for key, value in mC.lCheckSet.items()] #the tasks are stored in a dictionary, I'm referring to them with their keys, which I turn to a list here for iteration.
if multiprocess == False:
#this version works perfectly fine
for i in toCheck[:batchSize]:
mC.check(i)
else:
#the async version does not, either with apply_async...
with mp.Pool(processes = 8) as pool:
temp = [pool.apply_async(mC.check, args=(toCheck[n],)) for n in range(len(toCheck[:batchSize]))]
results = [t.get() for t in temp]
#...or as map_async
pool = mp.Pool(processes = 8)
temp = pool.map_async(mC.check, toCheck[:batchSize])
pool.close()
pool.join()
if __name__=="__main__":
main()
The "smell" here is that you are instantiating your maincClass on the main Process, just once, and then trying to call a method on it on the different processes - but note that when you pass mC.check to your process pool, it is a method already bound to the class instantiated in this process.
I'd guess there is where your problem lies. Although that could possibly work - and it does - I made this simplified version and it works as intended :
import multiprocessing as mp
import random, time
class MainClass:
def __init__(self):
self.value = 1
def check(self, arg):
time.sleep(random.uniform(0.01, 0.3))
print(id(self),self.value, arg)
def main():
mc = MainClass()
with mp.Pool(processes = 4) as pool:
temp = [pool.apply_async(mc.check, (i,)) for i in range(8)]
results = [t.get() for t in temp]
main()
(Have you tried just adding some prints to make sure the method is not running at all?)
So, the problem lies likely in some complex state in your MainClass that does not make it to the parallel processes in a good way. A possible work-around is to instantiate your mainclasses inside each process - that can be easily done since MultiProcessing allow you to get the current_process, and use this object as a namespace to keep data in the process instantiated in the worker Pool, across different calls to apply async.
So, create a new check function like the one bellow - and instead of instantiating your mainclass in the mainprocess, instantiate it inside each process in the pool:
import multiprocessing as mp
import random, time
def check(arg):
process = mp.current_process
if not hasattr(process, "main_class"):
process.main_class = MainClass()
process.main_class.check(arg)
class MainClass:
def __init__(self):
self.value = random.randrange(100)
def check(self, arg):
time.sleep(random.uniform(0.01, 0.3))
print(id(self),self.value, arg)
def main():
mc = MainClass()
with mp.Pool(processes = 2) as pool:
temp = [pool.apply_async(check, (i,)) for i in range(8)]
results = [t.get() for t in temp]
main()
I got to this question with the same problem, my apply_async calls not called at all, but the reason on my case was that the parameters number on apply_async call was different to the number on function declaration

how to cache asyncio coroutines

I am using aiohttp to make a simple HTTP request in python 3.4 like this:
response = yield from aiohttp.get(url)
The application requests the same URL over and over again so naturally I wanted to cache it. My first attempt was something like this:
#functools.lru_cache(maxsize=128)
def cached_request(url):
return aiohttp.get(url)
The first call to cached_request works fine, but in later calls I end up with None instead of the response object.
I am rather new to asyncio so I tried a lot of combinations of the asyncio.coroutine decorator, yield from and some other things, but none seemed to work.
So how does caching coroutines work?
Maybe a bit late, but I've started a new package that may help: https://github.com/argaen/aiocache. Contributions/comments are always welcome.
An example:
import asyncio
from collections import namedtuple
from aiocache import cached
from aiocache.serializers import PickleSerializer
Result = namedtuple('Result', "content, status")
#cached(ttl=10, serializer=PickleSerializer())
async def async_main():
print("First ASYNC non cached call...")
await asyncio.sleep(1)
return Result("content", 200)
if __name__ == "__main__":
loop = asyncio.get_event_loop()
print(loop.run_until_complete(async_main()))
print(loop.run_until_complete(async_main()))
print(loop.run_until_complete(async_main()))
print(loop.run_until_complete(async_main()))
Note that as an extra, it can cache any python object into redis using Pickle serialization. In case you just want to work with memory, you can use the SimpleMemoryCache backend :).
An popular async version of lru_cache exist here: async_lru
To use functools.lru_cache with coroutines, the following code works.
class Cacheable:
def __init__(self, co):
self.co = co
self.done = False
self.result = None
self.lock = asyncio.Lock()
def __await__(self):
with (yield from self.lock):
if self.done:
return self.result
self.result = yield from self.co.__await__()
self.done = True
return self.result
def cacheable(f):
def wrapped(*args, **kwargs):
r = f(*args, **kwargs)
return Cacheable(r)
return wrapped
#functools.lru_cache()
#cacheable
async def foo():
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
return await resp.text()
The following is thread safe
class ThreadSafeCacheable:
def __init__(self, co):
self.co = co
self.done = False
self.result = None
self.lock = threading.Lock()
def __await__(self):
while True:
if self.done:
return self.result
if self.lock.acquire(blocking=False):
self.result = yield from self.co.__await__()
self.done = True
return self.result
else:
yield from asyncio.sleep(0.005)
I wrote a simple cache decorator myself:
def async_cache(maxsize=128):
cache = {}
def decorator(fn):
def wrapper(*args):
key = ':'.join(args)
if key not in cache:
if len(cache) >= maxsize:
del cache[cache.keys().next()]
cache[key] = yield from fn(*args)
return cache[key]
return wrapper
return decorator
#async_cache()
#asyncio.coroutine
def expensive_io():
....
This kind-of-works. But many aspects can probably be improved. For example: If the cached function is called a second time before the first call returns, it will execute a second time.
I'm not that familiar with aiohttp so I'm not sure of exactly what is happening that would cause Nones to be returned, but the lru_cache decorator will not work with async functions.
I use a decorator which does essentially the same thing; note that it is different to tobib's decorator above in that it will always return a future or a task, rather than the value:
from collections import OrderedDict
from functools import _make_key, wraps
def future_lru_cache(maxsize=128):
# support use as decorator without calling, for this case maxsize will
# not be an int
try:
real_max_size = int(maxsize)
except ValueError:
real_max_size = 128
cache = OrderedDict()
async def run_and_cache(func, args, kwargs):
"""Run func with the specified arguments and store the result
in cache."""
result = await func(*args, **kwargs)
cache[_make_key(args, kwargs, False)] = result
if len(cache) > real_max_size:
cache.popitem(False)
return result
def wrapper(func):
#wraps(func)
def decorator(*args, **kwargs):
key = _make_key(args, kwargs, False)
if key in cache:
# Some protection against duplicating calls already in
# progress: when starting the call cache the future, and if
# the same thing is requested again return that future.
if isinstance(cache[key], asyncio.Future):
return cache[key]
else:
f = asyncio.Future()
f.set_result(cache[key])
return f
else:
task = asyncio.Task(run_and_cache(func, args, kwargs))
cache[key] = task
return task
return decorator
if callable(maxsize):
return wrapper(maxsize)
else:
return wrapper
I used _make_key from functools as lru_cache does, I guess it's supposed to be private so probably better to copy it over.
This is how I think it's most easily done, using the built-in lru_cache and futures:
import asyncio
import functools
# parameterless decorator
def async_lru_cache_decorator(async_function):
#functools.lru_cache
def cached_async_function(*args, **kwargs):
coroutine = async_function(*args, **kwargs)
return asyncio.ensure_future(coroutine)
return cached_async_function
# decorator with options
def async_lru_cache(*lru_cache_args, **lru_cache_kwargs):
def async_lru_cache_decorator(async_function):
#functools.lru_cache(*lru_cache_args, **lru_cache_kwargs)
def cached_async_function(*args, **kwargs):
coroutine = async_function(*args, **kwargs)
return asyncio.ensure_future(coroutine)
return cached_async_function
return async_lru_cache_decorator
#async_lru_cache(maxsize=128)
async def your_async_function(...): ...
This is basically taking your original function and wrapping it so I can store the Coroutine it returns and convert it into a Future. This way, this can be treated as a regular function and you can lru_cache-it as you would usually do it.
Why is wrapping it in a Future necessary? Python coroutines are low level constructs and you can't await one more than once (You would get RuntimeError: cannot reuse already awaited coroutine). Futures, on the other hand, are handy and can be awaited consecutively and will return the same result.
One caveat is that caching a Future will also cache when the original functions raised an Error. The original lru_cache does not cache interrupted executions, so watch out for this edge case using the solution above.
Further tweaking can be done to merge both the parameter-less and the parameterized decorators, like the original lru_cache which supports both usages.
Another variant of lru decorator, which caches not yet finished coroutines, very useful with parallel requests to the same key:
import asyncio
from collections import OrderedDict
from functools import _make_key, wraps
def async_cache(maxsize=128, event_loop=None):
cache = OrderedDict()
if event_loop is None:
event_loop = asyncio.get_event_loop()
awaiting = dict()
async def run_and_cache(func, args, kwargs):
"""await func with the specified arguments and store the result
in cache."""
result = await func(*args, **kwargs)
key = _make_key(args, kwargs, False)
cache[key] = result
if len(cache) > maxsize:
cache.popitem(False)
cache.move_to_end(key)
return result
def decorator(func):
#wraps(func)
async def wrapper(*args, **kwargs):
key = _make_key(args, kwargs, False)
if key in cache:
return cache[key]
if key in awaiting:
task = awaiting[key]
return await asyncio.wait_for(task, timeout=None, loop=event_loop)
task = asyncio.ensure_future(run_and_cache(func, args, kwargs), loop=event_loop)
awaiting[key] = task
result = await asyncio.wait_for(task, timeout=None, loop=event_loop)
del awaiting[key]
return result
return wrapper
return decorator
async def test_async_cache(event_loop):
counter = 0
n, m = 10, 3
#async_cache(maxsize=n, event_loop=event_loop)
async def cached_function(x):
nonlocal counter
await asyncio.sleep(0) # making event loop switch to other coroutine
counter += 1
return x
tasks = [asyncio.ensure_future(cached_function(x), loop=event_loop)
for x in list(range(n)) * m]
done, pending = await asyncio.wait(tasks, loop=event_loop, timeout=1)
assert len(done) == n * m
assert counter == n
event_loop = asyncio.get_event_loop()
task = asyncio.ensure_future(test_async_cache(event_loop))
event_loop.run_until_complete(task)
I think that the simplest way is to use aiohttp_cache (documentation)
pip install aiohttp-cache
And use it in code:
from aiohttp_cache import cache, setup_cache
#cache() # <-- DECORATED FUNCTION
async def example_1(request):
return web.Response(text="Example")
app = web.Application()
app.router.add_route('GET', "/", example_1)
setup_cache(app) # <-- INITIALIZED aiohttp-cache
web.run_app(app, host="127.0.0.1")
Try async-cache :pypi async-cache :github for caching async functions in python.
It also supports function which have parameters of user defined or object type or unhashable type which is not supported in either functools.lru_cache or async_lru .
Usage:
pip install async-cache
from cache import AsyncLRU
#AsyncLRU(maxsize=128)
async def func(*args, **kwargs):
pass
I wrote a simple package named asyncio-cache - https://github.com/matan1008/asyncio-cache.
I tried to keep the code as close as possible to the original python implementation and as simple as possible.
For example:
from asyncio_cache import lru_cache
import aiohttp
#lru_cache(maxsize=128)
async def cached_get(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as resp:
return await resp.text()

Resources