Multiprocessing with multiprocessing.Pool - python-3.x

I am having issues with even the most basic task using the multiprocessing.Pool method.
It seems to be working but never finishes even the simplest task.
Could you please help me figure out what I am doing wrong?
I read some articles and tried to understand it, but could not figure it out. I added a short example (with list(map(squared, range(2_000_000))) it works, but not the code below).
Thanks in advance,
Roland
"""
from multiprocessing import Pool
import time
process_pool = Pool(processes = 4)
def squared(n):
    return n ** 2
start = time.perf_counter()
process_pool.apply(squared, range(2_000_000))
end = time.perf_counter() - start
print(f"Run time: {end}")
"""

It seems to be a case for multithreading... Have you tried something like:
from concurrent.futures import ThreadPoolExecutor, as_completed

num_of_threads = 50  # Number of threads executing at the same time

with ThreadPoolExecutor(max_workers=num_of_threads) as executor:
    tasks = []
    for i in i_list:
        tasks.append(
            executor.submit(
                <Function_to_execute>, i
            )
        )
    for future in as_completed(tasks):
        if future.result():
            yield future.result()  # Could be just a return; with yield this returns a generator
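For the original squared() example, a complete, runnable version of this approach might look like the sketch below (the thread count and input size are arbitrary choices). Keep in mind that squared() is CPU-bound pure Python, so the GIL prevents threads from giving a real speedup here; this pattern mainly pays off for I/O-bound work.

from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def squared(n):
    return n ** 2

def run_all(values, num_threads=8):
    results = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        tasks = [executor.submit(squared, v) for v in values]
        for future in as_completed(tasks):
            results.append(future.result())  # results arrive in completion order, not input order
    return results

if __name__ == "__main__":
    start = time.perf_counter()
    out = run_all(range(100_000))
    print(f"Run time: {time.perf_counter() - start}")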

I think you want imap() (and move squared() before you define the Pool):
from multiprocessing import Pool
import time
def squared(n):
    return n ** 2
process_pool = Pool(processes = 4)
start = time.perf_counter()
process_pool.imap(squared, range(2))
end = time.perf_counter() - start
print(f"Run time: {end}")
Just keep in mind this is not a very representative example, since you don't do anything with the results; something better would be:
with Pool(4) as pool:
    results = pool.imap(squared, range(2_000_000))
    for result in results:
        pass  # do something here with the result
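Putting the pieces together, a complete script might look like the sketch below. The if __name__ == "__main__" guard matters on platforms that spawn worker processes (Windows, and macOS on recent Python versions), and a chunksize larger than 1 cuts the per-item communication overhead; the chunksize value here is just an illustration.

from multiprocessing import Pool
import time

def squared(n):
    return n ** 2

if __name__ == "__main__":
    start = time.perf_counter()
    with Pool(processes=4) as pool:
        total = 0
        # imap streams results back in input order; chunksize batches the work sent to each worker
        for result in pool.imap(squared, range(2_000_000), chunksize=10_000):
            total += result  # do something with each result
    print(f"Run time: {time.perf_counter() - start}, total: {total}")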

Related

Why is serial code faster than concurrent.futures in this case?

I am using the following code to process some pictures for my ML project and I would like to parallelize it.
import multiprocessing as mp
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some Value
    return ood

seqs = []
for seq in range(1, 10): # len(seqs)+1):
    seq = txt + str(seq)
    seqs.append(seq)
    # serial call of the function
    track_ids(seq)

# parallel call of the function
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
    ood_id = ex.map(track_ids, seqs)
If I run the code serially it takes 3.0 minutes, but in parallel with concurrent.futures it takes 3.5 minutes.
Can someone please explain why that is, and suggest a way to solve the problem?
By the way, I have 12 cores.
Thanks
Here's a brief example of how one might go about profiling multiprocessing code vs serial execution:
from multiprocessing import Pool
import multiprocessing as mp
from cProfile import Profile
from pstats import Stats
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some Value
    return ood

def profile_seq():
    p = Profile()  # one and only profiler instance
    p.enable()
    seqs = []
    for seq in range(1, 10): # len(seqs)+1):
        seq = txt + str(seq)
        seqs.append(seq)
        # serial call of the function
        track_ids(seq)
    p.disable()
    return Stats(p), seqs

def track_ids_pr(seq):
    p = Profile()  # profile the child tasks
    p.enable()
    retval = track_ids(seq)
    p.disable()
    return (Stats(p, stream="dummy"), retval)

def profile_parallel():
    p = Profile()  # profile stuff in the main process
    p.enable()
    with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
        retvals = ex.map(track_ids_pr, seqs)
    p.disable()
    s = Stats(p)
    out = []
    for ret in retvals:
        s.add(ret[0])
        out.append(ret[1])
    return s, out

if __name__ == "__main__":
    stat, retval = profile_parallel()
    stat.print_stats()
EDIT: Unfortunately I found out that pstats.Stats objects cannot be used normally with multiprocessing.Queue because they are not pickleable (which is needed for the operation of concurrent.futures). Evidently a Stats object normally stores a reference to a file for the purpose of writing statistics to that file, and if none is given, it grabs a reference to sys.stdout by default. We don't actually need that reference until we want to print out the statistics, so we can give it a temporary value to prevent the pickle error and then restore an appropriate value later. The following example should be copy-paste-able and run just fine, unlike the pseudocode-ish example above.
from multiprocessing import Queue, Process
from cProfile import Profile
from pstats import Stats
import sys

def isprime(x):
    for d in range(2, int(x**.5) + 1):
        if x % d == 0:
            return False
    return True

def foo(retq):
    p = Profile()
    p.enable()
    primes = []
    max_n = 2**20
    for n in range(3, max_n):
        if isprime(n):
            primes.append(n)
    p.disable()
    retq.put(Stats(p, stream="dummy"))  # Dirty hack: set `stream` to something picklable, then override later

if __name__ == "__main__":
    q = Queue()
    p1 = Process(target=foo, args=(q,))
    p1.start()
    p2 = Process(target=foo, args=(q,))
    p2.start()
    s1 = q.get()
    s1.stream = sys.stdout  # restore original file
    s2 = q.get()
    # s2.stream: if we are just adding this `Stats` object to another, the `stream` gets thrown away anyway.
    s1.add(s2)  # add up the stats from both child processes
    s1.print_stats()  # s1.stream gets used here, but not before. If you provide a file instead of sys.stdout, it will write to that file.
    p1.join()
    p2.join()

Slow multiprocessing when parent object contains large data

Consider the following snippet:
import numpy as np
import multiprocessing as mp
import time

def work_standalone(args):
    return 2

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))
        # leave a trace whenever init is called
        with open('rnd-%d' % np.random.randint(100), 'a') as f:
            f.write('init called\n')

    def work_internal(self, args):
        return 2

    def _run(self, target):
        with mp.Pool() as pool:
            tasks = [[idx] for idx in range(16)]
            result = pool.imap(target, tasks)
            for res in result:
                pass

    def run_internal(self):
        self._run(self.work_internal)

    def run_standalone(self):
        self._run(work_standalone)

if __name__ == '__main__':
    t1 = time.time()
    Worker().run_standalone()
    t2 = time.time()
    print(f'Standalone took {t2 - t1:.3f} seconds')
    t3 = time.time()
    Worker().run_internal()
    t4 = time.time()
    print(f'Internal took {t3 - t4:.3f} seconds')
That is, we have an object containing a large variable, and it uses multiprocessing to parallelize some work that has nothing to do with that large variable, i.e. the work neither reads from nor writes to it. The location of the worker function has a huge impact on the runtime:
Standalone took 0.616 seconds
Internal took 19.917 seconds
Why is this happening? I am completely lost. Note that __init__ is only called twice, so the random data is not created for every new process in the pool. The only reason I can think of for why this would be slow is that the data is copied around, but that would not make sense since it is never used anywhere, and Python is supposed to use copy-on-write semantics. Also note that the difference disappears if you make run_internal a static method.
The issue you have is due to the target you are calling from the pool: that target is a bound method, which carries a reference to the Worker instance.
Now, you're right that __init__() is only called twice. But remember, when you send anything to and from the worker processes, Python needs to pickle the data first.
So, because your target is self.work_internal, Python has to pickle the Worker() instance every time imap sends a task to a worker. This leads to one issue: self.data being copied over again and again.
The following is the proof. I just added one input() statement and fixed the final time calculation.
import numpy as np
import multiprocessing as mp
import time

def work_standalone(args):
    return 2

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))
        # leave a trace whenever init is called
        with open('rnd-%d' % np.random.randint(100), 'a') as f:
            f.write('init called\n')

    def work_internal(self, args):
        return 2

    def _run(self, target):
        with mp.Pool() as pool:
            tasks = [[idx] for idx in range(16)]
            result = pool.imap(target, tasks)
            input("Wait for analysis")
            for res in result:
                pass

    def run_internal(self):
        self._run(self.work_internal)
        # self._run(work_standalone)

    def run_standalone(self):
        self._run(work_standalone)

def work_internal(target):
    with mp.Pool() as pool:
        tasks = [[idx] for idx in range(16)]
        result = pool.imap(target, tasks)
        for res in result:
            pass

if __name__ == '__main__':
    t1 = time.time()
    Worker().run_standalone()
    t2 = time.time()
    print(f'Standalone took {t2 - t1:.3f} seconds')
    t3 = time.time()
    Worker().run_internal()
    t4 = time.time()
    print(f'Internal took {t4 - t3:.3f} seconds')
You can run the code; when "Wait for analysis" shows up, go and check the memory usage.
Then, the second time you see the message, press Enter and observe the memory usage increasing and then decreasing again.
On the other hand, if you change self._run(self.work_internal) to self._run(work_standalone), you will notice that it runs very fast, the memory does not increase, and the time taken is a lot shorter than with self.work_internal.
Solution
One way to solve your issue is to make data a class variable instead of an instance attribute. In normal cases, this prevents instances from having to copy/reinitialize the variable again, and it also prevents the issue from occurring here, since class attributes are not part of the instance state that gets pickled.
class Worker:
    data = np.random.random(size=(10000, 10000))

    def __init__(self):
        pass
    ...
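To make the pickling cost visible, here is a small sketch (not part of the answer above) that compares the pickled size of the bound method with that of the standalone function; the array is made smaller than in the question so it runs quickly.

import pickle
import numpy as np

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(1000, 1000))  # about 8 MB of data

    def work_internal(self, args):
        return 2

def work_standalone(args):
    return 2

if __name__ == '__main__':
    w = Worker()
    # Pickling the bound method drags the whole instance, including data, along.
    print("bound method:", len(pickle.dumps(w.work_internal)))
    # Pickling a module-level function only stores a reference by name.
    print("plain function:", len(pickle.dumps(work_standalone)))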

How do I process several lists at once?

I have a big list of numbers. I want to split that big list of numbers into x number of lists and process them in parallel.
Here's the code that I have so far:
from multiprocessing import Pool
import numpy

def processNumList(numList):
    for num in numList:
        outputList.append(num ** 2)

numThreads = 5
bigNumList = list(range(50))
splitNumLists = numpy.array_split(bigNumList, numThreads)
outputList = []

for numList in splitNumLists:
    processNumList(numList)

print(outputList)
The above code does the following:
Splits a big list of numbers into the specified number of smaller lists
Passes each of those lists to the processNumList function
Prints the result list afterwards
Everything there works as expected, but it only processes one list at a time. I want every list to be processed simultaneously.
What is the proper code to do that? I experimented with pool but could never seem to get it working.
You could try something like this:
import threading

class MyThread(threading.Thread):
    def __init__(self, arg, arg2):
        super().__init__()
        # init stuff, store the arguments for run()
        self.arg = arg
        self.arg2 = arg2

    def run(self):
        # your logic to process the list
        pass

# split the list as you already did
for _ in range(numThreads):
    MyThread(arg, arg2).start()
Here's the code I ended up using.
I used threading.Thread() to process the lists asynchronously and then called thread.join() to ensure that all of the threads were finished before moving on.
I added time.sleep for demonstration purposes (to simulate a lengthy task), but obviously you wouldn't want to use that in production code.
import numpy
import threading
import time

def process_num_list(numList):
    for num in numList:
        output_list.append(num ** 2)
        time.sleep(1)

num_threads = 5
big_num_list = list(range(30))
split_num_lists = numpy.array_split(big_num_list, num_threads)
output_list = []
threads = []

for num_list in split_num_lists:
    thread = threading.Thread(target=process_num_list, args=[num_list])
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(output_list)
As a bonus, here's a working example of five Selenium windows:
from selenium import webdriver
import numpy
import threading
import time

def scrapeSites(siteList):
    print("Preparing to scrape " + str(len(siteList)) + " sites")
    driver = webdriver.Chrome(executable_path = r"..\chromedriver.exe")
    driver.set_window_size(700, 400)
    for site in siteList:
        print("\nNow scraping " + site)
        driver.get(site)
        pageTitles.append(driver.title)
    driver.quit()

numThreads = 5
fullWebsiteList = ["https://en.wikipedia.org/wiki/Special:Random"] * 30
splitWebsiteLists = numpy.array_split(fullWebsiteList, numThreads)
pageTitles = []
threads = []

for websiteList in splitWebsiteLists:
    thread = threading.Thread(target=scrapeSites, args=[websiteList])
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(pageTitles)
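Since the question mentioned experimenting with Pool, here is a minimal sketch (not part of the answer above) of the same squaring task using multiprocessing.Pool. Worker processes do not share the parent's output_list, so the results are returned from the function instead of appended to a global:

from multiprocessing import Pool
import numpy

def process_num_list(num_list):
    # Each worker process has its own memory, so appending to a global
    # output_list would not be visible in the parent; return results instead.
    return [int(num) ** 2 for num in num_list]

if __name__ == "__main__":
    num_procs = 5
    big_num_list = list(range(30))
    split_num_lists = numpy.array_split(big_num_list, num_procs)
    with Pool(processes=num_procs) as pool:
        chunks = pool.map(process_num_list, split_num_lists)
    output_list = [n for chunk in chunks for n in chunk]
    print(output_list)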

Writing an EventLoop without using asyncio

I'm getting very familiar with Python's asyncio, asynchronous programming in Python, coroutines, etc.
I want to be able to execute several coroutines with my own custom-made event loop.
I'm curious whether I can write my own event loop without importing asyncio at all.
I want to be able to execute several coroutines with my own custom-made event loop.
The asyncio event loop is well-tested and can be easily extended to acknowledge non-asyncio events. If you describe the actual use case, it might be easier to help. But if your goal is to learn about async programming and coroutines, read on.
I'm curious whether I can write my own event loop without importing asyncio at all.
It's definitely possible - asyncio itself is just a library, after all - but it will take some work for your event loop to be useful. See this excellent talk by David Beazley where he demonstrates writing an event loop in front of a live audience. (Don't be put off by David using the older yield from syntax - await works exactly the same way.)
OK, so I found an example somewhere (sorry, I don't remember where, no link) and changed it a little bit.
An event loop and coroutines without even importing asyncio:
import datetime
import heapq
import types
import time

class Task:
    def __init__(self, wait_until, coro):
        self.coro = coro
        self.waiting_until = wait_until

    def __eq__(self, other):
        return self.waiting_until == other.waiting_until

    def __lt__(self, other):
        return self.waiting_until < other.waiting_until

class SleepingLoop:
    def __init__(self, *coros):
        self._new = coros
        self._waiting = []

    def run_until_complete(self):
        # Start all the coroutines.
        for coro in self._new:
            wait_for = coro.send(None)
            heapq.heappush(self._waiting, Task(wait_for, coro))
        # Keep running until there is no more work to do.
        while self._waiting:
            now = datetime.datetime.now()
            # Get the coroutine with the soonest resumption time.
            task = heapq.heappop(self._waiting)
            if now < task.waiting_until:
                # We're ahead of schedule; wait until it's time to resume.
                delta = task.waiting_until - now
                time.sleep(delta.total_seconds())
                now = datetime.datetime.now()
            try:
                # It's time to resume the coroutine.
                wait_until = task.coro.send(now)
                heapq.heappush(self._waiting, Task(wait_until, task.coro))
            except StopIteration:
                # The coroutine is done.
                pass

@types.coroutine
def async_sleep(seconds):
    now = datetime.datetime.now()
    wait_until = now + datetime.timedelta(seconds=seconds)
    actual = yield wait_until
    return actual - now

async def countdown(label, total_seconds_wait, *, delay=0):
    print(label, 'waiting', delay, 'seconds before starting countdown')
    delta = await async_sleep(delay)
    print(label, 'starting after waiting', delta)
    while total_seconds_wait:
        print(label, 'T-minus', total_seconds_wait)
        waited = await async_sleep(1)
        total_seconds_wait -= 1
    print(label, 'lift-off!')

def main():
    loop = SleepingLoop(countdown('A', 5, delay=0),
                        countdown('B', 3, delay=2),
                        countdown('C', 4, delay=1))
    start = datetime.datetime.now()
    loop.run_until_complete()
    print('Total elapsed time is', datetime.datetime.now() - start)

if __name__ == '__main__':
    main()
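For comparison, the same countdown on the standard asyncio event loop is only a few lines. This is a minimal sketch, not part of the answer above, and it assumes Python 3.7+ for asyncio.run():

import asyncio
import datetime

async def countdown(label, total_seconds_wait, *, delay=0):
    print(label, 'waiting', delay, 'seconds before starting countdown')
    await asyncio.sleep(delay)
    while total_seconds_wait:
        print(label, 'T-minus', total_seconds_wait)
        await asyncio.sleep(1)
        total_seconds_wait -= 1
    print(label, 'lift-off!')

async def main():
    start = datetime.datetime.now()
    # gather runs the three countdowns concurrently on asyncio's event loop
    await asyncio.gather(countdown('A', 5),
                         countdown('B', 3, delay=2),
                         countdown('C', 4, delay=1))
    print('Total elapsed time is', datetime.datetime.now() - start)

if __name__ == '__main__':
    asyncio.run(main())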

Python multiprocessing script partial output

I am following the principles laid down in this post to safely output the results, which will eventually be written to a file. Unfortunately, the code prints only 1 and 2, not 3 to 6.
import os
import argparse
import pandas as pd
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist):
    for par in parlist:
        queue.put(par)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        try:
            par = queueIn.get(block=False)
            res = doCalculation(par)
            queueOut.put((res))
            queueIn.task_done()
        except:
            break

def doCalculation(par):
    return par

def write(queue):
    while True:
        try:
            par = queue.get(block=False)
            print("response:", par)
        except:
            break

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
On running the code it prints,
$ python3 tst.py
Queue size 6
response: 1
response: 2
Also, is it possible to ensure that the write function always outputs 1,2,3,4,5,6 i.e. in the same order in which the data is fed into the feed queue?
The error comes from the task_done() call. A plain multiprocessing.Queue has no task_done() method (only JoinableQueue does), so the call raises an AttributeError, which the bare except turns into an immediate break; that is why each worker processes only one item. If you remove that call, it works, but then it relies on the queueIn.get(block=False) call throwing an exception once the queue is empty. This might be just enough for your use case; a better way, though, would be to use sentinels (as suggested in the multiprocessing docs, see the last example there). Here's a little rewrite so your program uses sentinels:
import os
import argparse
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist, nthreads):
    for par in parlist:
        queue.put(par)
    for i in range(nthreads):
        queue.put(None)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        par = queueIn.get()
        if par is None:
            break
        res = doCalculation(par)
        queueOut.put((res))

def doCalculation(par):
    return par

def write(queue):
    while not queue.empty():
        par = queue.get()
        print("response:", par)

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod, nthreads))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
A few things to note:
The sentinel is a None put into the queue; note that you need one sentinel for every worker process.
For the write function you don't need the sentinel handling, as there's only one process and no concurrency to worry about. (If you used the empty()-then-get() approach in your calc function, you would run into a problem if, say, there's only one item left in the queue, both workers check empty() at the same time, both then call get(), and one of them blocks forever.)
You don't need to put feed and write into separate processes; just call them from your main function, since you don't want to run them in parallel anyway.
how can I have the same order in output as in input? [...] I guess multiprocessing.map can do this
Yes, map keeps the order. Rewriting your program into something simpler (you don't need the workerQueue and writerQueue) and adding random sleeps to prove that the output is still in order:
from multiprocessing import Pool
import time
import random

def calc(val):
    time.sleep(random.random())
    return val

if __name__ == "__main__":
    considerperiod = [1, 2, 3, 4, 5, 6]
    with Pool(processes=2) as pool:
        print(pool.map(calc, considerperiod))
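If you also want results to stream back as they become available while still preserving the input order, pool.imap works as well; this is a minimal sketch along the same lines as the example above (imap_unordered, by contrast, yields results in completion order):

from multiprocessing import Pool
import time
import random

def calc(val):
    time.sleep(random.random())
    return val

if __name__ == "__main__":
    considerperiod = [1, 2, 3, 4, 5, 6]
    with Pool(processes=2) as pool:
        # imap yields results lazily but still in input order
        for res in pool.imap(calc, considerperiod):
            print("response:", res)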
