I am using the following code to process some pictures for my ML project and I would like to parallelize it.
import multiprocessing as mp
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some_value  # placeholder for the real result
    return ood

seqs = []
for seq in range(1, 10):  # len(seqs)+1):
    seq = txt + str(seq)
    seqs.append(seq)
    # serial call of the function
    track_ids(seq)

# parallel call of the function
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
    ood_id = ex.map(track_ids, seqs)
If I run the code serially it takes 3.0 minutes, but the parallel version with concurrent.futures takes 3.5 minutes.
Can someone please explain why that is, and suggest a way to solve the problem?
By the way, I have 12 cores.
Thanks
Here's a brief example of how one might go about profiling multiprocessing code vs serial execution:
import multiprocessing as mp
from cProfile import Profile
from pstats import Stats
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some_value  # placeholder for the real result
    return ood

def profile_seq():
    p = Profile()  # one and only profiler instance
    p.enable()
    seqs = []
    for seq in range(1, 10):  # len(seqs)+1):
        seq = txt + str(seq)
        seqs.append(seq)
        # serial call of the function
        track_ids(seq)
    p.disable()
    return Stats(p), seqs

def track_ids_pr(seq):
    p = Profile()  # profile the child tasks
    p.enable()
    retval = track_ids(seq)
    p.disable()
    return (Stats(p, stream="dummy"), retval)

def profile_parallel(seqs):
    p = Profile()  # profile stuff in the main process
    p.enable()
    with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
        retvals = ex.map(track_ids_pr, seqs)
    p.disable()
    s = Stats(p)
    out = []
    for ret in retvals:
        s.add(ret[0])
        out.append(ret[1])
    return s, out

if __name__ == "__main__":
    seq_stats, seqs = profile_seq()
    stat, retval = profile_parallel(seqs)
    stat.print_stats()
EDIT: Unfortunately I found out that pstats.Stats objects cannot normally be passed through a multiprocessing.Queue (or returned from a concurrent.futures worker) because they are not pickleable, and pickling is required for both mechanisms. A Stats object normally stores a reference to a file for the purpose of writing statistics to that file, and if none is given, it grabs a reference to sys.stdout by default. We don't actually need that reference until we want to print out the statistics, so we can give it a temporary picklable value to prevent the pickle error and restore an appropriate value later. The following example should be copy-paste-able and run just fine, unlike the pseudocode-ish example above.
from multiprocessing import Queue, Process
from cProfile import Profile
from pstats import Stats
import sys

def isprime(x):
    for d in range(2, int(x**.5) + 1):
        if x % d == 0:
            return False
    return True

def foo(retq):
    p = Profile()
    p.enable()
    primes = []
    max_n = 2**20
    for n in range(3, max_n):
        if isprime(n):
            primes.append(n)
    p.disable()
    retq.put(Stats(p, stream="dummy"))  # Dirty hack: set `stream` to something picklable, then override it later

if __name__ == "__main__":
    q = Queue()
    p1 = Process(target=foo, args=(q,))
    p1.start()
    p2 = Process(target=foo, args=(q,))
    p2.start()
    s1 = q.get()
    s1.stream = sys.stdout  # restore a real stream
    s2 = q.get()
    # s2.stream  # if we are just adding this `Stats` object to another, its `stream` gets thrown away anyway.
    s1.add(s2)  # add up the stats from both child processes
    s1.print_stats()  # s1.stream gets used here, but not before. If you provide a file to write to instead of sys.stdout, it will write to that file.
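If you would rather have the report in a file than on stdout, a minimal sketch (the filenames are just placeholders I made up):

from cProfile import Profile
from pstats import Stats

p = Profile()
p.enable()
sum(i * i for i in range(10**6))  # stand-in workload
p.disable()

# write the human-readable report to a text file...
with open("profile_report.txt", "w") as f:
    Stats(p, stream=f).sort_stats("cumulative").print_stats()

# ...or dump the raw stats, which can be loaded again later with pstats.Stats("profile.out")
Stats(p).dump_stats("profile.out")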
Related
I need to process-parallelize some computations that are done several times.
So the subprocess Python function has to stay alive between two calls.
In a perfect world I would need something like this:
class Computer:
    def __init__(self, x):
        self.x = x
        # Creation of quite heavy python objects that cannot be pickled !!

    def call(self, y):
        return self.x + y

process = Computer(4)  # NEED MAGIC HERE to keep "call" alive in a subprocess !!
print(process.call(1))   # prints 5 (=4+1)
print(process.call(12))  # prints 16 (=4+12)
I can follow this answer and communicate via asyncio.subprocess.PIPE, but in my actual use case,
the call argument is a list of lists of integers
the call answer is a list of strings
so it would be nice to avoid serializing/deserializing the arguments and return values by hand.
Any ideas on how to keep the function call "alive" and ready to receive new calls?
Here is an answer, based on this one, but:
several subprocesses are created
each subprocess has its own identifier
their calls are parallelized
a small layer allows the exchange of JSON objects instead of plain byte strings.
hello.py
#!/usr/bin/python3

# This is the task to be done.
# A task consists of receiving a json assumed to be
# {"vector": [...]}
# and returning a json with the length of the vector and
# the worker id.
import sys
import time
import json

ident = sys.argv[1]

while True:
    str_data = input()
    data = json.loads(str_data)
    command = data.get("command", None)
    if command == "quit":
        answer = {"comment": "I'm leaving",
                  "my id": ident}
        print(json.dumps(answer), end="\n")
        sys.exit(1)
    time.sleep(1)  # simulates 1s of heavy work
    answer = {"size": len(data['vector']),
              "my id": ident}
    print(json.dumps(answer), end="\n")
main.py
#!/usr/bin/python3
import json
from subprocess import Popen, PIPE
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

dprint = print

def create_proc(arg):
    cmd = ["./hello.py", arg]
    process = Popen(cmd, stdin=PIPE, stdout=PIPE)
    return process

def make_call(proc, arg):
    """Make the call in a thread."""
    str_arg = json.dumps(arg)
    txt = bytes(str_arg + '\n', encoding='utf8')
    proc.stdin.write(txt)
    proc.stdin.flush()
    b_ans = proc.stdout.readline()
    s_ans = b_ans.decode('utf8')
    j_ans = json.loads(s_ans)
    return j_ans

def search(executor, procs, data):
    jobs = [executor.submit(make_call, proc, data) for proc in procs]
    answer = []
    for job in concurrent.futures.as_completed(jobs):
        got_ans = job.result()
        answer.append(got_ans)
    return answer

def main():
    n_workers = 50
    idents = [f"{i}st" for i in range(0, n_workers)]
    executor = ThreadPoolExecutor(n_workers)
    # Create `n_workers` subprocesses waiting for data to work with.
    # The subprocesses are all different because they receive a different
    # "initialization" id.
    procs = [create_proc(ident) for ident in idents]
    data = {"vector": [1, 2, 23]}
    answers = search(executor, procs, data)  # takes about 1s instead of 50s (one per worker)
    for answer in answers:
        print(answer)
    search(executor, procs, {"command": "quit"})

main()
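For completeness, if hand-rolling a JSON protocol over pipes feels heavy, here is a minimal alternative sketch (not part of the answer above) that keeps one worker object alive in a child process and talks to it over a multiprocessing.Pipe, so the arguments and results are plain Python objects and the pickling is handled for you. It assumes the heavy object can at least be created inside the child process; the Computer class here is just a stand-in:

from multiprocessing import Process, Pipe

class Computer:
    def __init__(self, x):
        self.x = x  # heavy, unpicklable state would be built here, inside the child

    def call(self, y):
        return [str(self.x + v) for row in y for v in row]

def worker(conn, x):
    comp = Computer(x)      # built once, reused for every call
    while True:
        msg = conn.recv()
        if msg is None:     # sentinel: shut down
            break
        conn.send(comp.call(msg))

if __name__ == "__main__":
    parent, child = Pipe()
    p = Process(target=worker, args=(child, 4))
    p.start()
    parent.send([[1, 2], [3]])   # a list of lists of integers, as in the question
    print(parent.recv())         # ['5', '6', '7'] -- a list of strings
    parent.send(None)
    p.join()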
I have the following:
from multiprocessing import Pool

def process_elements(index_of_data_inputs):
    <process>
    if <condition>:
        # i would like to change the size of data_inputs

if __name__ == '__main__':
    pool = Pool()  # Create a multiprocessing Pool
    pool.map(process_elements, range(0, len(data_inputs)))  # process data_inputs iterable with pool
How can I change the size of data_inputs, and so change the number of times process_elements is called?
The work I would like to parallelize is:
i = 0
while i < len(elements):
    new_elems = process_some_elements(x, y)
    if len(new_elems) > 0:
        elements = elements + new_elems
    i += 1
Consider a simple example of communication between processes with the multiprocessing module in Python:
import multiprocessing
import queue
import random

def process_elements(num, comq):
    val = random.random()
    if val > 0.5:
        comq.put(1)
    return num, int(1000 * val)

if __name__ == '__main__':
    # initial data
    numbers = list(range(10))
    # data structure for communication between multiple processes
    m = multiprocessing.Manager()
    q = m.Queue()
    with multiprocessing.Pool(processes=4) as pool:
        # get answer for original data
        ans = pool.starmap(process_elements, [(num, q) for num in numbers])
        print(numbers)
        print(ans)
        # create additional data based on the answer for initial data
        new_numbers = numbers[-1:]
        try:
            while True:
                new_numbers.append(new_numbers[-1] + q.get_nowait())
        except queue.Empty:
            pass
        # get answer for additional data
        new_ans = pool.starmap(process_elements, [(num, q) for num in new_numbers[1:]])
        print(new_numbers)
        print(new_ans)
I have a big list of numbers. I want to split that big list of numbers into x number of lists and process them in parallel.
Here's the code that I have so far:
from multiprocessing import Pool
import numpy

def processNumList(numList):
    for num in numList:
        outputList.append(num ** 2)

numThreads = 5
bigNumList = list(range(50))
splitNumLists = numpy.array_split(bigNumList, numThreads)
outputList = []

for numList in splitNumLists:
    processNumList(numList)

print(outputList)
The above code does the following:
Splits a big list of numbers into the specified number of smaller lists
Passes each of those lists to the processNumList function
Prints the result list afterwards
Everything there works as expected, but it only processes one list at a time. I want every list to be processed simultaneously.
What is the proper code to do that? I experimented with pool but could never seem to get it working.
You could try something like this:
import threading

class MyThread(threading.Thread):
    def __init__(self, arg, arg2):
        super().__init__()
        # init stuff
        self.arg = arg
        self.arg2 = arg2

    def run(self):
        # your logic to process the list
        pass

# split the list as you already did, then start one thread per chunk
for _ in range(numThreads):
    MyThread(arg, arg2).start()
Here's the code I ended up using.
I used threading.Thread() to process the lists asynchronously and then called thread.join() to ensure that all of the threads were finished before moving on.
I added time.sleep for demonstration purposes (to simulate a lengthy task), but obviously you wouldn't want to use that in production code.
import numpy
import threading
import time

def process_num_list(numList):
    for num in numList:
        output_list.append(num ** 2)
        time.sleep(1)

num_threads = 5
big_num_list = list(range(30))
split_num_lists = numpy.array_split(big_num_list, num_threads)
output_list = []

threads = []
for num_list in split_num_lists:
    thread = threading.Thread(target=process_num_list, args=[num_list])
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(output_list)
As a bonus, here's a working example of five Selenium windows:
from selenium import webdriver
import numpy
import threading
import time

def scrapeSites(siteList):
    print("Preparing to scrape " + str(len(siteList)) + " sites")
    driver = webdriver.Chrome(executable_path=r"..\chromedriver.exe")
    driver.set_window_size(700, 400)
    for site in siteList:
        print("\nNow scraping " + site)
        driver.get(site)
        pageTitles.append(driver.title)
    driver.quit()

numThreads = 5
fullWebsiteList = ["https://en.wikipedia.org/wiki/Special:Random"] * 30
splitWebsiteLists = numpy.array_split(fullWebsiteList, numThreads)
pageTitles = []

threads = []
for websiteList in splitWebsiteLists:
    thread = threading.Thread(target=scrapeSites, args=[websiteList])
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

print(pageTitles)
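Since the question mentions experimenting with Pool, here is also a minimal sketch of how the same list splitting could look with multiprocessing.Pool.map (not the approach used above). The worker returns its results instead of appending to a shared list, because separate processes don't share memory:

from multiprocessing import Pool
import numpy

def process_num_list(num_list):
    return [num ** 2 for num in num_list]  # return results instead of appending to a global

if __name__ == "__main__":
    num_workers = 5
    big_num_list = list(range(30))
    split_num_lists = numpy.array_split(big_num_list, num_workers)
    with Pool(num_workers) as pool:
        chunked = pool.map(process_num_list, split_num_lists)  # one chunk per worker
    output_list = [n for chunk in chunked for n in chunk]      # flatten the per-chunk results
    print(output_list)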
I spent nearly the whole day with this and came to the end of my knowledge:
I want to change a shared multiprocessing.Value string in the subprocess, but Python hangs as soon as the subprocess tries to change the shared value.
Below an example code:
from multiprocessing import Process, Value, freeze_support
from ctypes import c_wchar_p

def test(x):
    with x.get_lock():
        x.value = 'THE TEST WORKED'
    return

if __name__ == "__main__":
    freeze_support()
    value = Value(c_wchar_p, '')
    p = Process(target=test, args=(value,))
    p.start()
    print(p.pid)
    # this try block is to also allow p.run()
    try:
        p.join()
        p.terminate()
    except:
        pass
    print(value.value)
What I tried that does not work:
I tried ctypes c_wchar_p and c_char_p, but both result in the same freezing.
I also tried it without x.get_lock().
I also tried it without freeze_support().
What works (but does not help):
Using a float as the shared value (value = Value('d', 0) and x.value = 1).
Running the Process without starting a subprocess (replacing p.start() with p.run()).
I am using Windows 10 64 bit and Python 3.6.4 (Spyder, but also tried outside of Spyder).
Any help welcome!
A shared pointer won't work in another process because the pointer is only valid in the process in which it was created. Instead, use an array:
import multiprocessing as mp

def test(x):
    x.value = b'Test worked!'

if __name__ == "__main__":
    x = mp.Array('c', 15)
    p = mp.Process(target=test, args=(x,))
    p.start()
    p.join()
    print(x.value)
Output:
b'Test worked!'
Note that array type 'c' is specialized and returns a SynchronizedString vs. other types that return SynchronizedArray. Here's how to use type 'u' for example:
import multiprocessing as mp
from ctypes import *

def test(x):
    x.get_obj().value = 'Test worked!'

if __name__ == "__main__":
    x = mp.Array('u', 15)
    p = mp.Process(target=test, args=(x,))
    p.start()
    p.join()
    print(x.get_obj().value)
Output:
Test worked!
Note that non-atomic operations on the wrapped value, such as += (which does a read/modify/write), should be protected with a with x.get_lock(): context manager.
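A minimal sketch of that pattern with a shared integer counter (the names are just for illustration):

import multiprocessing as mp

def bump(counter, n):
    for _ in range(n):
        with counter.get_lock():  # += is read/modify/write, so guard it with the lock
            counter.value += 1

if __name__ == "__main__":
    counter = mp.Value('i', 0)
    procs = [mp.Process(target=bump, args=(counter, 10000)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(counter.value)  # 40000 every time, thanks to the lock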
I am following the principles laid down in this post to safely output the results, which will eventually be written to a file. Unfortunately, the code only prints 1 and 2, and not 3 to 6.
import os
import argparse
import pandas as pd
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist):
    for par in parlist:
        queue.put(par)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        try:
            par = queueIn.get(block=False)
            res = doCalculation(par)
            queueOut.put((res))
            queueIn.task_done()
        except:
            break

def doCalculation(par):
    return par

def write(queue):
    while True:
        try:
            par = queue.get(block=False)
            print("response:", par)
        except:
            break

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
On running the code it prints,
$ python3 tst.py
Queue size 6
response: 1
response: 2
Also, is it possible to ensure that the write function always outputs 1,2,3,4,5,6 i.e. in the same order in which the data is fed into the feed queue?
The error comes from the task_done() call: a plain multiprocessing.Queue has no task_done() method (only JoinableQueue does), so the call raises an AttributeError, which your bare except swallows, and each worker breaks out of its loop after its first item. If you remove the task_done() call it works, but then the loop only ends because queueIn.get(block=False) throws an exception once the queue is empty. That might be just enough for your use case; a better way, though, is to use sentinels (as suggested in the multiprocessing docs, see the last example there). Here's a little rewrite so your program uses sentinels:
import os
import argparse
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist, nthreads):
    for par in parlist:
        queue.put(par)
    for i in range(nthreads):
        queue.put(None)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        par = queueIn.get()
        if par is None:
            break
        res = doCalculation(par)
        queueOut.put((res))

def doCalculation(par):
    return par

def write(queue):
    while not queue.empty():
        par = queue.get()
        print("response:", par)

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod, nthreads))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
A few things to note:
The sentinel is a None put into the queue, and you need one sentinel for every worker process.
For the write function you don't need the sentinel handling, as there is only one process and no concurrency to deal with. (If you used the empty()-then-get() approach in your calc function, you would run into a problem: with only one item left in the queue, both workers could see empty() return False at the same time, both would then call get(), and one of them would block forever.)
You don't need to put feed and write into processes; just call them from your main function, since you don't want to run them in parallel anyway.
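As mentioned above, task_done()/join() semantics do exist in multiprocessing, but on JoinableQueue rather than the plain Queue. A minimal sketch of that variant, in case those semantics are what was originally intended (this is not what the rewrite above uses):

from multiprocessing import JoinableQueue, Process

def worker(q):
    while True:
        item = q.get()
        if item is None:      # sentinel: stop working
            q.task_done()
            break
        print("processed", item)
        q.task_done()         # valid here: JoinableQueue implements task_done()

if __name__ == "__main__":
    q = JoinableQueue()
    p = Process(target=worker, args=(q,))
    p.start()
    for i in [1, 2, 3]:
        q.put(i)
    q.put(None)
    q.join()                  # blocks until every put item has been marked task_done()
    p.join()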
how can I have the same order in output as in input? [...] I guess multiprocessing.map can do this
Yes, map keeps the order. Rewriting your program into something simpler (you don't need the workerQueue and writerQueue) and adding random sleeps to show that the output is still in order:
from multiprocessing import Pool
import time
import random

def calc(val):
    time.sleep(random.random())
    return val

if __name__ == "__main__":
    considerperiod = [1, 2, 3, 4, 5, 6]
    with Pool(processes=2) as pool:
        print(pool.map(calc, considerperiod))