BrokenProcessPool: A process in the process pool was terminated abruptly [Spyder]

I'm running the following code in Python 3.7 using Spyder 3.3.2.
If I run the code manually (i.e., in the Spyder editor I select the library imports and press "F9", then I select the function I want to run and press "F9"), it works.
But if I press "Play" in the toolbar to run the complete file, it doesn't work, and I get:
BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending
I'm aware of the known problem with Spyder, and I'm already running in the current working directory. I've also used the if __name__ == "__main__": trick to run my function as main().
The code I'm building is intended to be used by people not familiar with programming, so for them it's much safer to click "Play" than to select and run specific bits of code.
Here's the code. The content of the function doesn't seem to be the problem: when it works, it parallelizes well, using 100% of the 8 cores.
If it helps, the biggest file imported is less than 1 MB.
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
import numpy as np
from MPO_PROC import N_tab, N_tab_FCT, TYPE, TR
from MPO_FUNC import *

def main():
    # Set the variables for the loop.
    alpha = 0.05
    alg_type = TYPE
    futures = []
    e = ProcessPoolExecutor(8)
    if alg_type == "O":
        F = FFR_ME
    elif alg_type == "NO":
        F = FFR_ME_NO
    else:
        raise TypeError("Algorithm type selected does not exist. Must be 'O' or 'NO'")
    # Loop over the different tasks summarized in the table 'N_tab' during the MPO_PROC step.
    for task in N_tab["TASK_NUMBER"]:
        # Declare variables N, n, f based on the previous calculations.
        N = N_tab_FCT.loc[N_tab_FCT["TASK_NUMBER"] == task, "N_Task"]
        n = N_tab_FCT.loc[N_tab_FCT["TASK_NUMBER"] == task, "n_i"]
        f = N_tab_FCT.loc[N_tab_FCT["TASK_NUMBER"] == task, "F"]
        # Submit the work through concurrent.futures for multiprocessing.
        for Ni, ni, fi in zip(N, n, f):
            future = e.submit(F, Ni, ni, fi, alpha)
            futures.append(future)
    results = [ff.result() for ff in futures]
    for i in range(len(results)):
        f = int(N_tab_FCT.loc[i, "F"])
        N_tab_FCT.loc[i, "LBound"] = results[i][0][f]
        N_tab_FCT.loc[i, "UBound"] = results[i][1][f]
        N_tab_FCT.loc[i, "ME"] = (N_tab_FCT.loc[i, "UBound"] - N_tab_FCT.loc[i, "LBound"]) / \
                                 (2 * N_tab_FCT.loc[i, "N_Task"])
        N_tab_FCT.loc[i, "FFR"] = (N_tab_FCT.loc[i, "LBound"] + (N_tab_FCT.loc[i, "UBound"] - N_tab_FCT.loc[i, "LBound"]) / 2) / \
                                  N_tab_FCT.loc[i, "N_Task"]

if __name__ == "__main__":
    main()
Thank you.
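As a sanity check, a minimal self-contained sketch of the same guarded pattern can help separate the environment from the code; the square worker below is a hypothetical stand-in for FFR_ME. If even this raises BrokenProcessPool when run with "Play", the problem lies in how Spyder launches the file rather than in the script above.

from concurrent.futures import ProcessPoolExecutor

def square(x):
    # trivial stand-in for the real worker function
    return x * x

if __name__ == "__main__":
    # The context manager shuts the pool down cleanly even on errors.
    with ProcessPoolExecutor(max_workers=8) as e:
        futures = [e.submit(square, i) for i in range(16)]
        results = [f.result() for f in futures]
    print(results)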

Related

Why is serial code faster than concurrent.futures in this case?

I am using the following code to process some pictures for my ML project and I would like to parallelize it.
import multiprocessing as mp
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some_value  # placeholder: the real processing is omitted
    return ood

seqs = []
for seq in range(1, 10):  # len(seqs)+1):
    seq = txt + str(seq)
    seqs.append(seq)
    # serial call of the function
    track_ids(seq)

# parallel call of the function
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
    ood_id = ex.map(track_ids, seqs)
If I run the code serially it takes 3.0 minutes, but in parallel with concurrent.futures it takes 3.5 minutes.
Can someone please explain why that is, and suggest a way to solve the problem?
By the way, I have 12 cores.
Thanks
Here's a brief example of how one might go about profiling multiprocessing code vs serial execution:
from cProfile import Profile
from pstats import Stats
import multiprocessing as mp
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some_value  # placeholder
    return ood

def profile_seq():
    p = Profile()  # one and only profiler instance
    p.enable()
    seqs = []
    for seq in range(1, 10):  # len(seqs)+1):
        seq = txt + str(seq)
        seqs.append(seq)
        # serial call of the function
        track_ids(seq)
    p.disable()
    return Stats(p), seqs

def track_ids_pr(seq):
    p = Profile()  # profile the child tasks
    p.enable()
    retval = track_ids(seq)
    p.disable()
    return (Stats(p, stream="dummy"), retval)

def profile_parallel():
    # NOTE: assumes `seqs` (and `txt`) are defined elsewhere; see the caveat below.
    p = Profile()  # profile stuff in the main process
    p.enable()
    with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
        retvals = ex.map(track_ids_pr, seqs)
    p.disable()
    s = Stats(p)
    out = []
    for ret in retvals:
        s.add(ret[0])
        out.append(ret[1])
    return s, out

if __name__ == "__main__":
    stat, retval = profile_parallel()
    stat.print_stats()
EDIT: Unfortunately I found out that pstats.Stats objects cannot be sent through a multiprocessing.Queue as-is, because they are not picklable (and pickling is also what concurrent.futures relies on). A Stats object normally stores a reference to a file for writing statistics out, and if none is given it grabs a reference to sys.stdout by default. We don't actually need that reference until we want to print the statistics, so we can give it a temporary picklable value to prevent the pickle error, and restore an appropriate value later. The following example should be copy-pasteable and run just fine, unlike the pseudocode-ish example above.
from multiprocessing import Queue, Process
from cProfile import Profile
from pstats import Stats
import sys

def isprime(x):
    for d in range(2, int(x**.5) + 1):  # include the square root itself
        if x % d == 0:
            return False
    return True

def foo(retq):
    p = Profile()
    p.enable()
    primes = []
    max_n = 2**20
    for n in range(3, max_n):
        if isprime(n):
            primes.append(n)
    p.disable()
    retq.put(Stats(p, stream="dummy"))  # dirty hack: set `stream` to something picklable, then override later

if __name__ == "__main__":
    q = Queue()
    p1 = Process(target=foo, args=(q,))
    p1.start()
    p2 = Process(target=foo, args=(q,))
    p2.start()
    s1 = q.get()
    s1.stream = sys.stdout  # restore a real file object
    s2 = q.get()
    # s2.stream: if we are just adding this `Stats` object to another, the `stream` gets thrown away anyway.
    s1.add(s2)  # add up the stats from both child processes
    s1.print_stats()  # s1.stream gets used here, but not before; provide a file instead of sys.stdout to write there
    p1.join()
    p2.join()
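Beyond profiling, one frequent cause of parallel code running slower than serial is per-task overhead: every submitted item costs an inter-process round trip plus pickling of its arguments and results. A hedged sketch of one standard mitigation, the chunksize parameter of Executor.map, which batches many items per round trip (the work function here is a hypothetical stand-in for track_ids):

import concurrent.futures
import multiprocessing as mp

def work(n):
    # hypothetical CPU-bound worker
    return sum(i * i for i in range(n))

if __name__ == "__main__":
    items = [10_000] * 1_000
    with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
        # chunksize batches items per inter-process round trip,
        # amortizing pickling and queue overhead across many calls.
        results = list(ex.map(work, items, chunksize=50))
    print(len(results))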

How to find out how long a search for files will take in Python?

So I have a little app that searches for all XML files on my PC, copying the files whose filename is 44 digits to the "output" folder.
The problem is that the end user needs an indication of the progress and remaining time of the task.
This is the module to copy files:
xml_search.py
import os
import re
from threading import Thread
from datetime import datetime
import time
import shutil
import winsound

os.system('cls')

def get_drives():
    response = os.popen("wmic logicaldisk get caption")
    list1 = []
    for line in response.readlines():
        line = line.strip("\n")
        line = line.strip("\r")
        line = line.strip(" ")
        if line == "Caption" or line == "":
            continue
        list1.append(line + '\\')
    return list1

def search1(drive):
    for root, dirs, files in os.walk(drive):
        for file in files:
            if re.match(r"\d{44}\.xml", file):
                filename = os.path.join(root, file)
                try:
                    shutil.copy(filename, os.path.join('output', file))
                except Exception:
                    pass

def exec_(callback=lambda: None):  # default no-op callback so the module can run directly
    t1 = datetime.now()
    list2 = []  # list of worker threads
    list1 = get_drives()
    for each in list1:
        process1 = Thread(target=search1, args=(each,))
        process1.start()
        list2.append(process1)
    for t in list2:
        t.join()  # wait for the threads to finish
    t2 = datetime.now()
    total = str(t2 - t1)
    with open('times.txt', 'a') as log:
        print(total, file=log)
    for x in range(3):
        winsound.Beep(2000, 100)
        time.sleep(.1)
    callback()

if __name__ == "__main__":
    exec_()
The code below uses the progressbar library and shows an indication of the progress and remaining time of the task:
import progressbar
from time import sleep

bar = progressbar.ProgressBar(maxval=1120,
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.ETA()])
bar.start()
for i in range(1120):
    bar.update(i + 1)
    sleep(0.1)
bar.finish()
You would need to adapt the code above to yours: count the number of files first, pass that count as the maxval argument of the ProgressBar constructor, and remove the sleep call.
The suggested progress-bar solution works with one thread. If you insist on working with multiple threads, you will need to figure out where to initiate the progress bar and where to put the updates; a rough single-threaded sketch of the counting approach follows.
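As one way to apply that suggestion (the walk-and-match logic is condensed from the question's xml_search.py; the function name and error handling are hypothetical), count the matching files in a first pass and tick the bar once per copy:

import os
import re
import shutil
import progressbar

def copy_with_progress(drive, out_dir='output'):
    # First pass: collect matching paths so the total is known up front.
    matches = []
    for root, dirs, files in os.walk(drive):
        for file in files:
            if re.match(r"\d{44}\.xml", file):
                matches.append(os.path.join(root, file))
    if not matches:
        return
    bar = progressbar.ProgressBar(maxval=len(matches),
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.ETA()])
    bar.start()
    for i, path in enumerate(matches):
        shutil.copy(path, os.path.join(out_dir, os.path.basename(path)))
        bar.update(i + 1)  # one tick per copied file drives the ETA estimate
    bar.finish()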
Try to implement a timer decorator like the following:
import time

def mytimer(func):
    def wrapper():
        t1 = time.time()
        result = func()
        t2 = time.time()
        print(f"The function {func.__name__} was run {t2 - t1} seconds")
        return result
    return wrapper

@mytimer
def TimeConsumingFunction():
    time.sleep(3)
    print("Hello timers")

TimeConsumingFunction()
Output:
/usr/bin/python3.7 /home/user/Documents/python-workspace/timers/example.py
Hello timers
The function TimeConsumingFunction was run 3.002610206604004 seconds
Process finished with exit code 0
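The wrapper above only works for functions that take no arguments. A slightly generalized variant (an extension of the answer's decorator, not part of it) forwards arbitrary arguments and preserves the wrapped function's metadata:

import time
import functools

def mytimer(func):
    @functools.wraps(func)  # preserve the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print(f"The function {func.__name__} was run {t2 - t1} seconds")
        return result
    return wrapper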

Multi-Processing to share memory between processes

I am trying to update a variable of a class by calling a method of the class from a function that is run in multiple processes.
To achieve the desired result, process (p1) needs to update the variable "transactions", which should then get modified by process (p2).
I tried the code below, and I know I should use multiprocessing.Value or a Manager to achieve the desired result, but I am not sure how to do it, since the variable to be updated lives in another class.
Below is the code:
from multiprocessing import Process
from helper import Helper

camsource = ['a', 'b']
Pros = []

def sub(i):
    HC.trail_func(i)

def main():
    for i in camsource:
        print("Camera Thread {} Started!".format(i))
        p = Process(target=sub, args=(i,))  # args must be a tuple
        Pros.append(p)
        p.start()
    # block until all the processes finish
    for t in Pros:
        t.join()

if __name__ == "__main__":
    HC = Helper()
    main()
Here is the helper code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

class Helper():
    def __init__(self):
        self.transactions = []

    def trail_func(self, preview):
        if preview == 'a':
            self.transactions.append({"Apple": 1})
        else:
            if self.transactions[0]['Apple'] == 1:
                self.transactions[0]['Apple'] = self.transactions[0]['Apple'] + 1
        print(self.transactions)
Desired Output:
p1:
transactions = {"Apple":1}
p2:
transactions = {"Apple":2}
I've recently released a module that can help you with your code: all of its data frames (data models that can hold any type of data) have locks on them, in order to solve concurrency issues. Anyway, take a look at the README file and the examples.
I've made an example for this case too, if you'd like to check.
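For comparison, the standard-library route the asker mentions would look roughly like this: a minimal sketch using multiprocessing.Manager so both processes operate on the same shared list (my own illustration, not the module advertised above).

from multiprocessing import Process, Manager

def trail_func(transactions, preview):
    # same logic as Helper.trail_func, but against a shared list
    if preview == 'a':
        transactions.append({"Apple": 1})
    else:
        if transactions[0]['Apple'] == 1:
            # nested dicts in a managed list must be reassigned to propagate
            d = transactions[0]
            d['Apple'] += 1
            transactions[0] = d
    print(list(transactions))

if __name__ == "__main__":
    with Manager() as m:
        transactions = m.list()
        for i in ['a', 'b']:
            p = Process(target=trail_func, args=(transactions, i))
            p.start()
            p.join()  # run sequentially so 'a' happens before 'b'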

Python freezes when accessing string value in subprocess

I spent nearly the whole day on this and have come to the end of my knowledge:
I want to change a shared multiprocessing.Value string in a subprocess, but Python hangs as soon as the subprocess tries to change the shared value.
Below is an example code:
from multiprocessing import Process, Value, freeze_support
from ctypes import c_wchar_p

def test(x):
    with x.get_lock():
        x.value = 'THE TEST WORKED'
    return

if __name__ == "__main__":
    freeze_support()
    value = Value(c_wchar_p, '')
    p = Process(target=test, args=(value,))
    p.start()
    print(p.pid)
    # this try block is to also allow p.run()
    try:
        p.join()
        p.terminate()
    except:
        pass
    print(value.value)
What I tried that does not work:
I tried ctypes c_wchar_p and c_char_p, but both result in the same freezing.
I also tried without x.get_lock().
I also tried without freeze_support().
What works (but does not help):
Using a float as the shared value (value = Value('d', 0) and x.value = 1).
Running the process without starting a subprocess (replacing p.start() with p.run()).
I am using Windows 10 64-bit and Python 3.6.4 (Spyder, but I also tried outside of Spyder).
Any help welcome!
A shared pointer won't work in another process because the pointer is only valid in the process in which it was created. Instead, use an array:
import multiprocessing as mp

def test(x):
    x.value = b'Test worked!'

if __name__ == "__main__":
    x = mp.Array('c', 15)
    p = mp.Process(target=test, args=(x,))
    p.start()
    p.join()
    print(x.value)
Output:
b'Test worked!'
Note that array type 'c' is specialized and returns a SynchronizedString vs. other types that return SynchronizedArray. Here's how to use type 'u' for example:
import multiprocessing as mp

def test(x):
    x.get_obj().value = 'Test worked!'

if __name__ == "__main__":
    x = mp.Array('u', 15)
    p = mp.Process(target=test, args=(x,))
    p.start()
    p.join()
    print(x.get_obj().value)
Output:
Test worked!
Note that non-atomic operations on the wrapped value, such as += (which does a read/modify/write), should be protected with a with x.get_lock(): context manager.
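To make that concrete, here is a small illustrative sketch of guarding a non-atomic increment on a shared integer:

import multiprocessing as mp

def bump(counter):
    for _ in range(10_000):
        with counter.get_lock():  # serializes the read/modify/write
            counter.value += 1

if __name__ == "__main__":
    counter = mp.Value('i', 0)
    workers = [mp.Process(target=bump, args=(counter,)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(counter.value)  # reliably 40000 with the lock held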

Calling a def from a thread

Does anyone know how to call a def from a thread?
Clock program:
import sys
from tkinter import *
from tkinter import messagebox
from tkinter import filedialog
from time import sleep
import threading

class MyThread(threading.Thread):
    def mclock():  # function that it can't call
        x = 1
        z = 0
        while x != -1:
            Label(mGui, text=str(x) + "second(s)").pack()
            x = x + 1
            sleep(1)
            if x == 60:
                x = 1
                z = z + 1
                Label(mGui, text=str(z) + " minute(s) has past.").pack()
            return
        return

MyThread().start()

mGui = Tk()
mGui.geometry("300x200+100+100")
mGui.title("Jono's Clock")

menubar = Menu(mGui)
filemenu = Menu(menubar, tearoff=0)
filemenu.add_command(label="Clock", command=mclock)  # can't use function
menubar.add_cascade(label="File", menu=filemenu)

mGui.config(menu=menubar)
mGui.mainloop()
If anyone sees any other errors, please point them out. I am using Windows 7 and Python 3.3.
There are several syntax errors in the code you've posted, and I'm not sure exactly what you intended with them, so here's an overview of how to run stuff from threads.
If you want your thread to run your own code from a custom thread class, the usual way to do that is to put the code in a method named run, which will be executed automatically when the thread is started:
import threading

class MyThread(threading.Thread):
    def run(self):
        # do your stuff here
        print("Hello World")

MyThread().start()
Alternatively, if you don't need a class, you can create your function at the top level of your module, and pass it as an argument to threading.Thread's constructor:
def my_function():
    print("Hello World")

threading.Thread(target=my_function).start()
Note that you often want to keep a reference to the thread object, rather than letting it go as the code above does. This requires you to use two lines to create and then start the thread:
thread = MyThread() # or the alternative version
thread.start()
This lets you later do:
thread.join()
Which ensures that the thread has finished its work.
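If the function needs arguments (the clock example above does not, but it is a common next step), pass them through the constructor's args tuple; a small illustrative sketch:

import threading

def countdown(n, label):
    # hypothetical worker that counts down under a given label
    for i in range(n, 0, -1):
        print(f"{label}: {i}")

t = threading.Thread(target=countdown, args=(3, "clock"))
t.start()
t.join()  # wait until the countdown finishes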
