How do I sleep for long periods with PyQt threads?

I have a number of objects which each need to run a specific function at ever-changing intervals, again and again, until they decide they are done.
For example, one object may need to wait 30 seconds, run, wait 60 seconds, run, wait 10 seconds, run... You get the point, and this could be going on for 30-120 different objects, running the exact same kind of function.
I was thinking that simply having a function that sleeps for the exact interval would solve my problem, but, correct me if I'm wrong, I remembered that thread pools can only run a certain number of threads at any given time (12 for me). How do I get around this limit?
class Thing(object):
    def getCurrentPeriod(self):
        return random.randint(5, 30)  # Some ever-changing period of time

    def refresh(self):
        doThings()  # A long-running task that is disk and network intensive

    def waitRefresh(self):
        period = self.getCurrentPeriod()
        time.sleep(period)  # Wait that period out
        self.refresh()
        return self.needRefresh()
        # Boolean if it needs to restart - Not sure about how to reschedule,
        # or specifically where to connect the worker emit when it finishes
        # to make sure this *specific* Thing obj gets its waitRefresh func called again.


class App(QMainWindow):
    def __init__(self, *args, **kwargs):
        super(App, self).__init__(*args, **kwargs)
        self.threadpool = QThreadPool()
        # Add initial objects to pool (other portions of app may add more over time)
        for thing in self.acquireThings():
            worker = Worker(thing.waitRefresh)
            self.threadpool.start(worker)
This doesn't include the WorkerSignals class or the QRunnable subclass; the example just shows what I usually do. It tackles the same problem, but in a (most likely) inefficient way.
Edit: I added a complete working example showing that time.sleep does not release the pool thread and allow others to work. I feel that async may be the only implementation, but is there a quick fix so I don't have to alter my entire app?
Here's what it looks like when you try to sleep more than 12 threads.
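A minimal sketch of what's going on (my reconstruction, not the code behind that screenshot): each sleeping QRunnable holds one pool thread for its entire wait, so workers beyond maxThreadCount() cannot even start until an earlier one finishes.

import sys
import time
from PyQt5.QtCore import QCoreApplication, QRunnable, QThreadPool

class SleepTask(QRunnable):
    def __init__(self, n):
        super().__init__()
        self.n = n

    def run(self):
        print(f'task {self.n} sleeping')
        time.sleep(5)  # holds one pool thread for the whole wait

app = QCoreApplication(sys.argv)
pool = QThreadPool.globalInstance()
print('max threads:', pool.maxThreadCount())
for n in range(30):
    pool.start(SleepTask(n))  # only maxThreadCount() tasks run at once; the rest queue up
pool.waitForDone()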

The ultimate solution came when I decided to actually try the QTimer class. Perhaps there are more optimized solutions, but this one seems to hit all the checkboxes, even if it's worryingly simple.
import random
import time
import traceback
from functools import partial

from PyQt5.QtCore import *
from PyQt5.QtGui import QFont
from PyQt5.QtWidgets import *


class WorkerSignals(QObject):
    """
    Represents the signals a Worker can emit.
    """
    finished = pyqtSignal()
    starting = pyqtSignal(int)  # ID of thread
    result = pyqtSignal(tuple)  # refresh result tuple: (result, ID)


class Worker(QRunnable):
    """
    A worker designed to tell when it's starting, when it's finished and the result.
    Designed to work around Thread.refresh().
    """

    def __init__(self, fn, thread_id, *args, **kwargs):
        super(Worker, self).__init__()
        # Store constructor arguments (re-used for processing)
        self.fn = fn
        self.id = thread_id
        self.args = args
        self.kwargs = kwargs
        self.signals = WorkerSignals()

    @pyqtSlot()
    def run(self):
        """
        Runs a given method, and emits the result with the Worker's coordinated ID.
        """
        try:
            self.signals.starting.emit(self.id)  # Thread is now finally ready to work.
            result = self.fn(*self.args, **self.kwargs)  # Refresh Thread!
            self.signals.result.emit(result)  # Thread is finished, emit result tuple.
        except:
            traceback.print_exc()
        finally:
            self.signals.finished.emit()  # Done


class Thread(object):
    """
    Basic rules for a Thread object:
    Cannot store the next timestamp on the object (it's a database object, and I don't
    believe it's good practice to create sessions over and over simply to read/write
    the access time). ID and Active are allowed as booleans.
    """
    i = -1

    def __init__(self):
        self.id = Thread.nextID()
        self.active = True
        self.refreshes = 0

    def refresh(self) -> tuple:
        """
        'Refreshes' a thread. Waits a specific period, then decides whether the Thread
        object should be deactivated or returned for additional refreshes. The chance
        of deactivation lowers with each refresh.
        :return: The refresh result, a tuple with a boolean and the thread's ID (for identifying it later)
        """
        # Represents my SQLAlchemy model's refresh() function
        self.refreshes += 1
        time.sleep(random.randint(2, 5))
        if random.random() <= max(0.1, 1.0 - ((self.refreshes + 5) / 10)):
            self.active = False
        return self.active, self.id

    @staticmethod
    def getRefreshTime() -> float:
        """
        Represents the amount of time before a thread should be refreshed.
        Should NOT be used to determine whether the thread is still active or not.
        :return: The time period that should be waited.
        """
        return random.uniform(10, 300)

    @staticmethod
    def nextID() -> int:
        """
        Returns integer thread IDs in sequence to remove the possibility of duplicate IDs.
        :return: Integer Thread ID
        """
        Thread.i += 1
        return Thread.i

    def __repr__(self):
        return f'Thread(id={self.id} active={self.active})'


class MainWindow(QMainWindow):
    """
    GUI containing a Label, Button and ListWidget showing all the active sleeping/working threads.
    Manages a threadpool, a number of background singleshot timers, etc.
    """

    def __init__(self, *args, **kwargs):
        super(MainWindow, self).__init__(*args, **kwargs)

        # Widgets setup
        layout = QVBoxLayout()
        self.list = QListWidget()
        self.l = QLabel("Total Active: 0")
        self.button = QPushButton("Refresh List")
        self.button.pressed.connect(self.refreshList)
        self.button.setDisabled(True)
        layout.addWidget(self.l)
        layout.addWidget(self.button)
        layout.addWidget(self.list)
        w = QWidget()
        w.setLayout(layout)
        self.setCentralWidget(w)
        self.show()

        # Periodically add threads to the pool.
        self.poolTimer = QTimer()
        self.poolTimer.setInterval(5_000)
        self.poolTimer.timeout.connect(self.addThreads)

        # Threading setup
        self.threadpool = QThreadPool()
        print("Multithreading with maximum %d threads" % self.threadpool.maxThreadCount())
        self.active, self.threads = {}, {}

        # Add a number of threads to start with.
        for _ in range(random.randint(5, 16)):
            self.setupThread(Thread())

        self.poolTimer.start()

    def refreshList(self):
        """
        Refreshes the ListWidget in the GUI with all the active/sleeping/working threads.
        """
        self.list.clear()
        bold = QFont()
        bold.setBold(True)
        active = 0
        for thread in self.threads.values():
            item = QListWidgetItem(f'Thread {thread.id}/{thread.refreshes}')
            # Bold a thread if it's working
            if self.active[thread.id]:
                active += 1
                item.setFont(bold)
            self.list.addItem(item)
        self.l.setText(f'Total Active: {active}/{len(self.threads)}')

    def refreshResult(self, result) -> None:
        """
        When a thread is finished, the result determines its next course of action, which is either
        to return to the pool again, or delete itself.
        :param result: A tuple containing the result (bool) and the connected Thread ID.
        """
        self.active[result[1]] = False
        if result[0]:
            print(f'Restarting Thread {result[1]}')
            self.setupThread(self.threads[result[1]])  # Add by ID, which would normally be a database GET
        else:
            print(f'Thread {result[1]} shutting down.')
            del self.active[result[1]]
            del self.threads[result[1]]
        self.refreshList()

    def updateActivity(self, thread_id) -> None:
        """
        Connected to the starting signal, helps signal when a thread is actually being refreshed.
        :param thread_id: The Thread ID
        """
        print(f'Thread {thread_id} is now active/working.')
        self.active[thread_id] = True

    def refresh(self, thread):
        """
        Adds a new worker to the threadpool to be refreshed.
        This can't be considered a real start to the thread.refresh function, as the pool has a
        max of 12 workers at any time. The 'starting' signal tells us when a specific thread is
        actually being refreshed, and is represented as a bold element in the list.
        :param thread: A thread instance.
        """
        print(f'Adding Thread {thread.id} to the pool.')
        worker = Worker(thread.refresh, thread_id=thread.id)
        worker.signals.result.connect(self.refreshResult)
        worker.signals.starting.connect(self.updateActivity)
        self.threadpool.start(worker)
        # self.active[thread.id] = True
        self.refreshList()

    def setupThread(self, thread) -> None:
        """
        Adds a new timer designated to start a specific thread.
        :param thread: A thread instance.
        """
        self.active[thread.id] = False
        self.threads[thread.id] = thread
        t = QTimer()
        period = thread.getRefreshTime()
        t.singleShot(int(period * 1000), partial(self.refresh, thread=thread))  # msec must be an int
        print(f'Thread {thread.id} will start in {period} seconds.')
        self.refreshList()

    def addThreads(self):
        """
        Adds a number of threads to the pool. Called automatically every couple of seconds.
        """
        add = max(0, 30 + random.randint(-5, 5) - len(self.threads))
        if add > 0:
            print(f'Adding {add} thread{"s" if add > 1 else ""}.')
            for _ in range(add):
                self.setupThread(Thread())


app = QApplication([])
window = MainWindow()
app.exec_()
When a Thread is requested, a QTimer is created and singleShot fires a helper function that adds it to the threadpool. The pool can handle up to 12 concurrently 'refreshing' threads, and signals allow the GUI to update the moment a change is found.
Thousands of 'Thread' objects can be waiting, and singleShot seems capable of adding them to the pool exactly when they need to be.
Signals help differentiate when a thread is sleeping, working and active (inactive Thread objects are removed immediately).
The only caveats I can think of with this program are:
1) Can a QThread implementation beat it?
2) What happens to a QTimer once its singleShot function has executed and fired? Will it be properly GC'd, or can thousands build up in the background consuming resources?
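On caveat 2: since singleShot() is actually a static method, the t = QTimer() created in setupThread is never really used; it's an ordinary Python object that should be GC'd like any other. Calling the static overload directly sidesteps the question entirely; a minimal sketch of setupThread rewritten that way:

    def setupThread(self, thread) -> None:
        self.active[thread.id] = False
        self.threads[thread.id] = thread
        period = thread.getRefreshTime()
        # Static overload: no QTimer instance is kept around; Qt manages the
        # internal one-shot timer itself and releases it after it fires.
        QTimer.singleShot(int(period * 1000), partial(self.refresh, thread=thread))
        print(f'Thread {thread.id} will start in {period} seconds.')
        self.refreshList()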

Related

QGraphicsEffect on pyqt, blinking a button

I am building a GUI in Python and PyQt.
The GUI has a lot of push buttons, generated through a LED class, meaning each LED has 3 buttons, for any number n of LEDs.
For a few of the buttons, I want an effect that changes the opacity of the push button in a loop from 0 to 1 and back again, so it disappears and reappears. I need only one process to manage them all, so the effect starts at the same time for every button and all blink in sync.
I've managed to achieve that through QGraphicsEffect in a thread, iterating through a list.
The problem is that after a few minutes the effect stops, although the thread is still running (print(opacity_level) keeps printing). More push buttons with the effect make it stop even sooner. Clicking any button, even others without the effect, restarts the GUI animation.
My small research into threading in PyQt made me implement this thread manager, although I do not fully understand it.
import sys
import traceback
from PyQt5 import QtCore, QtWidgets


class WorkerSignals(QtCore.QObject):
    finished = QtCore.pyqtSignal()
    error = QtCore.pyqtSignal(tuple)
    result = QtCore.pyqtSignal(object)
    progress = QtCore.pyqtSignal(tuple)


class Worker(QtCore.QRunnable):
    '''
    Worker thread
    Inherits from QRunnable to handle worker thread setup, signals and wrap-up.
    '''

    def __init__(self, fn, *args, **kwargs):
        super(Worker, self).__init__()
        # Store constructor arguments (re-used for processing)
        self.fn = fn
        self.args = args
        self.kwargs = kwargs
        self.signals = WorkerSignals()
        # Add the callback to our kwargs
        self.kwargs['progress_callback'] = self.signals.progress

    @QtCore.pyqtSlot()
    def run(self):
        '''
        Initialise the runner function with passed args, kwargs.
        '''
        # Retrieve args/kwargs here; and fire processing using them
        try:
            result = self.fn(*self.args, **self.kwargs)
        except:
            traceback.print_exc()
            exctype, value = sys.exc_info()[:2]
            self.signals.error.emit((exctype, value, traceback.format_exc()))
        else:
            self.signals.result.emit(result)  # Return the result of the processing
        finally:
            self.signals.finished.emit()  # Done
Next, the LEDs class:
class LEDs:
    def __init__(self, name, group, frame):
        self.opacity_effect = QtWidgets.QGraphicsOpacityEffect()
        self.button_auto = QtWidgets.QPushButton()
        self.button_auto.setObjectName("button_auto_neutral")
        self.button_auto.clicked.connect(lambda state, x=self: self.AutoMode())

    def AutoMode(self):
        print(self.name, "Automode")
        # Start the thread only if there is no previous thread; both the thread and
        # this check reference the size of settings.blink, so not ideal.
        if len(settings.blink) == 0:
            print("start thread")
            settings.ledAutomode()
        settings.blink.append(self)
And finally the Settings class, which runs the thread performing the effect. There is a second thread, which handles the icon of the button according to a timetable.
class Settings:
    def __init__(self):
        self.blink = []

    def ledAutomode(self):
        def blink(progress_callback):
            print("opacity")
            op_up = [x / 100 for x in range(0, 101, 5)]
            op_down = op_up[::-1]
            op_down = op_down[1:-1]
            opacity = op_up + op_down
            while len(self.blink) != 0:
                for i in opacity:
                    print(i)
                    QtCore.QThread.msleep(80)
                    for led in self.blink:
                        led.opacity_effect.setOpacity(i)

        def timeCheck(progress_callback):
            while len(self.blink) != 0:
                QtCore.QThread.msleep(500)
                for led in self.blink:
                    matrix = [v for v in settings.leds_config[led.group][led.name]["Timetable"]]
                    matrix_time = []
                    ...
                    # some code
                    ...
                    if sum(led_on_time):
                        led.button_auto.setObjectName("button_auto_on")
                        led.button_auto.setStyleSheet(ex.stylesheet)
                    else:
                        led.button_auto.setObjectName("button_auto_off")
                        led.button_auto.setStyleSheet(ex.stylesheet)
                QtCore.QThread.msleep(int(30000 / len(self.blink)))

        worker = Worker(blink)  # Any other args, kwargs are passed to the run function
        ex.threadpool.start(worker)
        worker2 = Worker(timeCheck)  # Any other args, kwargs are passed to the run function
        ex.threadpool.start(worker2)
So perhaps it's a limitation of QGraphicsEffect, or some problem with the thread (although it keeps printing), or I made some error somewhere.
I've read about subclassing QGraphicsEffect, but I don't know if that solves the problem.
If anyone has another implementation, I'm always eager to learn.
Grateful for your time.
Widgets are not thread-safe.
They cannot be created or accessed from external threads. While it "sometimes" works, doing so is wrong and usually leads to unexpected behavior, drawing artifacts and even fatal crashes.
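If you do keep a worker thread, the safe pattern is to emit a signal and let the connected slot, which runs in the GUI thread via a queued connection, touch the widget. A minimal sketch with hypothetical names:

# Sketch of the thread-safe pattern: the worker only computes values and
# emits a signal; Qt delivers it to a slot running in the main (GUI) thread.
class BlinkSignals(QtCore.QObject):
    opacity = QtCore.pyqtSignal(float)

class BlinkWorker(QtCore.QRunnable):
    def __init__(self):
        super().__init__()
        self.signals = BlinkSignals()

    def run(self):
        for step in range(1000):
            QtCore.QThread.msleep(80)
            self.signals.opacity.emit((step % 20) / 20.0)  # compute in the worker...

# ...and connect in the main thread, so only the GUI thread touches the effect:
# worker = BlinkWorker()
# worker.signals.opacity.connect(led.opacity_effect.setOpacity)
# threadpool.start(worker)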
That said, you're making the whole process far more convoluted than it needs to be, most importantly because Qt already provides both timed events (QTimer) and animations.
class FadeButton(QtWidgets.QPushButton):
    def __init__(self):
        super().__init__()
        self.effect = QtWidgets.QGraphicsOpacityEffect(opacity=1.0)
        self.setGraphicsEffect(self.effect)
        self.animation = QtCore.QPropertyAnimation(self.effect, b'opacity')
        self.animation.setStartValue(1.0)
        self.animation.setEndValue(0.0)
        self.animation.setDuration(1500)
        self.animation.finished.connect(self.checkAnimation)
        self.clicked.connect(self.startAnimation)

    def startAnimation(self):
        self.animation.stop()
        self.animation.setDirection(self.animation.Forward)
        self.animation.start()

    def checkAnimation(self):
        if not self.animation.currentValue():
            self.animation.setDirection(self.animation.Backward)
            self.animation.start()
        else:
            self.animation.setDirection(self.animation.Forward)
If you want to synchronize opacity amongst many widgets, there are various possibilities, but a QVariantAnimation that updates all opacities is probably the easier choice:
class LEDs(QtWidgets.QWidget):
    def __init__(self, parent=None):
        super().__init__(parent)
        layout = QtWidgets.QHBoxLayout(self)
        self.animation = QtCore.QVariantAnimation()
        self.animation.setStartValue(1.0)
        self.animation.setEndValue(0.0)
        self.animation.setDuration(1500)
        self.animation.valueChanged.connect(self.updateOpacity)
        self.animation.finished.connect(self.checkAnimation)
        self.buttons = []
        for i in range(3):
            button = QtWidgets.QPushButton()
            self.buttons.append(button)
            layout.addWidget(button)
            effect = QtWidgets.QGraphicsOpacityEffect(opacity=1.0)
            button.setGraphicsEffect(effect)
            button.clicked.connect(self.startAnimation)

    # ... as above ...

    def updateOpacity(self, opacity):
        for button in self.buttons:
            button.graphicsEffect().setOpacity(opacity)
Note that you shouldn't change the object name of a widget at runtime, and doing it only because you want to update the stylesheet is wrong. Either use a different stylesheet, or use a property selector:
QPushButton {
    /* default state */
    background: #ababab;
}
QPushButton[auto_on="true"] {
    /* "on" state */
    background: #dadada;
}
class FadeButton(QtWidgets.QPushButton):
    def __init__(self):
        super().__init__()
        # ...
        self.setProperty('auto_on', False)

    def setAuto(self, state):
        self.setProperty('auto_on', state)
        self.setStyleSheet(self.styleSheet())
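As a footnote, re-assigning the same stylesheet (as above) is one way to force the selector to re-evaluate; another idiom I believe works is asking the widget's style to re-polish it:

    def setAuto(self, state):
        self.setProperty('auto_on', state)
        # Force the style to re-evaluate the [auto_on] selector without
        # re-assigning the whole stylesheet string.
        self.style().unpolish(self)
        self.style().polish(self)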

Slow multiprocessing when parent object contains large data

Consider the following snippet:
import numpy as np
import multiprocessing as mp
import time

def work_standalone(args):
    return 2

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))
        # leave a trace whenever init is called
        with open('rnd-%d' % np.random.randint(100), 'a') as f:
            f.write('init called\n')

    def work_internal(self, args):
        return 2

    def _run(self, target):
        with mp.Pool() as pool:
            tasks = [[idx] for idx in range(16)]
            result = pool.imap(target, tasks)
            for res in result:
                pass

    def run_internal(self):
        self._run(self.work_internal)

    def run_standalone(self):
        self._run(work_standalone)

if __name__ == '__main__':
    t1 = time.time()
    Worker().run_standalone()
    t2 = time.time()
    print(f'Standalone took {t2 - t1:.3f} seconds')
    t3 = time.time()
    Worker().run_internal()
    t4 = time.time()
    print(f'Internal took {t3 - t4:.3f} seconds')
I.e. we have an object containing a large variable and using multiprocessing to parallelize some work that has nothing to do with that large variable, i.e. it neither reads from nor writes to it. The location of the worker function has a huge impact on the runtime:
Standalone took 0.616 seconds
Internal took 19.917 seconds
Why is this happening? I am completely lost. Note that __init__ is only called twice, so the random data is not created for every new process in the pool. The only reason I can think of why this would be slow is that the data is copied around, but that would not make sense, since it is never used anywhere, and Python is supposed to use copy-on-write semantics. Also note that the difference disappears if you make run_internal a static method.
The issue is due to the target you are passing to the pool: that target is a method holding a reference to the Worker instance.
Now, you're right that __init__() is only called twice. But remember, when you send anything to and from the processes, Python needs to pickle the data first.
So, because your target is self.work_internal, Python has to pickle the Worker() instance for every task that imap dispatches. This leads to the issue: self.data gets copied over again and again.
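You can check this yourself with pickle (my quick demonstration, not part of the original post): pickling the bound method drags the whole instance, data included, along with it:

import pickle

w = Worker()
# The bound method pickles `self`, so the ~800 MB array comes along:
print(len(pickle.dumps(w.work_internal)))   # hundreds of megabytes
# The module-level function pickles as a mere name reference:
print(len(pickle.dumps(work_standalone)))   # a few dozen bytes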
The following is the proof. I just added an input() statement and fixed the final time calculation.
import numpy as np
import multiprocessing as mp
import time

def work_standalone(args):
    return 2

class Worker:
    def __init__(self):
        self.data = np.random.random(size=(10000, 10000))
        # leave a trace whenever init is called
        with open('rnd-%d' % np.random.randint(100), 'a') as f:
            f.write('init called\n')

    def work_internal(self, args):
        return 2

    def _run(self, target):
        with mp.Pool() as pool:
            tasks = [[idx] for idx in range(16)]
            result = pool.imap(target, tasks)
            input("Wait for analysis")
            for res in result:
                pass

    def run_internal(self):
        self._run(self.work_internal)
        # self._run(work_standalone)

    def run_standalone(self):
        self._run(work_standalone)

def work_internal(target):
    with mp.Pool() as pool:
        tasks = [[idx] for idx in range(16)]
        result = pool.imap(target, tasks)
        for res in result:
            pass

if __name__ == '__main__':
    t1 = time.time()
    Worker().run_standalone()
    t2 = time.time()
    print(f'Standalone took {t2 - t1:.3f} seconds')
    t3 = time.time()
    Worker().run_internal()
    t4 = time.time()
    print(f'Internal took {t4 - t3:.3f} seconds')
You can run the code, and when "Wait for analysis" shows up, go and check the memory usage.
Then, the second time you see the message, press enter, and observe the memory usage increasing and then decreasing again.
On the other hand, if you change self._run(self.work_internal) to self._run(work_standalone), you will notice that it runs very fast and the memory does not increase; the time taken is a lot shorter than with self.work_internal.
Solution
One way to solve your issue is to make self.data a static class variable. In normal cases, this prevents instances from having to copy/reinitialize the variable again. This also prevents the issue from occurring.
class Worker:
    data = np.random.random(size=(10000, 10000))

    def __init__(self):
        pass
    ...
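An alternative sketch (my suggestion, not from the original answer): have each worker process build or load the data once via the pool's initializer, so nothing large is ever attached to the tasks themselves:

import numpy as np
import multiprocessing as mp

_data = None  # per-process global, set once when each worker starts

def init_worker():
    global _data
    _data = np.random.random(size=(10000, 10000))  # or load it from disk

def work(args):
    return 2  # may freely read _data; it already lives in this process

if __name__ == '__main__':
    with mp.Pool(initializer=init_worker) as pool:
        results = list(pool.imap(work, [[idx] for idx in range(16)]))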

Python PyQt5: based on condition, run a CPU intensive QThread [duplicate]

I am trying to figure out why this code crashes if I try to run the threads a second time once they have completed.
The first time I click "Start 5 Threads" it runs just fine and finishes. But if I click it again, the entire program crashes and I get the QThread: Destroyed while thread is still running error.
This code was found on the web; I am trying to learn from it.
import time
import sys
from PyQt5.QtCore import QObject, QThread, pyqtSignal, pyqtSlot
from PyQt5.QtWidgets import QApplication, QPushButton, QTextEdit, QVBoxLayout, QWidget


def trap_exc_during_debug(*args):
    # when app raises uncaught exception, print info
    print(args)

# install exception hook: without this, uncaught exception would cause application to exit
sys.excepthook = trap_exc_during_debug


class Worker(QObject):
    """
    Must derive from QObject in order to emit signals, connect slots to other signals, and operate in a QThread.
    """
    sig_step = pyqtSignal(int, str)  # worker id, step description: emitted every step through work() loop
    sig_done = pyqtSignal(int)  # worker id: emitted at end of work()
    sig_msg = pyqtSignal(str)  # message to be shown to user

    def __init__(self, id: int):
        super().__init__()
        self.__id = id
        self.__abort = False

    @pyqtSlot()
    def work(self):
        """
        Pretend this worker method does work that takes a long time. During this time, the thread's
        event loop is blocked, except if the application's processEvents() is called: this gives every
        thread (incl. main) a chance to process events, which in this sample means processing signals
        received from GUI (such as abort).
        """
        thread_name = QThread.currentThread().objectName()
        thread_id = int(QThread.currentThreadId())  # cast to int() is necessary
        self.sig_msg.emit('Running worker #{} from thread "{}" (#{})'.format(self.__id, thread_name, thread_id))
        for step in range(100):
            time.sleep(0.1)
            self.sig_step.emit(self.__id, 'step ' + str(step))
            # check if we need to abort the loop; need to process events to receive signals;
            app.processEvents()  # this could cause change to self.__abort
            if self.__abort:
                # note that "step" value will not necessarily be same for every thread
                self.sig_msg.emit('Worker #{} aborting work at step {}'.format(self.__id, step))
                break
        self.sig_done.emit(self.__id)

    def abort(self):
        self.sig_msg.emit('Worker #{} notified to abort'.format(self.__id))
        self.__abort = True


class MyWidget(QWidget):
    NUM_THREADS = 5
    # sig_start = pyqtSignal()  # needed only due to PyCharm debugger bug (!)
    sig_abort_workers = pyqtSignal()

    def __init__(self):
        super().__init__()
        self.setWindowTitle("Thread Example")
        form_layout = QVBoxLayout()
        self.setLayout(form_layout)
        self.resize(400, 800)
        self.button_start_threads = QPushButton()
        self.button_start_threads.clicked.connect(self.start_threads)
        self.button_start_threads.setText("Start {} threads".format(self.NUM_THREADS))
        form_layout.addWidget(self.button_start_threads)
        self.button_stop_threads = QPushButton()
        self.button_stop_threads.clicked.connect(self.abort_workers)
        self.button_stop_threads.setText("Stop threads")
        self.button_stop_threads.setDisabled(True)
        form_layout.addWidget(self.button_stop_threads)
        self.log = QTextEdit()
        form_layout.addWidget(self.log)
        self.progress = QTextEdit()
        form_layout.addWidget(self.progress)
        QThread.currentThread().setObjectName('main')  # threads can be named, useful for log output
        self.__workers_done = None
        self.__threads = None

    def start_threads(self):
        self.log.append('starting {} threads'.format(self.NUM_THREADS))
        self.button_start_threads.setDisabled(True)
        self.button_stop_threads.setEnabled(True)
        self.__workers_done = 0
        self.__threads = []
        for idx in range(self.NUM_THREADS):
            worker = Worker(idx)
            thread = QThread()
            thread.setObjectName('thread_' + str(idx))
            self.__threads.append((thread, worker))  # need to store worker too otherwise will be gc'd
            worker.moveToThread(thread)
            # get progress messages from worker:
            worker.sig_step.connect(self.on_worker_step)
            worker.sig_done.connect(self.on_worker_done)
            worker.sig_msg.connect(self.log.append)
            # control worker:
            self.sig_abort_workers.connect(worker.abort)
            # get ready to start worker:
            # self.sig_start.connect(worker.work)  # needed due to PyCharm debugger bug (!); comment out next line
            thread.started.connect(worker.work)
            thread.start()  # this will emit 'started' and start thread's event loop
            # self.sig_start.emit()  # needed due to PyCharm debugger bug (!)

    @pyqtSlot(int, str)
    def on_worker_step(self, worker_id: int, data: str):
        self.log.append('Worker #{}: {}'.format(worker_id, data))
        self.progress.append('{}: {}'.format(worker_id, data))

    @pyqtSlot(int)
    def on_worker_done(self, worker_id):
        self.log.append('worker #{} done'.format(worker_id))
        self.progress.append('-- Worker {} DONE'.format(worker_id))
        self.__workers_done += 1
        if self.__workers_done == self.NUM_THREADS:
            self.log.append('No more workers active')
            self.button_start_threads.setEnabled(True)
            self.button_stop_threads.setDisabled(True)
            # self.__threads = None

    @pyqtSlot()
    def abort_workers(self):
        self.sig_abort_workers.emit()
        self.log.append('Asking each worker to abort')
        for thread, worker in self.__threads:  # note nice unpacking by Python, avoids indexing
            thread.quit()  # this will quit **as soon as thread event loop unblocks**
            thread.wait()  # <- so you need to wait for it to *actually* quit
        # even though threads have exited, there may still be messages on the main thread's
        # queue (messages that threads emitted before the abort):
        self.log.append('All threads exited')


if __name__ == "__main__":
    app = QApplication([])
    form = MyWidget()
    form.show()
    sys.exit(app.exec_())
The problem is solved by giving each thread self as its parent. You must change:
thread = QThread()
to:
thread = QThread(parent=self)
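Why this works (my reading, not spelled out in the original answer): the second click runs start_threads again, and self.__threads = [] drops the only Python references to the old QThread objects. Any of them whose event loop is still running gets garbage-collected out from under itself, producing QThread: Destroyed while thread is still running. With parent=self, Qt's parent-child ownership keeps the threads alive regardless of the Python list. A sketch of the changed loop body:

        for idx in range(self.NUM_THREADS):
            worker = Worker(idx)
            # parent=self ties the QThread's lifetime to the widget, so reassigning
            # self.__threads can no longer garbage-collect a still-running thread.
            thread = QThread(parent=self)
            thread.setObjectName('thread_' + str(idx))
            self.__threads.append((thread, worker))
            worker.moveToThread(thread)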

Python3: Multiprocessing consumes extensively much RAM and slows down

I start multiple processes in order to create a list of new objects. htop shows me between 1 and 4 processes (I always create 3 new objects).
def foo(self):
    with multiprocessing.Pool(processes=3, maxtasksperchild=10) as pool:
        result = pool.map_async(self.new_obj, self.information)
        self.new_objs = result.get()
        pool.terminate()
    gc.collect()
I call foo() multiple times; each time it is called, the whole process runs slower, and in the end the program does not even finish, as it slows down too much. The program starts to eat up all my RAM, while the sequential approach does not have any significant RAM usage.
When I kill the program, most of the time this was the function the program was last executing:
File "threading.py", line 293, in wait
    waiter.acquire()
Edit
To give some information about my circumstances: I create a tree made of nodes. foo() is called by a parent node in order to create its child nodes. The results returned by the processes are these child nodes, which are saved in a list on the parent node. I want to parallelize the creation of those child nodes instead of creating them sequentially.
I think your issue mainly has to do with the fact that your parallelised function is a method of the object. It's hard to be certain without more information, but consider this little toy program:
import multiprocessing as mp
import numpy as np
import gc


class Object(object):
    def __init__(self, _):
        self.data = np.empty((100, 100, 100), dtype=np.float64)


class Container(object):
    def __new__(cls):
        self = object.__new__(cls)
        print("Born")
        return self

    def __init__(self):
        self.objects = []

    def foo(self):
        with mp.Pool(processes=3, maxtasksperchild=10) as pool:
            result = pool.map_async(self.new_obj, range(50))
            self.objects.extend(result.get())
            pool.terminate()
        gc.collect()

    def new_obj(self, i):
        return Object(i)

    def __del__(self):
        print("Dead")


if __name__ == '__main__':
    c = Container()
    for j in range(5):
        c.foo()
Now Container is created only once, so you'd expect to see a "Born" followed by a "Dead" being printed out; but since the code being executed by the processes is a method of the container, the whole container has to be serialised and rebuilt elsewhere! Running this, you will see a stream of intermingled "Born" and "Dead" as your container is being rebuilt on every execution of map:
Born
Born
Born
Born
Born
Dead
Born
Dead
Dead
Born
Dead
Born
...
<MANY MORE LINES HERE>
...
Born
Dead
To convince yourself that the entire container is being copied and sent around every time, try to set some non-serialisable value:
def foo(self):
    with mp.Pool(processes=3, maxtasksperchild=10) as pool:
        result = pool.map_async(self.new_obj, range(50))
        self.fn = lambda x: x**2
        self.objects.extend(result.get())
        pool.terminate()
    gc.collect()
This will immediately raise an AttributeError, as the container cannot be serialised.
Let's sum up: when sending 1000 requests to the pool, Container will be serialised, sent to the processes and deserialised there 1000 times. Sure, those copies will eventually be dropped (assuming there's not too much weird cross-referencing going on), but that definitely puts a lot of pressure on the RAM, as the object is serialised, called, updated, reserialised... for every element in your mapped inputs.
How can you solve that? Well, ideally, do not share state:
def new_obj(_):
    return Object(_)


class Container(object):
    def __new__(cls):
        self = object.__new__(cls)
        print("Born")
        return self

    def __init__(self):
        self.objects = []

    def foo(self):
        with mp.Pool(processes=3, maxtasksperchild=10) as pool:
            result = pool.map_async(new_obj, range(50))
            self.objects.extend(result.get())
            pool.terminate()
        gc.collect()

    def __del__(self):
        print("Dead")
This completes in a fraction of the time, and only produces the tiniest blip in RAM usage (as only a single Container is ever built). If you need some of the internal state to be passed along, extract it and send just that:
def new_obj(tup):
    very_important_state, parameters = tup
    return Object(very_important_state=very_important_state,
                  parameters=parameters)


class Container(object):
    def __new__(cls):
        self = object.__new__(cls)
        print("Born")
        return self

    def __init__(self):
        self.objects = []

    def foo(self):
        important_state = len(self.objects)
        with mp.Pool(processes=3, maxtasksperchild=10) as pool:
            result = pool.map_async(new_obj,
                                    ((important_state, i) for i in range(50)))
            self.objects.extend(result.get())
            pool.terminate()
        gc.collect()

    def __del__(self):
        print("Dead")
This has the same behaviour as before. If you absolutely cannot avoid sharing some mutable state between the processes, check out the multiprocessing tools for doing that without having to copy everything everywhere every time.
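For reference, a minimal sketch (my example, not from the answer) of one such tool, multiprocessing.Value, which shares a single counter between processes without copying any parent state:

import multiprocessing as mp

def work(counter):
    with counter.get_lock():      # Value carries its own lock
        counter.value += 1

if __name__ == '__main__':
    counter = mp.Value('i', 0)    # shared int living in shared memory
    procs = [mp.Process(target=work, args=(counter,)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(counter.value)          # 4: all processes updated the same object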

Threaded result not giving same result as un-threaded result (python)

I have created a program to generate data points of functions that I later plot. The program takes a class which defines the function, and creates a data-outputting object which, when called, generates the data to a text file. To make the whole process faster I put the jobs in threads; however, when I do, the data generated is not always correct. I have attached a picture to show what I mean:
Here are some of the relevant bits of code:
from queue import Queue
import threading
import time

queueLock = threading.Lock()
workQueue = Queue(10)


def process_data(threadName, q, queue_window, done):
    while not done.get():
        queueLock.acquire()  # check whether or not the queue is locked
        if not workQueue.empty():
            data = q.get()
            # data is the Plot object to be run
            queueLock.release()
            data.parent_window = queue_window
            data.process()
        else:
            queueLock.release()
        time.sleep(1)


class WorkThread(threading.Thread):
    def __init__(self, threadID, q, done):
        threading.Thread.__init__(self)
        self.ID = threadID
        self.q = q
        self.done = done

    def get_qw(self, queue_window):
        # gets the queue_window object
        self.queue_window = queue_window

    def run(self):
        # this is called when thread.start() is called
        print("Thread {0} started.".format(self.ID))
        process_data(self.ID, self.q, self.queue_window, self.done)
        print("Thread {0} finished.".format(self.ID))


class Application(Frame):
    def __init__(self, etc):
        self.threads = []
        # does some things

    def makeThreads(self):
        for i in range(1, int(self.threadNum.get()) + 1):
            thread = WorkThread(i, workQueue, self.calcsDone)
            self.threads.append(thread)

    # more code which just processes the function etc, sorts out the gui stuff.
And in a separate class (as I'm using tkinter, the actual code that gets the threads to run is called in a different window; self.parent is the Application class):
def run_jobs(self):
    if self.running == False:
        # threads are only initiated when jobs are to be run
        self.running = True
        self.parent.calcsDone.set(False)
        self.parent.threads = []  # just to make sure that it is initially empty, we want new threads each time
        self.parent.makeThreads()
        self.threads = self.parent.threads
        for thread in self.threads:
            thread.get_qw(self)
            thread.start()
        # put the jobs in the workQueue
        queueLock.acquire()
        for job in self.job_queue:
            workQueue.put(job)
        queueLock.release()
    else:
        messagebox.showerror("Error", "Jobs already running")
This is all the code which relates to the threads.
I don't know why, when I run the program with multiple threads, some data points are incorrect, while running it with just a single thread the data is all perfect. I tried looking up "thread-safe" processes, but couldn't find anything.
Thanks in advance!
