ThreadPoolExecutor keeps waiting when an exception happens - python-3.x

from asyncio import FIRST_EXCEPTION
from concurrent.futures.thread import ThreadPoolExecutor
from queue import Queue
from concurrent.futures import wait
import os

def worker(i: int, in_queue: Queue) -> None:
    while 1:
        data = in_queue.get()
        if data is None:
            in_queue.put(data)
            print(f'worker {i} exit')
            return
        print(os.path.exists(data))

def main():
    with ThreadPoolExecutor(max_workers=2) as executor:
        queue = Queue(maxsize=2)
        workers = [executor.submit(worker, i, queue) for i in range(2)]
        for obj in [{'fn': '/path/to/sth'}, {}]:
            fn = obj['fn']  # here is the exception
            queue.put((fn,))
        queue.put(None)
        done, error = wait(workers, return_when=FIRST_EXCEPTION)
        print(done, error)

main()
This program gets stuck when the exception happens.
From the log:
Traceback (most recent call last):
  File "test.py", line 34, in <module>
    main()
  File "test.py", line 31, in main
    print(done, error)
  File "/.pyenv/versions/3.7.4/lib/python3.7/concurrent/futures/_base.py", line 623, in __exit__
    self.shutdown(wait=True)
  File "/.pyenv/versions/3.7.4/lib/python3.7/concurrent/futures/thread.py", line 216, in shutdown
    t.join()
  File "/.pyenv/versions/3.7.4/lib/python3.7/threading.py", line 1044, in join
    self._wait_for_tstate_lock()
  File "/.pyenv/versions/3.7.4/lib/python3.7/threading.py", line 1060, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
KeyboardInterrupt
It happens because the wait call stays blocked, which is strange because the exception is raised before wait is even reached. It should return as soon as the exception happens!
Why doesn't it return immediately when the exception happens?

Yes, I found the reason. When the exception happens, the with block exits, and on exit the executor calls self.shutdown(wait=True), so the main thread waits for the worker threads to finish; however, the worker threads keep running, because the None sentinel is never put on the queue. So the solution is to shut down the executor manually with wait=False:
try:
    ## code here
except Exception as e:
    traceback.print_exc()
    executor.shutdown(False)
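For concreteness, here is a minimal sketch of main() with that fix applied (assuming the worker and imports from the question above): on a producer-side exception, print the traceback, put the None sentinel so the blocked workers can exit, and shut down without waiting.

import traceback
from concurrent.futures import FIRST_EXCEPTION, wait

def main():
    with ThreadPoolExecutor(max_workers=2) as executor:
        queue = Queue(maxsize=2)
        workers = [executor.submit(worker, i, queue) for i in range(2)]
        try:
            for obj in [{'fn': '/path/to/sth'}, {}]:
                fn = obj['fn']  # raises KeyError on the second dict
                queue.put((fn,))
            queue.put(None)
            done, error = wait(workers, return_when=FIRST_EXCEPTION)
            print(done, error)
        except Exception:
            traceback.print_exc()
            queue.put(None)  # unblock the workers so they see the sentinel and exit
            executor.shutdown(wait=False)

With the sentinel queued, even the with block's implicit shutdown(wait=True) can complete, so the program no longer hangs.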

Why shouldn't an Event instance be put into a Queue in Python multiprocessing?

When I try to put an Event instance into a Queue, the Python interpreter raises a RuntimeError as below:
RuntimeError: Condition objects should only be shared between processes through inheritance
My Example Code:
import time
from multiprocessing import Process, Queue, Event

def slaver(q: Queue, e: Event):
    while True:
        print("do1", e)
        _, _ = q.get(block=True)
        time.sleep(3)
        e.set()
        print("do2")

def start():
    q = Queue()
    e = Event()
    p = Process(target=slaver, args=(q, e))
    p.start()
    while True:
        print("1")
        q.put((1, e))
        print("2", e)
        wait = e.wait(timeout=1)
        print("3", wait)
        e.clear()
        print("4")
        time.sleep(5)

if __name__ == '__main__':
    start()
Output
1
2 <multiprocessing.synchronize.Event object at 0x1028d8df0>
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/queues.py", line 239, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/synchronize.py", line 220, in __getstate__
    context.assert_spawning(self)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/context.py", line 359, in assert_spawning
    raise RuntimeError(
RuntimeError: Condition objects should only be shared between processes through inheritance
do1 <multiprocessing.synchronize.Event object at 0x1075b6eb0>
3 False
4
1
2 <multiprocessing.synchronize.Event object at 0x1028d8df0>
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/queues.py", line 239, in _feed
    obj = _ForkingPickler.dumps(obj)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/synchronize.py", line 220, in __getstate__
    context.assert_spawning(self)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/multiprocessing/context.py", line 359, in assert_spawning
    raise RuntimeError(
RuntimeError: Condition objects should only be shared between processes through inheritance
And if I replace q.put((1, e)) with q.put((1, 2)), the exception disappears.
But there is an example of using an Event across multiple threads; the difference is that my code uses processes. The multiprocessing Event is cloned from threading, so what is the difference?
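For reference, a minimal sketch of the inheritance-based sharing that the error message asks for (an assumption about intent, not part of the original question): pass the Event once, as a Process argument, and send only plain picklable data through the Queue.

import time
from multiprocessing import Process, Queue, Event

def slaver(q, e):
    while True:
        item = q.get(block=True)  # only plain data travels through the queue
        time.sleep(3)
        e.set()                   # the Event arrived via inheritance, not via the queue
        print("done", item)

def start():
    q = Queue()
    e = Event()
    # Sharing "through inheritance" means passing the Event at Process
    # creation time, which is exactly what assert_spawning() permits.
    p = Process(target=slaver, args=(q, e), daemon=True)
    p.start()
    q.put((1, 2))                 # picklable payload only; no Event inside
    print(e.wait(timeout=5))

if __name__ == '__main__':
    start()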

Python Multiprocessing - 'map_async' RuntimeError issue handling

Here is a sample of the code I'm running.
def function(elem):
    var1 = elem[0]
    var2 = elem[1]
    length = nx.shortest_path_length(G, source=var1, target=var2)
    return length

p = mp.Pool(processes=4)
results = p.map_async(function, iterable=elements)
track_job(results)
p.close()
p.join()
The error I'm facing is:
Exception in thread Thread-1:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.8/multiprocessing/managers.py", line 192, in accepter
    t.start()
  File "/usr/local/lib/python3.8/threading.py", line 852, in start
    _start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread
After this error, the entire process is halted/paused.
Two questions:
Is this issue at the hardware level, or can it be avoided?
How can I handle this error so that the other processes keep running? And how can I retry the faulty task?
TIA
I typically do stuff like this, maybe it will work for you:
import psutil
import multiprocessing as mp

# Either size the pool to the physical cores:
#   p = mp.Pool(psutil.cpu_count(logical=False))
# or leave one logical core free, as below.
with mp.Pool((psutil.cpu_count() - 1) or 1) as p:
    try:
        # map_async returns an AsyncResult, which is not iterable;
        # call .get() to block for the results (and surface any exception).
        results = p.map_async(function, elements).get()
    except Exception as e1:
        print(f'{e1}')
    finally:
        p.close()
        p.join()
For track_job(results), you may want to multiprocess that as well, depending on what it's doing.
The try / except / finally error handling can also help your program continue when it hits any exception, using Exception as the base exception case. A sketch of the retry idea follows.
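On the retry question, a hedged sketch (assumptions: function and elements are as in the question, a failed element is safe to resubmit, and capturing per-item errors inside the worker is acceptable):

import multiprocessing as mp

def safe_function(elem):
    # Turn an exception into a return value so one bad element
    # doesn't take down the whole batch.
    try:
        return ('ok', function(elem))
    except Exception as e:
        return ('err', (elem, repr(e)))

def run_with_retries(elements, retries=2):
    pending = list(elements)
    results, failed = [], []
    for _ in range(retries + 1):
        failed = []
        with mp.Pool(processes=4) as p:
            for status, payload in p.imap_unordered(safe_function, pending):
                if status == 'ok':
                    results.append(payload)
                else:
                    failed.append(payload[0])  # queue just the element for retry
        if not failed:
            break
        pending = failed
    return results, failed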

Google PubSub - restarting subscription after exception raised

Good day,
I am running some long-running async jobs using PubSub to trigger a function. Occasionally, the task may fail. In such cases, I simply want to log the exception, acknowledge the message, and restart the subscription to ensure that the subscriber is still pulling new messages after the failure has occurred.
I have placed some simplified code to demonstrate my current set up below:
try:
    while True:
        streaming_pull_future = workers.subscriber.subscribe(
            subscription_path,
            callback=worker_task  # includes logic to ack() the message if it has failed before
        )
        print(f'Listening for messages on {subscription_path}')
        try:
            streaming_pull_future.result()
        except Exception as e:
            print(streaming_pull_future.cancelled())  # <-- this evaluates to False
            streaming_pull_future.cancel()  # <-- this raises RuntimeError: set_result can only be called once.
            print(e)
except KeyboardInterrupt:  # seems to be an issue as per PubSub GitHub issue #17; no keyboard interrupt
    streaming_pull_future.cancel()
I keep seeing RuntimeError: set_result can only be called once when I execute streaming_pull_future.cancel() in the exception handler. I checked whether the subscriber had already been cancelled, but when I logged the status it evaluated to False. Yet when I then call the cancel() method, I get the error. I want to ensure that any threads are cleaned up before making a new subscription, in case I hit several errors in a row. Does anyone know why this is happening and a way around it?
I am running Python 3.7.4 with PubSub 1.2.0 and grpcio 1.27.1.
Update:
As per the comments, please see a reproducible example below. The stack trace comes first, followed by the code:
Listening for messages on projects/trigger-web-app/subscriptions/load-job-sub
968432700946405
Top-level exception occurred in callback while processing a message
Traceback (most recent call last):
  File "C:\..\lib\site-packages\google\cloud\pubsub_v1\subscriber\_protocol\streaming_pull_manager.py", line 71, in _wrap_callback_errors
    callback(message)
  File "test.py", line 19, in worker_task
    a = 1/0  # cause an exception to be raised
ZeroDivisionError: division by zero
968424309156485
Top-level exception occurred in callback while processing a message
Traceback (most recent call last):
  File "C:\...\lib\site-packages\google\cloud\pubsub_v1\subscriber\_protocol\streaming_pull_manager.py", line 71, in _wrap_callback_errors
    callback(message)
  File "test.py", line 19, in worker_task
    a = 1/0  # cause an exception to be raised
ZeroDivisionError: division by zero
Traceback (most recent call last):
  File "test.py", line 29, in main
    streaming_pull_future.result()
  File "C:...\lib\site-packages\google\cloud\pubsub_v1\futures.py", line 105, in result
    raise err
  File "C:\...\lib\site-packages\google\cloud\pubsub_v1\subscriber\_protocol\streaming_pull_manager.py", line 71, in _wrap_callback_errors
    callback(message)
  File "test.py", line 19, in worker_task
    a = 1/0  # cause an exception to be raised
ZeroDivisionError: division by zero

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "test.py", line 35, in <module>
    main()
  File "test.py", line 31, in main
    streaming_pull_future.cancel()
  File "C:\...\lib\site-packages\google\cloud\pubsub_v1\subscriber\futures.py", line 46, in cancel
    return self._manager.close()
  File "C:\...\lib\site-packages\google\cloud\pubsub_v1\subscriber\_protocol\streaming_pull_manager.py", line 496, in close
    callback(self, reason)
  File "C:\...\lib\site-packages\google\cloud\pubsub_v1\subscriber\futures.py", line 37, in _on_close_callback
    self.set_result(True)
  File "C:\...\lib\site-packages\google\cloud\pubsub_v1\futures.py", line 155, in set_result
    raise RuntimeError("set_result can only be called once.")
RuntimeError: set_result can only be called once.
import os
from google.cloud import pubsub_v1

subscriber = pubsub_v1.SubscriberClient()
project_id = os.environ['GOOGLE_CLOUD_PROJECT']
subscription_name = os.environ['GOOGLE_CLOUD_PUBSUB_SUBSCRIPTION_NAME']
subscription_path = f'projects/{project_id}/subscriptions/{subscription_name}'

def worker_task(message):
    job_id = message.message_id
    print(job_id)
    a = 1 / 0  # cause an exception to be raised
    message.ack()

def main():
    streaming_pull_future = subscriber.subscribe(
        subscription_path, callback=worker_task
    )
    print(f'Listening for messages on {subscription_path}')
    try:
        streaming_pull_future.result()
    except Exception as e:
        streaming_pull_future.cancel()  # if an exception was raised in the callback, this raises the RuntimeError
        print(e)

if __name__ == '__main__':
    main()
Thank you.
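One possible way around it, sketched below under an explicit assumption: it is acceptable to swallow the RuntimeError from the redundant cancel() while re-subscribing in a loop. This is a workaround, not an official client pattern, and it reuses subscriber, subscription_path, and worker_task from the example above.

import time

def listen_forever():
    while True:
        streaming_pull_future = subscriber.subscribe(
            subscription_path, callback=worker_task
        )
        print(f'Listening for messages on {subscription_path}')
        try:
            streaming_pull_future.result()
        except KeyboardInterrupt:
            streaming_pull_future.cancel()
            raise
        except Exception as e:
            print(f'Stream failed: {e}; restarting subscription')
            try:
                streaming_pull_future.cancel()
            except RuntimeError:
                # The manager already resolved the future while shutting down,
                # so cancel() tried to set_result() a second time; ignore it here.
                pass
            time.sleep(1)  # brief backoff before re-subscribing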

How do I catch an exception that occurred within the target of a Process?

I am creating a process to execute a function. If the function raises an exception, I am not able to catch it. The following is sample code:
from multiprocessing import Process
import traceback
import time

class CustomTimeoutException(Exception):
    pass

def temp1():
    time.sleep(5)
    print('In temp1')
    raise Exception('I have raised an exception')

def main():
    try:
        p = Process(target=temp1)
        p.start()
        p.join(10)
        if p.is_alive():
            p.terminate()
            raise CustomTimeoutException('Timeout')
    except CustomTimeoutException as e:
        print('in Custom')
        print(e)
    except Exception as e:
        print('In exception')
        print(e)

if __name__ == "__main__":
    main()
When I run the above code, the exception raised within temp1 does not get caught. Below is the sample output:
In temp1
Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "temp.py", line 12, in temp1
    raise Exception('I have raised an exception')
Exception: I have raised an exception
I have also tried overriding the run method of the Process class, as mentioned in https://stackoverflow.com/a/33599967/9971556, but it wasn't very helpful.
Expected output:
In exception
I have raised an exception
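One common pattern, sketched here under the assumption that the exception instance is picklable, is to have the child send its exception back over a Pipe and re-raise it in the parent:

from multiprocessing import Process, Pipe
import time

def temp1(conn):
    try:
        time.sleep(5)
        print('In temp1')
        raise Exception('I have raised an exception')
    except Exception as e:
        conn.send(e)  # ship the exception object to the parent
        conn.close()

def main():
    parent_conn, child_conn = Pipe()
    p = Process(target=temp1, args=(child_conn,))
    p.start()
    p.join(10)
    if parent_conn.poll():  # did the child report an exception?
        raise parent_conn.recv()

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print('In exception')
        print(e)

This prints the expected output above; the timeout/terminate logic from the question can be layered on top unchanged.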

python hangs even with exception handling

I've got a Raspberry Pi attached to an MCP3008 ADC, which is measuring an analog voltage across a thermistor. I'm using the gpiozero Python library for communication between the Pi and the ADC. My code below runs for several minutes, then spits out an error and hangs in the function get_temp_percent. That function returns the average of five measurements from the ADC. I'm using signal to raise an exception after one second of waiting, to try to get past the hang, but it just prints the error and hangs anyway. It looks like nothing in my except block is being run. Why am I not escaping the hang?
import time
from gpiozero import MCP3008
from math import log
import pymysql.cursors
from datetime import datetime as dt
import signal
import os

def handler(signum, frame):
    print('Signal handler called with signal', signum, frame)
    raise Exception("Something went wrong!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

def get_temp_percent(pos=0):
    x = []
    for i in range(0, 5):
        while True:
            try:
                signal.signal(signal.SIGALRM, handler)
                signal.alarm(1)
                adc = MCP3008(pos)
                x.append(adc.value)
                #adc.close()
            except Exception as inst:
                print('get_temp_percent {}'.format(inst))
                signal.alarm(0)
                continue
            break
        signal.alarm(0)
        time.sleep(.1)
    return round(sum(x)/len(x), 5)

def write_date(temp0):
    # <writes temp0 to mysql db>
    # Connect to the database
    connection = pymysql.connect(host='', user='', password='', db='',
                                 cursorclass=pymysql.cursors.DictCursor)

while True:
    temp_percent = get_temp_percent()
    print('Temp Percent = {}'.format(temp_percent))
    # <some function that does some arithmetic to go from temp_percent to temp0>
    write_date(temp0)
    print('Data Written')
    time.sleep(1)
    print('Sleep time over')
    print('')
The function get_temp_percent causes the problem below:
Signal handler called with signal 14 <frame object at 0x76274800>
Exception ignored in: <bound method SharedMixin.__del__ of SPI(closed)>
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/gpiozero/mixins.py", line 137, in __del__
    super(SharedMixin, self).__del__()
  File "/usr/lib/python3/dist-packages/gpiozero/devices.py", line 122, in __del__
    self.close()
  File "/usr/lib/python3/dist-packages/gpiozero/devices.py", line 82, in close
    old_close()
  File "/usr/lib/python3/dist-packages/gpiozero/pins/local.py", line 102, in close
    self.pin_factory.release_all(self)
  File "/usr/lib/python3/dist-packages/gpiozero/pins/__init__.py", line 85, in release_all
    with self._res_lock:
  File "/home/pi/Desktop/testing exceptions.py", line 13, in handler
    raise Exception("Something went wrong!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
Exception: Something went wrong!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
It looks like your call into gpiozero does a lot of work behind the scenes. When your exception is processed, the library is trying to clean up and gets stuck. I took a quick look at the library's docs, and it looks like you may be able to keep hold of the pins so you can re-use them, e.g.:
import ...

adcs = {}

def get_adc_value(pos):
    # Create each MCP3008 once and cache it, so the device cleanup
    # path that was deadlocking never runs mid-measurement.
    if pos not in adcs:
        adcs[pos] = MCP3008(pos)
    return adcs[pos].value

def get_temp_percent(pos=0):
    x = []
    for i in range(0, 5):
        x.append(get_adc_value(pos))
        time.sleep(.1)
    return round(sum(x)/len(x), 5)

while True:
    temp_percent = get_temp_percent()
    ...
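If you also want the cached devices released cleanly at shutdown (my addition, not part of the answer above), one option is to close them once at interpreter exit rather than letting garbage collection race with the signal handler:

import atexit

def _close_adcs():
    # Hypothetical helper: close each cached MCP3008 exactly once, at exit.
    for adc in adcs.values():
        adc.close()

atexit.register(_close_adcs)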
