Using multiprocessing and ProcessPoolExecutor simultaneously - python-3.x

I am trying to create a simple script for Python 3.5 that can execute heavy computer vision algorithms in parallel. I spawn a process from the main process with multiprocessing.Process.
Inside that process I create a concurrent.futures.ProcessPoolExecutor. The spawned process submits tasks to the ProcessPoolExecutor and that works perfectly fine. But when I try to stop and join the spawned process, it hangs on join.
Also, if I replace the ProcessPoolExecutor with a ThreadPoolExecutor, everything works perfectly. What did I miss?
Here is the main file:
import multiprocessing as mp
import queue as Queue
import numpy as np
import cv2
from time import sleep

import executer_debug

def worker(queue):
    pExecutor = executer_debug.Worker()
    pExecutor.set()
    while True:
        print("-->{}<--".format(pExecutor.get()))
        sleep(1)
        try:
            income = queue.get_nowait()
            break
        except Queue.Empty:
            pass
        pExecutor.set()
    print("<1>{}<1>".format(pExecutor.get()))
    print("<2>{}<2>".format(pExecutor.get()))

def main():
    queue = mp.Queue()
    currProcess = mp.Process(target=worker, args=(queue,))
    currProcess.start()
    frame = np.zeros((480, 640), dtype=np.uint8)
    while True:
        cv2.imshow('frame', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    print("stopped")
    queue.put("stop")
    currProcess.join()

if __name__ == "__main__":
    main()
And here is the second file. The code is very simple, just enough to demonstrate the issue.
import collections
from concurrent.futures import ProcessPoolExecutor
from time import sleep
import multiprocessing as mp

def worker():
    return 1

class Worker():
    def __init__(self):
        self.workers_count = 4
        self.poolExecutor = ProcessPoolExecutor(max_workers=self.workers_count)
        self.executors = collections.deque()

    def set(self):
        self.executors.append(self.poolExecutor.submit(worker))

    def get(self):
        if len(self.executors) > 0:
            if self.executors[0].done():
                return self.executors.popleft().result()
            else:
                return 0
        else:
            return -1
Thank you!
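For context, a minimal, self-contained sketch of the same structure with an explicit shutdown of the pool before the spawned process returns. The with-block below calls the standard shutdown(wait=True); whether this alone resolves the hang on 3.5 is not verified here, it is simply the cleanup step the original worker() never performs:

import multiprocessing as mp
import queue as Queue
from concurrent.futures import ProcessPoolExecutor

def task():
    return 1

def worker(q):
    # the with-block shuts the pool down (and joins its workers) on exit
    with ProcessPoolExecutor(max_workers=4) as pool:
        while True:
            fut = pool.submit(task)
            print("-->{}<--".format(fut.result()))
            try:
                q.get(timeout=1.0)
                break  # "stop" received
            except Queue.Empty:
                pass

if __name__ == "__main__":
    q = mp.Queue()
    p = mp.Process(target=worker, args=(q,))
    p.start()
    input("press Enter to stop: ")
    q.put("stop")
    p.join()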

Related

Script with multiprocessing doesn't finish all tasks and uses 100% CPU?

I need to ask whether part of my script is correct. It seems to work, but I think something is wrong, because the CPU still sits at 100% and it often doesn't finish all the tasks: after 50/100 tasks it seems frozen.
Any info on how to fix it? Or maybe just tell me where the error is?
Thank you.
PS: I have included all the modules that the script requires, and only the part that should be of interest for multiprocessing (just the first part of the script).
Many thanks.
from __future__ import print_function
import sys
import os
import easygui
import pyautogui as py
import datetime
import pwinput
import json
from collections import Counter
import random
import string
import threading
import subprocess
import multiprocessing
import queue
from multiprocessing import cpu_count
from multiprocessing import Value, Lock, Process, Queue, current_process
import numpy as np
import grequests
import requests
from requests.exceptions import ConnectionError
from requests.exceptions import HTTPError
import time
from time import sleep

number_of_processes = cpu_count()

class Counter(object):
    def __init__(self, initval=0):
        self.val = Value('i', initval)
        self.lock = Lock()

    def increment(self):
        with self.lock:
            self.val.value += 1

    def value(self):
        with self.lock:
            return self.val.value

def updateTitle(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username):
    while True:
        hits = int(counterhits.value())
        done = int(counterdone.value())
        shtot = int(countersl.value())
        maitot = int(countml.value())
        remain_scan = number_of_task - hits
        elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
        ctypes.windll.kernel32.SetConsoleTitleW(f'Site Valid For: {number_of_task} | Started: {hits} | Complete: {done} | Remain: {remain_scan} | SL Found: {shtot} | ML Found: {maitot} | Threads: {number_of_processes} | Time elapsed: {elapsed} ! Licensed at: {username}')
        sleep(0.3)

def worker_main(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml):
    while True:
        try:
            site = tasks_to_do.get_nowait()
            if site is None:
                break
        except Queue.Empty:
            break
        except Queue.Full:
            sleep(0.5)
            continue
        counterhits.increment()
        do_work(site, counterhits, counterdone, countersl, countml)
        tasks_finished.put(site + current_process().name)
        counterdone.increment()
    return True

def main():
    global username
    number_of_task = int(len(filter_data))
    counterhits = Counter(0)
    counterdone = Counter(0)
    countersl = Counter(0)
    countml = Counter(0)
    tasks_to_do = Queue()
    tasks_finished = Queue()
    processes1 = []
    prefix = ['http://']
    # creating processes
    for w in range(number_of_processes):
        p1 = Process(target=worker_main, args=(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml))
        processes1.append(p1)
        p1.start()
    procs = [Process(target=updateTitle, args=(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username), daemon=True) for i in range(1)]
    for p in procs: p.start()
    for site_il in filter_data:
        site_or = site_il.rstrip("\n")
        if site_or.startswith("http://"):
            site_or = site_or.replace("http://", "")
        elif site_or.startswith("https://"):
            site_or = site_or.replace("https://", "")
        site_or = site_or.rstrip()
        site_or = site_or.split('/')[0]
        if 'www.' in site_or:
            site_or = site_or.replace("www.", "")
        sitexx = [sub + site_or for sub in prefix]
        for site in sitexx:
            tasks_to_do.put(site)
    # completing process
    for p1 in processes1:
        p1.join()
    for p in procs: p.join()
    # print the output
    while not tasks_finished.empty():
        print(tasks_finished.get())
    os.system('pause>nul')
    return True

if __name__ == '__main__':
    if sys.platform.startswith('win'):
        # On Windows calling this function is necessary.
        multiprocessing.freeze_support()
    main()
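For reference, a minimal, self-contained sketch of the sentinel pattern (the same idea as the sentinel answer further down this page), with illustrative names rather than the ones from the script above. It is not a verified fix for the 100% CPU, but it avoids workers racing an empty queue and guarantees every worker exits, so the joins return:

from multiprocessing import Process, Queue, cpu_count, current_process

def worker_main(tasks_to_do, tasks_finished):
    while True:
        site = tasks_to_do.get()      # blocking get: no Empty/Full juggling
        if site is None:              # sentinel -> this worker is done
            break
        tasks_finished.put(site + " " + current_process().name)

if __name__ == "__main__":
    number_of_processes = cpu_count()
    tasks_to_do, tasks_finished = Queue(), Queue()
    for site in ["http://example.org", "http://example.net"]:
        tasks_to_do.put(site)         # enqueue all work first
    for _ in range(number_of_processes):
        tasks_to_do.put(None)         # one sentinel per worker
    workers = [Process(target=worker_main, args=(tasks_to_do, tasks_finished))
               for _ in range(number_of_processes)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()                      # joins now return cleanly
    while not tasks_finished.empty():
        print(tasks_finished.get())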

Producer Consumer message sharing not working in multiprocessing

I am trying to run a scenario where a producer captures frames from a webcam and puts them in a queue,
and a consumer reads images from the incoming queue, does some processing, and puts the output image in an outgoing queue.
The issue is that the consumer's read from the queue is not blocking, although ideally it should be. Also, when it reads a value from the queue, the size is always a constant 128, which is wrong; I am sure the size of the image I am putting in the queue is far greater.
from __future__ import print_function
import multiprocessing
import time
import logging
import sys
import cv2

class Consumer(multiprocessing.Process):
    def __init__(self, incoming_q, outgoing_q):
        multiprocessing.Process.__init__(self)
        self.outgoing_q = outgoing_q
        self.incoming_q = incoming_q

    def run(self):
        proc_name = self.name
        print(f"{proc_name} - inside process_feed..starting")
        while True:
            # print(f"size of incoming_q=>{self.incoming_q.qsize()}")
            try:
                # print(f"{proc_name} - size of B incoming_q=>{self.incoming_q.qsize()}")
                image_np = self.incoming_q.get(True)
                size_of_img = sys.getsizeof(image_np)
                # print(f"{proc_name} - size of A incoming_q=>{self.incoming_q.qsize()}")
                if size_of_img > 128:
                    print(f"{proc_name} - size image=>{size_of_img}")
                    time.sleep(1)
                    self.outgoing_q.put_nowait(image_np)
            except:
                pass
        print("inside process_feed..ending")

class Producer(multiprocessing.Process):
    def __init__(self, incoming_q, outgoing_q):
        multiprocessing.Process.__init__(self)
        self.incoming_q = incoming_q
        self.outgoing_q = outgoing_q

    def run(self):
        proc_name = self.name
        print("inside capture_feed")
        stream = cv2.VideoCapture(0)
        try:
            counter = 0
            while True:
                counter += 1
                if counter == 1:
                    if not self.incoming_q.full():
                        (grabbed, image_np) = stream.read()
                        size_of_img = sys.getsizeof(image_np)
                        print(f"{proc_name}........B.......=>{self.incoming_q.qsize()}")
                        print(f"{proc_name} - size image=>{size_of_img}")
                        self.incoming_q.put(image_np)
                        print(f"{proc_name}........A.......=>{self.incoming_q.qsize()}")
                    counter = 0
                try:
                    image_np = self.outgoing_q.get_nowait()
                    logging.info("reading value for o/p")
                    cv2.imshow('object detection', image_np)
                except:
                    pass
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
        finally:
            stream.release()
            cv2.destroyAllWindows()
            print("inside capture_feed..ending")

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    stream = cv2.VideoCapture(0)
    incoming_q = multiprocessing.Queue(maxsize=100)
    outgoing_q = multiprocessing.Queue(maxsize=100)
    logging.info("before start of thread")
    max_process = 1
    processes = []
    processes.append(Producer(incoming_q, outgoing_q))
    for i in range(max_process):
        p = Consumer(incoming_q, outgoing_q)
        p.daemon = True
        processes.append(p)
    logging.info("inside main thread..middle")
    for p in processes:
        p.start()
    logging.info("inside main thread..ending")
    logging.info("waiting in main thread too....")
    logging.info("waiting in main thread finished....")
    for p in processes:
        p.join()
    logging.info("inside main thread..ended")
I was able to figure out the issue with my approach: I had missed the whole concept of pickling (serialization).
I changed my code to serialize the numpy array before writing it to the queue and to deserialize it after reading. The code started working as expected.
Also, printing 128 as the size of the np array is fine; I was misinterpreting that number.
import pickle
import numpy as np

def serialize_ndarray(arr: np.ndarray):
    serialized = pickle.dumps(arr)
    return serialized

def deserialize_ndarray(string):
    data = pickle.loads(string)
    return data
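For illustration, a minimal round trip through a multiprocessing.Queue using the two helpers above (a standalone sketch, not from the original post; it assumes the helpers and imports above are in scope):

from multiprocessing import Queue

if __name__ == "__main__":
    q = Queue()
    frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a webcam frame
    q.put(serialize_ndarray(frame))                  # producer side: serialize before put
    restored = deserialize_ndarray(q.get())          # consumer side: deserialize after get
    print(restored.shape, restored.dtype)            # (480, 640, 3) uint8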

How to use watchdog and do something in the main thread if a folder is modified?

I was looking for a watchdog and I found this great library. I need to fit a DBSCAN model when a file is created in a folder. Joblib is used in scikit-learn's DBSCAN implementation, and joblib doesn't allow multiprocessing if the DBSCAN code is not running in the main thread. If I use watchdog, the DBSCAN code can't run in the main thread. How can I solve this issue? Below you can find the watchdog script and a simple function to test it. When I run main_watchdog.py and add a file to the folder the watchdog is watching, it runs simple_function.py in Thread-1. In the meantime, main_watchdog.py runs in the MainThread.
PS: A solution could be starting a subprocess every time simple_function.py is called, but I am afraid this may cause issues if multiple files are created in the watchdog folder. Imagine receiving 10 or 100 or 10000 files at once...
# main_watchdog.py
import time
import logging
import threading
from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler
from a_function import simple_function

class Event(LoggingEventHandler):
    def on_created(self, event):
        simple_function(x)

    def on_modified(self, event):
        simple_function(x)

if __name__ == "__main__":
    x = 1
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    # path = sys.argv[1] if len(sys.argv) > 1 else '.'
    path = '/path/to/watch/the/folder'
    event_handler = Event()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(1)
            print(threading.current_thread().name)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

# a_function.py
import threading

def simple_function(x):
    x += 1
    print(threading.current_thread().name)
    print(x)
If I understand this problem correctly, you need your business logic to run in the main thread and the observer to run in a background thread.
This can be easily solved by running the observer thread in the background using the threading library and then passing the values of those events to your function call by way of a Queue, which is a way of communicating between threads.
# main_watchdog.py
import time
import logging
import threading
from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler
from a_function import simple_function
import sys
from queue import Queue

q = Queue()
x = 1

class Event(LoggingEventHandler):
    def on_created(self, event):
        q.put(x)

    def on_modified(self, event):
        q.put(x)

def run_observer():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    path = sys.argv[1] if len(sys.argv) > 1 else '.'
    # path = '/path/to/watch/the/folder'
    event_handler = Event()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    while True:
        time.sleep(1)
        print(threading.current_thread().name)

if __name__ == "__main__":
    background_thread = threading.Thread(target=run_observer, args=())
    background_thread.daemon = True
    background_thread.start()
    print('Business logic')
    while True:
        val = q.get(True)
        simple_function(val)
The other function can remain the same.
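To tie this back to the original DBSCAN use case, a hypothetical adaptation (not part of the answer above): have the handler put event.src_path on the queue instead of x, and fit the model in the main-thread loop, for example:

import numpy as np
from sklearn.cluster import DBSCAN

def fit_dbscan(path):
    # assumption: each new file is a plain CSV of numeric features
    X = np.loadtxt(path, delimiter=",")
    model = DBSCAN(eps=0.5, min_samples=5, n_jobs=-1)  # joblib parallelism is fine here: we are in the main thread
    return model.fit(X)

# in the main loop of main_watchdog.py:
#     path = q.get(True)            # the handler would do q.put(event.src_path)
#     clustering = fit_dbscan(path)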

Mocking REST APIs with Flask_restful using threading

I'm looking to mock a set of REST APIs for some tests. The following main() function works fine (i.e. it returns {"some-data": 1234} as json to the browser when I GET localhost:8099). The issue is it blocks the main thread:
from gevent import monkey, sleep, pywsgi
monkey.patch_all()
import flask
from flask_restful import reqparse, abort, Api, Resource
import queue
import sys
import threading

STUFFS = {"some-data": 1234}

class Stuff(Resource):
    def get(self):
        return flask.jsonify(STUFFS)

class ControlThread(threading.Thread):
    def __init__(self, http_server, stop_event):
        threading.Thread.__init__(self)
        self.stop_event = stop_event
        self.http_server = http_server
        self.running = False

    def run(self):
        try:
            while not self.stop_event.is_set():
                if not self.running:
                    self.http_server.start()
                    self.running = True
                sleep(0.001)
        except (KeyboardInterrupt, SystemExit):
            pass
        self.http_server.stop()

class StuffMock:
    def __init__(self, port, name=None):
        if name is None:
            name = __name__
        self.app = flask.Flask(name)
        self.api = Api(self.app)
        self.api.add_resource(Stuff, "/stuff/")
        self.stop_event = threading.Event()
        self.http_server = pywsgi.WSGIServer(('', port), self.app)
        self.serving_thread = ControlThread(self.http_server,
                                            self.stop_event)
        self.serving_thread.daemon = True

    def start(self):
        self.serving_thread.start()

    def stop(self):
        self.stop_event.set()
        self.serving_thread.join()

def main():
    mocker = StuffMock(8099)
    mocker.start()
    try:
        while True:
            sleep(0.01)
    except (KeyboardInterrupt, SystemExit):
        mocker.stop()
        sys.exit()

if __name__ == "__main__":
    main()
Without the sleep() call in the while loop above, nothing resolves. Here is a more succinct usage to demonstrate:
import time
from stuff_mock import StuffMock

mocker = StuffMock(8099)
mocker.start()
while True:
    user_text = input("let's do some work on the main thread: ")
    # will only resolve the GET request after user input
    # (i.e. when the main thread executes this sleep call)
    time.sleep(0.1)
    if user_text == "q":
        break
mocker.stop()
The gevent threading module seems to work differently from the core one. Does anyone have any tips or ideas about what's going on under the hood?
I found that if I switch out threading for multiprocessing (and threading.Thread for multiprocessing.Process), everything works as expected, and I can spin up arbitrary numbers of mockers without blocking.
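One way to arrange that (a minimal sketch, not necessarily the exact change described above) is to run the whole mock server in a child process and terminate it when done:

import multiprocessing
import time

def _serve(port):
    from stuff_mock import StuffMock  # import inside the child so gevent's monkey patching stays there
    mocker = StuffMock(port)
    mocker.start()
    while True:                       # keep the child alive; terminate() ends it
        time.sleep(0.1)

if __name__ == "__main__":
    server_proc = multiprocessing.Process(target=_serve, args=(8099,), daemon=True)
    server_proc.start()
    input("let's do some work on the main thread: ")
    server_proc.terminate()           # stop the mock server
    server_proc.join()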

Python multiprocessing script partial output

I am following the principles laid down in this post to safely output the results, which will eventually be written to a file. Unfortunately, the code only prints 1 and 2, and not 3 to 6.
import os
import argparse
import pandas as pd
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist):
    for par in parlist:
        queue.put(par)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        try:
            par = queueIn.get(block=False)
            res = doCalculation(par)
            queueOut.put((res))
            queueIn.task_done()
        except:
            break

def doCalculation(par):
    return par

def write(queue):
    while True:
        try:
            par = queue.get(block=False)
            print("response:", par)
        except:
            break

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
On running the code, it prints:
$ python3 tst.py
Queue size 6
response: 1
response: 2
Also, is it possible to ensure that the write function always outputs 1, 2, 3, 4, 5, 6, i.e. in the same order in which the data is fed into the feed queue?
The error is with the task_done() call: multiprocessing.Queue has no task_done() method (only JoinableQueue does), so that line raises an AttributeError which the bare except swallows, and each worker exits after processing a single item. If you remove that call, it works, but then the queueIn.get(block=False) call throws an exception because the queue is empty. That might be just enough for your use case; a better way, though, would be to use sentinels (as suggested in the multiprocessing docs, see the last example there). Here's a little rewrite so your program uses sentinels:
import os
import argparse
import multiprocessing
from multiprocessing import Process, Queue
from time import sleep

def feed(queue, parlist, nthreads):
    for par in parlist:
        queue.put(par)
    for i in range(nthreads):
        queue.put(None)
    print("Queue size", queue.qsize())

def calc(queueIn, queueOut):
    while True:
        par = queueIn.get()
        if par is None:
            break
        res = doCalculation(par)
        queueOut.put((res))

def doCalculation(par):
    return par

def write(queue):
    while not queue.empty():
        par = queue.get()
        print("response:", par)

if __name__ == "__main__":
    nthreads = 2
    workerQueue = Queue()
    writerQueue = Queue()
    considerperiod = [1, 2, 3, 4, 5, 6]
    feedProc = Process(target=feed, args=(workerQueue, considerperiod, nthreads))
    calcProc = [Process(target=calc, args=(workerQueue, writerQueue)) for i in range(nthreads)]
    writProc = Process(target=write, args=(writerQueue,))
    feedProc.start()
    feedProc.join()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()
    writProc.start()
    writProc.join()
A few things to note:
- The sentinel is a None put into the queue. Note that you need one sentinel for every worker process.
- For the write function you don't need the sentinel handling, since there's only one process and no concurrency to deal with. (If you did the empty()-then-get() thing in your calc function, you would run into a problem if, say, there's only one item left in the queue, both workers see empty() return False at the same time, both call get(), and one of them blocks forever.)
- You don't need to put feed and write into processes; just call them from your main function, as you don't want to run them in parallel anyway.

"how can I have the same order in output as in input? [...] I guess multiprocessing.map can do this"

Yes, map keeps the order. Rewriting your program into something simpler (you don't need the workerQueue and writerQueue) and adding random sleeps to prove that the output is still in order:
from multiprocessing import Pool
import time
import random

def calc(val):
    time.sleep(random.random())
    return val

if __name__ == "__main__":
    considerperiod = [1, 2, 3, 4, 5, 6]
    with Pool(processes=2) as pool:
        print(pool.map(calc, considerperiod))
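Run as-is, this prints [1, 2, 3, 4, 5, 6] every time: pool.map blocks until all results are available and returns them in the order of the input iterable, regardless of which worker finishes first.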
