Process multiple onCreated events in parallel with Python watchdog - python-3.x

I am trying to detect whether any new files are created in a directory; if one is created, I want to process it (which takes about 10 minutes to produce output), and in the meantime other new files may also be created in the folder.
How do I register watchdog's on_created handler with multiprocessing so that, instead of waiting for one file to finish, it spawns a new process every time a file is created?
import time
import datetime
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
import multiprocessing as mp

def doProcessing(filename):
    # defined before the observer loop so on_created can actually reach it
    print("Processing")

def on_created(event):
    print(f"hey, {event.src_path} has been created!")
    time.sleep(10)
    doProcessing(event.src_path)
    print(f"hey for {event.src_path}")

if __name__ == "__main__":
    patterns = "*"
    ignore_patterns = ""
    ignore_directories = False
    case_sensitive = True
    my_event_handler = PatternMatchingEventHandler(patterns, ignore_patterns, ignore_directories, case_sensitive)
    my_event_handler.on_created = on_created
    path = r"D:\watcher"
    go_recursively = True
    my_observer = Observer()
    my_observer.schedule(my_event_handler, path, recursive=go_recursively)
    my_observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        my_observer.stop()
    my_observer.join()

In essence, pool.apply_async(print_func, (event,)) is what solved the problem: the watchdog handler pushes each created-file event onto a queue, and the process_load_queue worker pulls events off that queue and runs print_func asynchronously in a separate process, so handling one file never blocks the next.
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 21 22:02:55 2019
@author: 1009758
"""
import time
import datetime
import threading
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler
import multiprocessing as mp
from multiprocessing import Queue, Pool

PROCESSES = mp.cpu_count() - 1
NUMBER_OF_TASKS = 10

class FileLoaderWatchdog(PatternMatchingEventHandler):
    ''' Watches a nominated directory and, when a file matching the
    patterns is created, pushes the event onto a queue.
    '''
    def __init__(self, queue, patterns):
        PatternMatchingEventHandler.__init__(self, patterns=patterns)
        self.queue = queue

    def process(self, event):
        '''
        event.event_type
            'modified' | 'created' | 'moved' | 'deleted'
        event.is_directory
            True | False
        event.src_path
            path/to/observed/file
        '''
        self.queue.put(event)

    def on_created(self, event):
        self.process(event)
        now = datetime.datetime.utcnow()
        print("{0} -- event {1} placed on the queue ...".format(
            now.strftime("%Y/%m/%d %H:%M:%S"), event.src_path))

def print_func(event):
    # Stand-in for the real (long-running) processing of the new file.
    time.sleep(5)
    now = datetime.datetime.utcnow()
    print("{0} -- Pulling {1} off the queue ...".format(
        now.strftime("%Y/%m/%d %H:%M:%S"), event.src_path))

def process_load_queue(q):
    '''This is the worker thread function. It runs as a daemon
    thread and only exits when the main thread ends.

    Args
    ==========
    q: Queue() object
    '''
    while True:
        if not q.empty():
            event = q.get()
            # Hand the event off to a separate process so the next
            # event can be picked up without waiting for this one.
            pool = Pool(processes=1)
            pool.apply_async(print_func, (event,))
        else:
            time.sleep(1)

if __name__ == '__main__':
    # Create the queue shared between the watchdog handler and the worker.
    watchdog_queue = Queue()

    # Set up watchdog to monitor the directory for trigger files.
    patt = ["*"]
    path_watch = r"D:\watcher"
    event_handler = FileLoaderWatchdog(watchdog_queue, patterns=patt)
    observer = Observer()
    observer.schedule(event_handler, path=path_watch)
    observer.start()

    # Set up a worker thread to drain the queue.
    worker = threading.Thread(target=process_load_queue, args=(watchdog_queue,))
    worker.daemon = True
    worker.start()

    try:
        while True:
            time.sleep(2)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
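For comparison, here is an alternative sketch (not from the answer above) that skips the intermediate queue and hands each created file straight to a process pool; doProcessing is the question's placeholder for the 10-minute job, and the pool size of 4 is an arbitrary choice:
import time
from concurrent.futures import ProcessPoolExecutor
from watchdog.observers import Observer
from watchdog.events import PatternMatchingEventHandler

def doProcessing(path):
    # placeholder for the long-running (~10 minute) job
    print(f"processing {path}")

class PoolHandler(PatternMatchingEventHandler):
    def __init__(self, executor, patterns):
        super().__init__(patterns=patterns)
        self.executor = executor

    def on_created(self, event):
        # submit() returns immediately, so the observer thread is never blocked
        self.executor.submit(doProcessing, event.src_path)

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=4) as executor:
        observer = Observer()
        observer.schedule(PoolHandler(executor, ["*"]), r"D:\watcher", recursive=True)
        observer.start()
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
        observer.join()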

Related

Shutdown during recv on python socket

During the execution of this code, it blocks on the join.
I have a TCP server running on ("127.0.0.1", 1777) for the test.
I tried using the socket directly with recv, but the result is the same.
Any idea why the shutdown on READ doesn't interrupt the read?
import socket
from threading import Thread
from time import sleep

class Parser(Thread):
    rbufsize = 4096
    wbufsize = 4096
    encoding = "utf-8"
    new_line = "\n"

    def __init__(self):
        super().__init__()
        self._socket = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM)
        self._wfile = None
        self._rfile = None

    def run(self):
        self._socket.connect(("127.0.0.1", 1777))
        self._rfile = self._socket.makefile('rb', self.rbufsize, encoding=self.encoding, newline=self.new_line)
        self._wfile = self._socket.makefile('wb', self.wbufsize, encoding=self.encoding, newline=self.new_line)
        while True:
            data = self._rfile.readline()
            if not data:
                break
            self._handle_data(data)
        self._cleanup()

    def _cleanup(self):
        """
        Close everything down.
        """
        if not self._wfile.closed:
            try:
                self._wfile.flush()
            except socket.error:
                # A final socket error may have occurred here, such as
                # the local error ECONNABORTED.
                pass
        self._socket.shutdown(socket.SHUT_RDWR)
        self._wfile.close()
        self._rfile.close()
        self._socket.close()

    def stop(self):
        self._socket.shutdown(socket.SHUT_RD)

if __name__ == "__main__":
    p = Parser()
    p.start()
    sleep(5)
    p.stop()
    print("start join")
    p.join()
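Not an answer to the shutdown question itself, but a minimal sketch of one common workaround, assuming a 1-second poll is acceptable: give the socket a timeout and use a stop flag, so the reader thread wakes up periodically instead of relying on shutdown() to interrupt a blocking read. The TimeoutParser name and the line handling are illustrative only.
import socket
import threading

class TimeoutParser(threading.Thread):
    # Hypothetical variant of the Parser above: a socket timeout plus a stop
    # flag lets the read loop exit without needing shutdown() to interrupt it.
    def __init__(self, addr=("127.0.0.1", 1777)):
        super().__init__()
        self._addr = addr
        self._stop_event = threading.Event()
        self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    def run(self):
        self._socket.connect(self._addr)
        self._socket.settimeout(1.0)     # recv() now raises socket.timeout periodically
        buffer = b""
        while not self._stop_event.is_set():
            try:
                chunk = self._socket.recv(4096)
            except socket.timeout:
                continue                 # woke up, re-check the stop flag
            if not chunk:
                break                    # peer closed the connection
            buffer += chunk
            while b"\n" in buffer:
                line, buffer = buffer.split(b"\n", 1)
                print("got line:", line)
        self._socket.close()

    def stop(self):
        self._stop_event.set()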

Multiprocessing script doesn't finish all tasks, and I also get 100% CPU?

I need to ask whether part of my script is correct. It seems to work, but I think something is really wrong, because the CPU is still at 100% and many times it doesn't finish all the tasks; after 50/100 tasks it appears frozen.
Any info on how to fix it? Or maybe just tell me where the error is?
Thank you.
PS: I have included all the modules the script requires, and only the part that should be of interest for multiprocessing, i.e. just the first part of the script.
Many thanks.
from __future__ import print_function
import sys
import os
import easygui
import pyautogui as py
import datetime
import pwinput
import json
from collections import Counter
import random
import string
import threading
import subprocess
import multiprocessing
import queue
from multiprocessing import cpu_count
from multiprocessing import Value, Lock, Process, Queue, current_process
import numpy as np
import grequests
import requests
from requests.exceptions import ConnectionError
from requests.exceptions import HTTPError
import time
from time import sleep

number_of_processes = cpu_count()

class Counter(object):
    def __init__(self, initval=0):
        self.val = Value('i', initval)
        self.lock = Lock()

    def increment(self):
        with self.lock:
            self.val.value += 1

    def value(self):
        with self.lock:
            return self.val.value

def updateTitle(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username):
    while True:
        hits = int(counterhits.value())
        done = int(counterdone.value())
        shtot = int(countersl.value())
        maitot = int(countml.value())
        remain_scan = number_of_task - hits
        elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - start))
        ctypes.windll.kernel32.SetConsoleTitleW(f'Site Valid For: {number_of_task} | Started: {hits} | Complete: {done} | Remain: {remain_scan} | SL Found: {shtot} | ML Found: {maitot} | Threads: {number_of_processes} | Time elapsed: {elapsed} ! Licensed at: {username}')
        sleep(0.3)

def worker_main(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml):
    while True:
        try:
            site = tasks_to_do.get_nowait()
            if site is None:
                break
        except Queue.Empty:
            break
        except Queue.Full:
            sleep(0.5)
            continue
        counterhits.increment()
        do_work(site, counterhits, counterdone, countersl, countml)
        tasks_finished.put(site + current_process().name)
        counterdone.increment()
    return True

def main():
    global username
    number_of_task = int(len(filter_data))
    counterhits = Counter(0)
    counterdone = Counter(0)
    countersl = Counter(0)
    countml = Counter(0)
    tasks_to_do = Queue()
    tasks_finished = Queue()
    processes1 = []
    prefix = ['http://']
    # creating processes
    for w in range(number_of_processes):
        p1 = Process(target=worker_main, args=(tasks_to_do, tasks_finished, counterhits, counterdone, countersl, countml))
        processes1.append(p1)
        p1.start()
    procs = [Process(target=updateTitle, args=(number_of_processes, number_of_task, counterhits, counterdone, countersl, countml, username), daemon=True) for i in range(1)]
    for p in procs:
        p.start()
    for site_il in filter_data:
        site_or = site_il.rstrip("\n")
        if site_or.startswith("http://"):
            site_or = site_or.replace("http://", "")
        elif site_or.startswith("https://"):
            site_or = site_or.replace("https://", "")
        site_or = site_or.rstrip()
        site_or = site_or.split('/')[0]
        if 'www.' in site_or:
            site_or = site_or.replace("www.", "")
        sitexx = [sub + site_or for sub in prefix]
        for site in sitexx:
            tasks_to_do.put(site)
    # completing process
    for p1 in processes1:
        p1.join()
    for p in procs:
        p.join()
    # print the output
    while not tasks_finished.empty():
        print(tasks_finished.get())
    os.system('pause>nul')
    return True

if __name__ == '__main__':
    if sys.platform.startswith('win'):
        # On Windows calling this function is necessary.
        multiprocessing.freeze_support()
    main()
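Not a diagnosis of the script above, but a small self-contained sketch of the producer/worker pattern it follows: queue the tasks first, add one None sentinel per worker so each worker exits cleanly, and catch Empty from the standard queue module (the example sites and the 4-worker count are made up):
import multiprocessing
import queue
import time

def worker(tasks_to_do, tasks_finished):
    # Exit on the None sentinel; the Empty exception lives in the stdlib
    # queue module, not on multiprocessing.Queue.
    while True:
        try:
            site = tasks_to_do.get(timeout=1)
        except queue.Empty:
            continue
        if site is None:
            break
        time.sleep(0.1)                   # stand-in for do_work(site, ...)
        tasks_finished.put(site)

if __name__ == '__main__':
    tasks_to_do = multiprocessing.Queue()
    tasks_finished = multiprocessing.Queue()
    sites = [f"http://example{i}.com" for i in range(20)]   # made-up task list
    for site in sites:
        tasks_to_do.put(site)
    workers = [multiprocessing.Process(target=worker, args=(tasks_to_do, tasks_finished))
               for _ in range(4)]
    for w in workers:
        tasks_to_do.put(None)             # one sentinel per worker
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    while not tasks_finished.empty():
        print(tasks_finished.get())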

Producer Consumer message sharing not working in multiprocessing

I am trying to run a scenario where I have a producer that captures frames from a webcam and puts them in a queue,
and a consumer that reads images from the incoming queue, does some processing, and puts the output image in an outgoing queue.
The issue is that the consumer's read from the queue is not blocking. Ideally it should be; also, when it reads a value from the queue, the size is always a constant 128, which is wrong. I am sure the size of the image I am putting in the queue is far greater.
from __future__ import print_function
import multiprocessing
import time
import logging
import sys
import cv2

class Consumer(multiprocessing.Process):
    def __init__(self, incoming_q, outgoing_q):
        multiprocessing.Process.__init__(self)
        self.outgoing_q = outgoing_q
        self.incoming_q = incoming_q

    def run(self):
        proc_name = self.name
        print(f"{proc_name} - inside process_feed..starting")
        while True:
            try:
                image_np = self.incoming_q.get(True)
                size_of_img = sys.getsizeof(image_np)
                if size_of_img > 128:
                    print(f"{proc_name} - size image=>{size_of_img}")
                    time.sleep(1)
                    self.outgoing_q.put_nowait(image_np)
            except:
                pass
        print("inside process_feed..ending")

class Producer(multiprocessing.Process):
    def __init__(self, incoming_q, outgoing_q):
        multiprocessing.Process.__init__(self)
        self.incoming_q = incoming_q
        self.outgoing_q = outgoing_q

    def run(self):
        proc_name = self.name
        print("inside capture_feed")
        stream = cv2.VideoCapture(0)
        try:
            counter = 0
            while True:
                counter += 1
                if counter == 1:
                    if not self.incoming_q.full():
                        (grabbed, image_np) = stream.read()
                        size_of_img = sys.getsizeof(image_np)
                        print(f"{proc_name}........B.......=>{self.incoming_q.qsize()}")
                        print(f"{proc_name} - size image=>{size_of_img}")
                        self.incoming_q.put(image_np)
                        print(f"{proc_name}........A.......=>{self.incoming_q.qsize()}")
                    counter = 0
                try:
                    image_np = self.outgoing_q.get_nowait()
                    logging.info("reading value for o/p")
                    cv2.imshow('object detection', image_np)
                except:
                    pass
                if cv2.waitKey(25) & 0xFF == ord('q'):
                    break
        finally:
            stream.release()
            cv2.destroyAllWindows()
            print("inside capture_feed..ending")

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    stream = cv2.VideoCapture(0)
    incoming_q = multiprocessing.Queue(maxsize=100)
    outgoing_q = multiprocessing.Queue(maxsize=100)
    logging.info("before start of thread")
    max_process = 1
    processes = []
    processes.append(Producer(incoming_q, outgoing_q))
    for i in range(max_process):
        p = Consumer(incoming_q, outgoing_q)
        p.daemon = True
        processes.append(p)
    logging.info("inside main thread..middle")
    for p in processes:
        p.start()
    logging.info("inside main thread..ending")
    logging.info("waiting in main thread too....")
    logging.info("waiting in main thread finished....")
    for p in processes:
        p.join()
    logging.info("inside main thread..ended")
I was able to figure out the issue with my approach: I had missed the whole concept of pickling (serialization).
I changed my code to serialize the numpy array before writing it to the queue and deserialize it after reading, and the code started working as expected.
Also, printing 128 as the size of the np array is fine; I was misinterpreting that number.
import pickle
import numpy as np

def serialize_ndarray(arr: np.ndarray):
    serialized = pickle.dumps(arr)
    return serialized

def deserialize_ndarray(string):
    data = pickle.loads(string)
    return data
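For illustration, a minimal round-trip sketch of that idea, serializing before put and deserializing after get (the 640x480 frame is a made-up stand-in for a webcam image):
import pickle
import multiprocessing
import numpy as np

q = multiprocessing.Queue(maxsize=100)
frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for a captured frame
q.put(pickle.dumps(frame))                       # producer side: serialize, then put
restored = pickle.loads(q.get(True))             # consumer side: blocking get, then deserialize
print(restored.shape, restored.dtype)            # (480, 640, 3) uint8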

How to use watchdog and do something in the main thread if a folder is modified?

I was looking for a watchdog and I found this great library. I need to fit a DBSCAN model when a file is created in a folder. Joblib is used in scikit-learn's DBSCAN implementation, and joblib doesn't allow multiprocessing if the DBSCAN code is not running in the main thread. If I use watchdog, the DBSCAN code can't run in the main thread. How can I solve this issue? Below you can find the watchdog script and a simple function to test it. When I run main_watchdog.py and add a file to the folder the watchdog is watching, it runs simple_function.py in Thread-1; in the meantime, main_watchdog.py runs in the MainThread.
PS: A solution could be starting a subprocess every time simple_function.py is called, but I am afraid this may cause issues if multiple files are created in the watchdog folder. Imagine receiving 10, 100, or 10000 files at once...
#main_watchdog.py
import time
import logging
import threading
from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler
from a_function import simple_function

class Event(LoggingEventHandler):
    def on_created(self, event):
        simple_function(x)

    def on_modified(self, event):
        simple_function(x)

if __name__ == "__main__":
    x = 1
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    # path = sys.argv[1] if len(sys.argv) > 1 else '.'
    path = '/path/to/watch/the/folder'
    event_handler = Event()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(1)
            print(threading.current_thread().name)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

#a_function.py
import threading

def simple_function(x):
    x += 1
    print(threading.current_thread().name)
    print(x)
If I understand this problem correctly, you need your business logic to run in the main thread and the observer to run in a background thread.
This can easily be solved by running the observer in a background thread using the threading library, and then passing the values of those events to your function call by way of a Queue, which is a way of communicating between threads.
#main_watchdog.py
import time
import logging
import threading
import sys
from queue import Queue
from watchdog.observers import Observer
from watchdog.events import LoggingEventHandler
from a_function import simple_function

q = Queue()
x = 1

class Event(LoggingEventHandler):
    def on_created(self, event):
        q.put(x)

    def on_modified(self, event):
        q.put(x)

def run_observer():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')
    path = sys.argv[1] if len(sys.argv) > 1 else '.'
    # path = '/path/to/watch/the/folder'
    event_handler = Event()
    observer = Observer()
    observer.schedule(event_handler, path, recursive=False)
    observer.start()
    while True:
        time.sleep(1)
        print(threading.current_thread().name)

if __name__ == "__main__":
    background_thread = threading.Thread(target=run_observer, args=())
    background_thread.daemon = True
    background_thread.start()
    print('Business logic')
    while True:
        val = q.get(True)
        simple_function(val)
The other function can remain the same.

Mocking REST APIs with Flask_restful using threading

I'm looking to mock a set of REST APIs for some tests. The following main() function works fine (i.e. it returns {"some-data": 1234} as JSON to the browser when I GET localhost:8099). The issue is that it blocks the main thread:
from gevent import monkey, sleep, pywsgi
monkey.patch_all()
import flask
from flask_restful import reqparse, abort, Api, Resource
import queue
import sys
import threading

STUFFS = {"some-data": 1234}

class Stuff(Resource):
    def get(self):
        return flask.jsonify(STUFFS)

class ControlThread(threading.Thread):
    def __init__(self, http_server, stop_event):
        threading.Thread.__init__(self)
        self.stop_event = stop_event
        self.http_server = http_server
        self.running = False

    def run(self):
        try:
            while not self.stop_event.is_set():
                if not self.running:
                    self.http_server.start()
                    self.running = True
                sleep(0.001)
        except (KeyboardInterrupt, SystemExit):
            pass
        self.http_server.stop()

class StuffMock:
    def __init__(self, port, name=None):
        if name is None:
            name = __name__
        self.app = flask.Flask(name)
        self.api = Api(self.app)
        self.api.add_resource(Stuff, "/stuff/")
        self.stop_event = threading.Event()
        self.http_server = pywsgi.WSGIServer(('', port), self.app)
        self.serving_thread = ControlThread(self.http_server,
                                            self.stop_event)
        self.serving_thread.daemon = True

    def start(self):
        self.serving_thread.start()

    def stop(self):
        self.stop_event.set()
        self.serving_thread.join()

def main():
    mocker = StuffMock(8099)
    mocker.start()
    try:
        while True:
            sleep(0.01)
    except (KeyboardInterrupt, SystemExit):
        mocker.stop()
        sys.exit()

if __name__ == "__main__":
    main()
Without the sleep() call in the while loop above, nothing resolves. Here is a more succinct usage example to demonstrate:
import time
from stuff_mock import StuffMock

mocker = StuffMock(8099)
mocker.start()
while True:
    user_text = input("let's do some work on the main thread: ")
    # will only resolve the GET request after user input
    # (i.e. when the main thread executes this sleep call)
    time.sleep(0.1)
    if user_text == "q":
        break
mocker.stop()
The gevent threading module seems to work differently from the core one. Does anyone have any tips or ideas about what's going on under the hood?
I found that if I switch out threading for multiprocessing (and threading.Thread for multiprocessing.Process), everything works as expected, and I can spin up arbitrary numbers of mockers without blocking.
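A minimal sketch of that multiprocessing variant, assuming the same Flask/gevent setup as the question; run_server and its internals are illustrative, not the original StuffMock code:
import multiprocessing

def run_server(port):
    # Build the Flask app and gevent WSGI server inside the child process,
    # so the gevent event loop never touches the main process.
    from gevent import pywsgi
    import flask
    from flask_restful import Api, Resource

    stuffs = {"some-data": 1234}

    class Stuff(Resource):
        def get(self):
            return flask.jsonify(stuffs)

    app = flask.Flask(__name__)
    api = Api(app)
    api.add_resource(Stuff, "/stuff/")
    pywsgi.WSGIServer(('', port), app).serve_forever()

if __name__ == "__main__":
    server = multiprocessing.Process(target=run_server, args=(8099,), daemon=True)
    server.start()
    # The main thread is now free to block however it likes.
    while True:
        user_text = input("let's do some work on the main thread: ")
        if user_text == "q":
            break
    server.terminate()
    server.join()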
