How to find out how long a search for files will take in Python? - python-3.x

So I have a little app that searches for all XML files on my PC, copying the files that have 44 digits as the filename to the "output" folder.
The problem is that the end user needs an indication of the progress and remaining time of the task.
This is the module to copy files:
xml_search.py
import os
import re
from threading import Thread
from datetime import datetime
import time
import shutil
import winsound

os.system('cls')

def get_drives():
    response = os.popen("wmic logicaldisk get caption")
    list1 = []
    t1 = datetime.now()
    for line in response.readlines():
        line = line.strip("\n")
        line = line.strip("\r")
        line = line.strip(" ")
        if (line == "Caption" or line == ""):
            continue
        list1.append(line + '\\')
    return list1

def search1(drive):
    for root, dir, files in os.walk(drive):
        for file in files:
            if re.match(r"\d{44}\.xml", file):
                filename = os.path.join(root, file)
                try:
                    shutil.copy(filename, os.path.join('output', file))
                except Exception as e:
                    pass

def exec_(callback):
    t1 = datetime.now()
    list2 = []  # empty list is created
    list1 = get_drives()
    for each in list1:
        process1 = Thread(target=search1, args=(each,))
        process1.start()
        list2.append(process1)
    for t in list2:
        t.join()  # Wait for the threads to finish
    t2 = datetime.now()
    total = str(t2 - t1)
    print(total, file=open('times.txt', 'a'), end="\n")
    for x in range(3):
        winsound.Beep(2000, 100)
        time.sleep(.1)
    callback()

if __name__ == "__main__":
    exec_(lambda: None)  # pass a no-op callback so the module can be run on its own

The code below uses the progressbar library and shows the progress and estimated remaining time of the task:
import progressbar
from time import sleep

bar = progressbar.ProgressBar(maxval=1120,
                              widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.ETA()])
bar.start()
for i in range(1120):
    bar.update(i + 1)
    sleep(0.1)
bar.finish()
You would need to adapt the code above to your own. In your case, count the number of files first, pass that count to the ProgressBar constructor's maxval argument, and remove the sleep call.
The suggested solution works straightforwardly with a single thread; if you insist on working with multiple threads, you will need to figure out where to initialize the progress bar and where to put the updates. A single-threaded sketch of this idea is shown below.
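For illustration only, here is a minimal single-threaded sketch of that idea. It reuses the 44-digit filename pattern from the question, while the count_matches helper, the max(total, 1) guard and the hard-coded 'C:\\' drive are assumptions, not part of the original answer.

import os
import re
import shutil
import progressbar

PATTERN = re.compile(r"\d{44}\.xml")

def count_matches(drive):
    # First pass: count matching files so we know maxval for the bar.
    total = 0
    for root, dirs, files in os.walk(drive):
        total += sum(1 for f in files if PATTERN.match(f))
    return total

def copy_with_progress(drive):
    total = count_matches(drive)
    bar = progressbar.ProgressBar(maxval=max(total, 1),
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.ETA()])
    bar.start()
    done = 0
    # Second pass: copy the files and advance the bar as we go.
    for root, dirs, files in os.walk(drive):
        for f in files:
            if PATTERN.match(f):
                try:
                    shutil.copy(os.path.join(root, f), os.path.join('output', f))
                except OSError:
                    pass
                done += 1
                bar.update(done)
    bar.finish()

copy_with_progress('C:\\')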

Try to implement a timer decorator like the following:
import time

def mytimer(func):
    def wrapper():
        t1 = time.time()
        result = func()
        t2 = time.time()
        print(f"The function {func.__name__} was run {t2 - t1} seconds")
        return result
    return wrapper

@mytimer
def TimeConsumingFunction():
    time.sleep(3)
    print("Hello timers")

TimeConsumingFunction()
Output:
/usr/bin/python3.7 /home/user/Documents/python-workspace/timers/example.py
Hello timers
The function TimeConsumingFunction was run 3.002610206604004 seconds
Process finished with exit code 0
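To time a function that takes arguments (such as exec_ from the question), the wrapper has to forward them. Here is a hedged variant of the decorator above; the *args/**kwargs forwarding and functools.wraps are my additions, not part of the original answer.

import functools
import time

def mytimer(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)   # forward any arguments to the wrapped function
        t2 = time.time()
        print(f"The function {func.__name__} took {t2 - t1:.3f} seconds")
        return result
    return wrapper

@mytimer
def add(a, b):
    time.sleep(0.5)
    return a + b

print(add(2, 3))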

Related

Call the same subprocess Python function several times

I need to process-parallelize some computations that are done several times, so the subprocess Python function has to stay alive between two calls.
In a perfect world I would need something like this:
class Computer:
    def __init__(self, x):
        self.x = x
        # Creation of quite heavy python objects that cannot be pickled !!

    def call(self, y):
        return self.x + y

process = Computer(4)  ## NEED MAGIC HERE to keep "call" alive in a subprocess !!
print(process.call(1))   # prints 5 (=4+1)
print(process.call(12))  # prints 16 (=4+12)
I can follow this answer and communicate via asyncio.subprocess.PIPE, but in my actual use case,
the call argument is a list of lists of integers
the call answer is a list of strings
so it would be nice to avoid serializing/deserializing the arguments and return values by hand.
Any ideas on how to keep the function call "alive" and ready to receive new calls?
Here is an answer, based on this one, but:
several subprocesses are created
each subprocess has its own identifier
their calls are parallelized
a small layer allows exchanging JSON instead of plain byte strings.
hello.py
#!/usr/bin/python3
# This is the task to be done.
# A task consists in receiving a json assumed to be
# {"vector": [...]}
# and returning a json with the length of the vector and
# the worker id.
import sys
import time
import json

ident = sys.argv[1]

while True:
    str_data = input()
    data = json.loads(str_data)

    command = data.get("command", None)
    if command == "quit":
        answer = {"comment": "I'm leaving",
                  "my id": ident}
        print(json.dumps(answer), end="\n")
        sys.exit(1)

    time.sleep(1)  # simulates 1s of heavy work
    answer = {"size": len(data['vector']),
              "my id": ident}
    print(json.dumps(answer), end="\n")
main.py
#!/usr/bin/python3
import json
from subprocess import Popen, PIPE
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
dprint = print
def create_proc(arg):
    cmd = ["./hello.py", arg]
    process = Popen(cmd, stdin=PIPE, stdout=PIPE)
    return process

def make_call(proc, arg):
    """Make the call in a thread."""
    str_arg = json.dumps(arg)
    txt = bytes(str_arg + '\n', encoding='utf8')
    proc.stdin.write(txt)
    proc.stdin.flush()
    b_ans = proc.stdout.readline()
    s_ans = b_ans.decode('utf8')
    j_ans = json.loads(s_ans)
    return j_ans

def search(executor, procs, data):
    jobs = [executor.submit(make_call, proc, data) for proc in procs]
    answer = []
    for job in concurrent.futures.as_completed(jobs):
        got_ans = job.result()
        answer.append(got_ans)
    return answer

def main():
    n_workers = 50
    idents = [f"{i}st" for i in range(0, n_workers)]
    executor = ThreadPoolExecutor(n_workers)

    # Create `n_workers` subprocesses waiting for data to work with.
    # The subprocesses are all different because they receive a different
    # "initialization" id.
    procs = [create_proc(ident) for ident in idents]

    data = {"vector": [1, 2, 23]}
    answers = search(executor, procs, data)  # takes 1s instead of 5 !
    for answer in answers:
        print(answer)

    search(executor, procs, {"command": "quit"})

main()
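To recover the Computer-style interface from the question, one could wrap a single worker in a small class on top of the helpers above. This wrapper is only a hedged illustration and is not part of the original answer.

class Computer:
    """Keeps one hello.py-style worker alive between calls (illustrative sketch)."""
    def __init__(self, ident):
        self.proc = create_proc(str(ident))

    def call(self, vector):
        # Same JSON-over-stdin protocol as make_call above.
        return make_call(self.proc, {"vector": vector})

    def close(self):
        return make_call(self.proc, {"command": "quit"})

# Usage:
# computer = Computer(4)
# print(computer.call([1, 2, 3]))   # e.g. {'size': 3, 'my id': '4'}
# computer.close()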

How do I create an IMAP checker that reads from a CSV file and loops over each line using multithreading?

The script only checks the first 10 lines of the CSV file; I want the script to iterate over all the lines of the file using threads to speed up the process.
Code:
import time
import csv
import imaplib
from threading import Thread

combo = []
FileToOpen = open("emails.csv", "r")
csvDictReader = csv.DictReader(FileToOpen)
successEmail = open("SuccessEmails.txt", "a")
for email in csvDictReader:
    combo.append(email)
rows_count = len(list(csvDictReader))
t1 = time.perf_counter()
combo_new = combo

def ConnectorImap(combo_new):
    for Email in combo_new:
        login = Email['login']
        password = Email['password']
        imap_serv = "imap." + login.split('#')[-1]
        mail_serv = "mail." + login.split('#')[-1]
        try:
            print(login,password,imap_serv)
            print('logging in as %s' % login)
            # create an IMAP4 class with SSL
            imap_ssl = imaplib.IMAP4_SSL(imap_serv)
            resp_code, response = imap_ssl.login(login,password)
            print(resp_code)
            if resp_code == "OK":
                successEmail.write(login + ',' + password)
                successEmail.write("\n")
            imap_ssl.logout()
        except Exception as e:
            print(e)
            pass

threads = []
for idx, line in enumerate(rows_count):
    # We start one thread per url present.
    process = Thread(target=ConnectorImap, args=(combo_new))
    process.start()
    threads.append(process)
for process in threads:
    process.join()

t2 = time.perf_counter()
print(f'finished in{t2 - t1} seconds')
I am new to Python, so any help would be appreciated! I want the script to iterate over all the lines of the file using threads to speed up the process.
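As a hedged sketch of the pattern being asked about (not an answer quoted from the original page), one way is to read all the CSV rows once and give each thread its own slice. The chunk_size arithmetic and the check_rows stub are illustrative names; the per-row IMAP check is deliberately left out.

import csv
from threading import Thread

def check_rows(rows):
    # Stub: replace with the per-row IMAP login check from the question.
    for row in rows:
        print("would check", row['login'])

def run_in_chunks(path, num_threads=10):
    with open(path, newline="") as f:
        rows = list(csv.DictReader(f))

    # Split the rows into roughly equal slices, one per thread.
    chunk_size = max(1, len(rows) // num_threads)
    chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]

    threads = [Thread(target=check_rows, args=(chunk,)) for chunk in chunks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

# run_in_chunks("emails.csv")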

Why is serial code faster than concurrent.futures in this case?

I am using the following code to process some pictures for my ML project and I would like to parallelize it.
import multiprocessing as mp
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some Value
    return ood

seqs = []
for seq in range(1, 10):  # len(seqs)+1):
    seq = txt + str(seq)
    seqs.append(seq)
    # serial call of the function
    track_ids(seq)

# parallel call of the function
with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
    ood_id = ex.map(track_ids, seqs)
If I run the code serially it takes 3.0 minutes, but in parallel with concurrent.futures it takes 3.5 minutes.
Can someone please explain why that is, and suggest a way to solve the problem?
BTW, I have 12 cores.
Thanks
Here's a brief example of how one might go about profiling multiprocessing code vs serial execution:
from multiprocessing import Pool
from cProfile import Profile
from pstats import Stats
import concurrent.futures

def track_ids(seq):
    '''The func is so big I can not put it here'''
    ood = {}
    for i in seq:
        # I load around 500 images and process them
        ood[i] = some Value
    return ood

def profile_seq():
    p = Profile()  # one and only profiler instance
    p.enable()
    seqs = []
    for seq in range(1, 10):  # len(seqs)+1):
        seq = txt + str(seq)
        seqs.append(seq)
        # serial call of the function
        track_ids(seq)
    p.disable()
    return Stats(p), seqs

def track_ids_pr(seq):
    p = Profile()  # profile the child tasks
    p.enable()
    retval = track_ids(seq)
    p.disable()
    return (Stats(p, stream="dummy"), retval)

def profile_parallel():
    p = Profile()  # profile stuff in the main process
    p.enable()
    with concurrent.futures.ProcessPoolExecutor(max_workers=mp.cpu_count()) as ex:
        retvals = ex.map(track_ids_pr, seqs)
    p.disable()
    s = Stats(p)
    out = []
    for ret in retvals:
        s.add(ret[0])
        out.append(ret[1])
    return s, out

if __name__ == "__main__":
    stat, retval = profile_parallel()
    stat.print_stats()
EDIT: Unfortunately, I found out that pstats.Stats objects cannot be passed through a multiprocessing.Queue as-is, because they are not picklable (which is needed for the operation of concurrent.futures). Evidently a Stats object normally stores a reference to a file for the purpose of writing statistics to that file, and if none is given, it grabs a reference to sys.stdout by default. We don't actually need that reference until we want to print out the statistics, so we can give it a temporary placeholder value to prevent the pickle error and then restore an appropriate value later. The following example should be copy-paste-able and run just fine, unlike the pseudocode-ish example above.
from multiprocessing import Queue, Process
from cProfile import Profile
from pstats import Stats
import sys

def isprime(x):
    for d in range(2, int(x**.5)):
        if x % d == 0:
            return False
    return True

def foo(retq):
    p = Profile()
    p.enable()
    primes = []
    max_n = 2**20
    for n in range(3, max_n):
        if isprime(n):
            primes.append(n)
    p.disable()
    retq.put(Stats(p, stream="dummy"))  # Dirty hack: set `stream` to something picklable then override later

if __name__ == "__main__":
    q = Queue()
    p1 = Process(target=foo, args=(q,))
    p1.start()
    p2 = Process(target=foo, args=(q,))
    p2.start()

    s1 = q.get()
    s1.stream = sys.stdout  # restore original file
    s2 = q.get()
    # s2.stream  # if we are just adding this `Stats` object to another, the `stream` just gets thrown away anyway.
    s1.add(s2)  # add up the stats from both child processes.
    s1.print_stats()  # s1.stream gets used here, but not before. If you provide a file to write to instead of sys.stdout, it will write to that file.

    p1.join()
    p2.join()

Python Multithreading Producer Consumer Pattern

I'm still learning how to code and these are my first attempts at multithreading.
I've read a bunch of multithreading articles. I thought these were very helpful:
Processing single file from multiple processes
Python module of the week: multiprocessing
Producer-consumer problem in Python
Multiprocessing
There's quite a lot to think about. Especially for a beginner.
Unfortunately, when I try to put this information into practice my code isn't quite working.
The idea behind this code is to read simplified.txt, which contains lines of comma-delimited numbers, e.g. 0.275,0.28,0.275,0.275,36078.
The producer thread reads each line and strips the newline character from the end of the line. Each number in the line is then split out and assigned to a variable.
Variable1 is then placed into the queue.
The consumer thread picks up items from the queue, squares them, then adds an entry to the log file.
The code I am using comes from this template. This is the code I have so far:
import threading
import queue
import time
import logging
import random
import sys

read_file = 'C:/temp/temp1/simplified.txt'
log1 = open('C:/temp/temp1/simplified_log1.txt', "a+")
logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-9s) %(message)s',)
BUF_SIZE = 10
q = queue.Queue(BUF_SIZE)

class ProducerThread(threading.Thread):
    def __init__(self, name, read_file):
        super(ProducerThread, self).__init__()
        self.name = name
        self.read_file = read_file

    def run(self, read_file):
        while True:
            if not q.full():
                with open(read_file, 'r') as f:
                    for line in f:
                        stripped = line.strip('\n\r')
                        value1,value2,value3,value4,value5,value6,value7 = stripped.split(',')
                        q.put(value1)
                        logging.debug('Putting ' + str(value1) + ' : ' + str(q.qsize()) + ' items in queue')
                        time.sleep(random.random())
        return

class ConsumerThread(threading.Thread):
    def __init__(self, name, value1, log1):
        super(ConsumerThread, self).__init__()
        self.name = name
        self.value1 = value1
        self.log1 = log1
        return

    def run(self):
        while True:
            if not q.empty():
                value1 = q.get()
                sqr_value1 = value1 * value1
                log1.write("The square of " + str(value1) + " is " + str(sqr_value1))
                logging.debug('Getting ' + str(value1) + ' : ' + str(q.qsize()) + ' items in queue')
                time.sleep(random.random())
        return

if __name__ == '__main__':
    p = ProducerThread(name='producer')
    c = ConsumerThread(name='consumer')
    p.start()
    time.sleep(2)
    c.start()
    time.sleep(2)
When I run the code, I get this error:
Traceback (most recent call last):
File "c:/Scripta/A_Simplified_Producer_Consumer_Queue_v0.1.py", line 60, in <module>
p = ProducerThread(name='producer')
TypeError: __init__() missing 1 required positional argument: 'read_file'
I don't know where else I need to add read_file.
Any help would be greatly appreciated. Thanks in advance.
Your ProducerThread class requires two parameters (name and read_file) as arguments to its constructor, as defined in its __init__ method, but you only provide the first of them when you create an instance in your main block. You have the same problem with your second class.
You should either provide read_file to the constructors when creating instances, or just remove it from the constructor signature since you don't appear to use it anyway (you use the read_file passed into the run function, but I don't think that is correct). It seems like you're attempting to override that method from the Thread superclass, and I doubt it takes such a parameter. Both options are sketched below.
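For illustration, a hedged sketch of the two options just described; the print bodies are placeholders, not the asker's actual file handling.

import threading

read_file = 'C:/temp/temp1/simplified.txt'  # path from the question

# Option 1: pass read_file to the constructor and keep it on self.
class ProducerThread(threading.Thread):
    def __init__(self, name, read_file):
        super().__init__()
        self.name = name
        self.read_file = read_file

    def run(self):  # run() takes no extra arguments
        print("would read", self.read_file)

# Option 2: drop read_file from the constructor and use the module-level
# variable inside run(), which is what the working version below ends up doing.
class SimpleProducerThread(threading.Thread):
    def __init__(self, name):
        super().__init__()
        self.name = name

    def run(self):
        print("would read", read_file)

p1 = ProducerThread(name='producer', read_file=read_file)
p2 = SimpleProducerThread(name='producer')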
Thank you userSeventeen for setting me on the right path.
I thought that in order to use outside variables I needed to place them in the __init__ method and then again in the run method. You've clarified that I only needed to use the variables in the run methods.
This is the working code. I had to remove the while True: statement, as I did not want the code to run forever.
import threading
import queue
import time
import logging
import random
import sys
import os

read_file = 'C:/temp/temp1/simplified.txt'
log1 = open('C:/temp/temp1/simplified_log1.txt', "a+")
logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-9s) %(message)s',)
BUF_SIZE = 10
q = queue.Queue(BUF_SIZE)

class ProducerThread(threading.Thread):
    def __init__(self, name):
        super(ProducerThread, self).__init__()
        self.name = name

    def run(self):
        with open(read_file, 'r') as f:
            for line in f:
                stripped = line.strip('\n\r')
                value1,value2,value3,value4,value5 = stripped.split(',')
                float_value1 = float(value1)
                if not q.full():
                    q.put(float_value1)
                    logging.debug('Putting ' + str(float_value1) + ' : ' + str(q.qsize()) + ' items in queue')
                    time.sleep(random.random())
        return

class ConsumerThread(threading.Thread):
    def __init__(self, name):
        super(ConsumerThread, self).__init__()
        self.name = name
        return

    def run(self):
        while not q.empty():
            float_value1 = q.get()
            sqr_value1 = float_value1 * float_value1
            log1.write("The square of " + str(float_value1) + " is " + str(sqr_value1))
            logging.debug('Getting ' + str(float_value1) + ' : ' + str(q.qsize()) + ' items in queue')
            time.sleep(random.random())
        return

if __name__ == '__main__':
    p = ProducerThread(name='producer')
    c = ConsumerThread(name='consumer')
    p.start()
    time.sleep(2)
    c.start()
    time.sleep(2)
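Not part of the asker's solution, but a common variant of this pattern: instead of relying on time.sleep(2) and q.empty() to decide when the consumer is done, the producer can push a sentinel value that tells the consumer to stop. A hedged sketch, reusing the file paths from the question:

import queue
import threading

q = queue.Queue(10)
SENTINEL = None  # marks the end of the data

def producer(path):
    with open(path) as f:
        for line in f:
            value1 = float(line.split(',')[0])
            q.put(value1)
    q.put(SENTINEL)  # tell the consumer there is nothing more to come

def consumer(log_path):
    with open(log_path, 'a') as log:
        while True:
            item = q.get()
            if item is SENTINEL:
                break
            log.write("The square of %s is %s\n" % (item, item * item))

p = threading.Thread(target=producer, args=('C:/temp/temp1/simplified.txt',))
c = threading.Thread(target=consumer, args=('C:/temp/temp1/simplified_log1.txt',))
p.start()
c.start()
p.join()
c.join()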

Python watchdog module duplicate events (edit: was not a watchdog issue)

I am creating a Python script that will identify changes to a log file and print some data from the new logs.
I use watchdog to create an event handler, and everything seems to work fine except that I get duplicate events every time I modify the file. I checked creation and deletion; they both work as expected and trigger once.
I have read the similar question which explains getting a created and a modified event when saving a file, but that is not my case: I just get two modification events.
Here is my code:
import os, sys, time
import subprocess
import threading
import win32print
from tkinter import filedialog
from tkinter import *
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class Handler(FileSystemEventHandler):
    # docstring for FileSystemEventHandler
    def __init__(self, observer, filename, dirname):
        # super(Handler, FileSystemEventHandler).__init__(self,)
        self.observer = observer
        self.filename = filename
        self.dirname = dirname
        print("Handler filename = ", self.filename)
        print("Handler dirname = ", self.dirname)

    def on_modified(self, event):
        if self.filename == event.src_path:
            print("The file was modified")
            print(event.src_path)
            # go get the last line and print the data
            # try:
            #     hJob = win32print.StartDocPrinter(hPrinter, 1, ("test of raw data", None, "RAW"))
            #     try:
            #         win32print.StartPagePrinter(hPrinter)
            #         win32print.WritePrinter(hPrinter, raw_data)
            #         win32print.EndPagePrinter(hPrinter)
            #     finally:
            #         win32print.EndDocPrinter(hPrinter)
            # finally:
            #     win32print.ClosePrinter(hPrinter)

    def on_created(self, event):
        print("A file was created (", event.src_path, ")")

    def on_deleted(self, event):
        print("A file was deleted (", event.src_path, ")")

if __name__ == "__main__":
    Flags = 2
    Name = None
    Level = 1
    printers = win32print.EnumPrinters(Flags, Name, Level)
    print("\nChoose a printer to use:")
    i = 1
    for p in printers:
        print(i, ')', p[2])
        i = i + 1
    if sys.version_info >= (3,):
        raw_data = bytes("This is a test", "utf-8")
    else:
        raw_data = "This is a test"
    printer = int(input())
    printer_name = printers[printer-1][2]  # win32print.GetDefaultPrinter()
    print("You chose ", printer_name, "\nI will now print from the specified file with this printer")
    hPrinter = win32print.OpenPrinter(printer_name)
    # root = Tk()
    # root.filename = filedialog.askopenfilename(initialdir = "/Desktop",title = "Select file",filetypes = (("log files","*.log"),("all files","*.*")))
    file_path = "some_file_path"  # root.filename
    file_directory = os.path.dirname(file_path)
    # print (file_path)
    print(file_directory)
    observer = Observer()
    event_handler = Handler(observer, file_path, file_directory)
    observer.schedule(event_handler, path=file_directory, recursive=False)
    observer.start()
    observer.join()
Any ideas would be appreciated.
EDIT:
After some debugging I found out that Windows 10 changes the file modification time twice every time I save the file.
The proof-of-concept code is this:
prev_modification_time = os.path.getmtime(file_path)
while True:
    current_mod_time = os.path.getmtime(file_path)
    if prev_modification_time != current_mod_time:
        print("the file was modified, last modification time is: ", current_mod_time)
        prev_modification_time = current_mod_time
    pass
Final edit:
After testing my code on Linux (Debian Stretch, to be exact) it worked like a charm. Combined with the previous edit, this probably shows that watchdog works fine and that it is Windows 10 that has the issue. Should I post that as a different question or here?
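As a hedged workaround sketch (not from the original post): since Windows appears to fire two modification events per save, the handler can simply ignore events that arrive within a short window of the previous one. The 0.5-second threshold is an arbitrary illustration.

import time
from watchdog.events import FileSystemEventHandler

class DebouncedHandler(FileSystemEventHandler):
    """Ignores on_modified events that fire within min_interval seconds of the last one."""
    def __init__(self, filename, min_interval=0.5):
        self.filename = filename
        self.min_interval = min_interval
        self._last_handled = 0.0

    def on_modified(self, event):
        if event.src_path != self.filename:
            return
        now = time.monotonic()
        if now - self._last_handled < self.min_interval:
            return  # duplicate event from the same save; skip it
        self._last_handled = now
        print("The file was modified:", event.src_path)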
