Multiprocessing queue of files in folders - python-3.x

The question is how to properly process files with Python 3.7 multiprocessing while crawling directories recursively.
My code is as follows:
def f(directoryout, directoryoutfailed, datafile, filelist_failed, imagefile, rootpath, extension, debug):
    ...  # [...] some processing

if __name__ == '__main__':
    import csv
    import os
    from multiprocessing import Pool

    debug = 0
    timeout = 20
    if debug == 0:
        folder = '/home/debian/Desktop/environments/dedpul/files/fp/'
        datafile = 'fpdata.csv'                  # file with results
        directoryout = 'fp_out'                  # out directory for debugging
        directoryoutfailed = 'fp_out_failed'     # out directory for wrongly processed files (debug mode)
        filelist = 'filelist.csv'                # list of processed files
        filelist_failed = 'filelist_failed.csv'  # list of wrongly processed files

    counter = 0
    pool = Pool(processes=4)
    for root, subFolders, files in os.walk(folder):
        for imagefile in files:
            rootpath = root + '/'
            fullpath = root + '/' + imagefile
            extension = os.path.splitext(imagefile)[1]
            imagefilesplit = imagefile.split('.')[0]
            counter += 1
            print('\033[93m ## ', counter, ' ## \033[0m', rootpath)

            # skip files that are already listed as processed
            fileexist = 0
            with open(filelist) as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',')
                for row in csv_reader:
                    if row[0] == fullpath:
                        fileexist = 1
            if fileexist == 1:
                print(' File was processed, skipping...')
                continue

            # record the file as processed
            with open(filelist, mode='a') as csv_file:
                writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                writer.writerow([fullpath])

            # print(directoryout, directoryoutfailed, datafile, filelist_failed, imagefile, rootpath, extension, debug)
            res = pool.apply(f, (directoryout, directoryoutfailed, datafile, filelist_failed, imagefile, rootpath, extension, debug))
    pool.close()
    pool.join()
First, when I use pool.apply_async it uses all cores, but it doesn't process function f() correctly. With pool.apply() it works, but single-threaded.
Second, as you can see, I'm recursively crawling the list of files in folders in a loop. If a file has already been processed, the loop should continue. Should that check stay in __main__, or should it be moved into f()? If it moves, how do I share which files are currently being processed, given that each file takes a few seconds?
Third, function f() is independent: it processes an image file, appends the results to fpdata.csv (or appends the name of a badly processed file to filelist_failed.csv) and finishes without problems, so no return value is needed. I just need to start this function via multiprocessing.
What am I doing wrong? Should I use a
with Pool(processes=4) as pool:
statement?
Before asking I browsed plenty of answers, as well as the Python manual, but I could not find an example of this kind of file processing.
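For reference, pool.apply() blocks until the worker returns, which is why it behaves single-threaded, while pool.apply_async() returns an AsyncResult immediately; those handles have to be kept and get() called on them, otherwise any exception raised inside f() is silently lost. A minimal sketch of that pattern, reusing the variables defined in the snippet above (not a drop-in replacement):

from multiprocessing import Pool

if __name__ == '__main__':
    async_results = []
    with Pool(processes=4) as pool:
        for root, subFolders, files in os.walk(folder):
            for imagefile in files:
                # ... same skip / bookkeeping logic as above ...
                args = (directoryout, directoryoutfailed, datafile, filelist_failed,
                        imagefile, root + '/', os.path.splitext(imagefile)[1], debug)
                async_results.append(pool.apply_async(f, args))
        # collect inside the `with` block; get() re-raises any exception raised in f()
        # instead of letting the failure disappear silently
        for r in async_results:
            r.get()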

Related

Speed up the data extraction process from dicom files

I am trying to extract images from DICOM files.
My folder structure is shown below:
> BATCH 4 BATCH 6 BATCH 8 Batch 29 Batch 30-35 Batch 36 Batch 37-38_1
> BATCH 5 BATCH 7 BATCH 9 Batch 29_1 Batch 30-35_1 Batch 37-38
Each batch contains thousands of DICOM images.
My broad approach is below:
I store all the batches in a single list, folder_list, and then iterate through all of the batches.
single_files contains every DICOM file in a batch, and I then iterate through each file in that batch.
After checking a few conditions on each file, I extract the image (pixel_array) and move it to the desired location.
The issue is that this is really slow and the complexity is O(n^2). Is there a way to speed it up?
Complete code:
import glob
import os
import shutil

import cv2
import pydicom
from pydicom import pixel_data_handlers

counter = 0
Source_folder_path = '/Path/*/'
destination_dir = '/Volumes/My Book/Extracted_Dataset'
folder_list = glob.glob(Source_folder_path)
for folder_dir in folder_list:
    single_files = glob.glob(os.path.join(folder_dir, '*'))
    final_destination = os.path.join(destination_dir, folder_dir.split('/')[-2])
    for i in single_files:
        print(i)
        dcm = pydicom.dcmread(i)
        name = dcm.PatientID
        dest = os.path.join(destination_dir, os.path.join(folder_dir, name))
        if dcm.PhotometricInterpretation == 'RGB':
            if dcm.Modality == "OP":
                if not os.path.isdir(dest):
                    os.mkdir(dest)
                img = dcm.pixel_array
                name = dcm.PatientID + '_' + str(counter) + '.png'
                counter += 1
                if dcm.LossyImageCompression:
                    if dcm.LossyImageCompression == '00':
                        img = pixel_data_handlers.util.convert_color_space(img, current='RGB', desired='YBR_FULL')
                image_to_write = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                cv2.imwrite(os.path.join(folder_dir, name), image_to_write)
                if not os.path.isdir(final_destination):
                    os.makedirs(final_destination)
                    shutil.move(os.path.join(folder_dir, name), final_destination)
                else:
                    shutil.move(os.path.join(folder_dir, name), final_destination)
The modified version as per the suggestion is below (CPU and I/O utilisation screenshots omitted). Can it be sped up more?
import glob
import os
import shutil
from multiprocessing import Pool

import cv2
import pydicom
from pydicom import pixel_data_handlers


def ProcessOne(f):
    """Process every DICOM file in a single batch folder."""
    counter = 0
    destination_dir = '/Volumes/My Book/Extracted_Dataset'
    folder_dir = f
    single_files = glob.glob(os.path.join(folder_dir, '*'))
    final_destination = os.path.join(destination_dir, folder_dir.split('/')[-2])
    for i in single_files:
        print(i)
        dcm = pydicom.dcmread(i)
        name = dcm.PatientID
        dest = os.path.join(destination_dir, os.path.join(folder_dir, name))
        if dcm.PhotometricInterpretation == 'RGB':
            if dcm.Modality == "OP":
                if not os.path.isdir(dest):
                    os.mkdir(dest)
                img = dcm.pixel_array
                name = dcm.PatientID + '_' + str(counter) + '.png'
                counter += 1
                if dcm.LossyImageCompression:
                    if dcm.LossyImageCompression == '00':
                        img = pixel_data_handlers.util.convert_color_space(img, current='RGB', desired='YBR_FULL')
                image_to_write = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                cv2.imwrite(os.path.join(folder_dir, name), image_to_write)
                if not os.path.isdir(final_destination):
                    os.makedirs(final_destination)
                    shutil.move(os.path.join(folder_dir, name), final_destination)
                else:
                    shutil.move(os.path.join(folder_dir, name), final_destination)


if __name__ == '__main__':
    # Create a pool of worker processes
    p = Pool()
    # Build the list of batch folders to process
    Source_folder_path = '/Path/*/'
    folder_list = glob.glob(Source_folder_path)
    print(f'Batches to process: {len(folder_list)}')
    # Map the list of batch folders onto the pool
    p.map(ProcessOne, folder_list)
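If the batch folders differ a lot in size, mapping per folder can leave some workers idle near the end. A possible refinement is sketched below; it assumes the per-file body of ProcessOne is pulled out into a hypothetical process_one_file(path) function, then flattens the file list and lets the pool hand files out in chunks:

import glob
import os
from multiprocessing import Pool


def process_one_file(path):
    # hypothetical: same per-file logic as inside ProcessOne (dcmread, checks, imwrite, move)
    ...


if __name__ == '__main__':
    all_files = [f for folder in glob.glob('/Path/*/')
                 for f in glob.glob(os.path.join(folder, '*'))]
    with Pool() as p:
        # chunksize reduces inter-process overhead while still balancing the load
        for _ in p.imap_unordered(process_one_file, all_files, chunksize=32):
            pass

Note that if the disk is the bottleneck (which the I/O utilisation may indicate), adding more processes will not help much.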

How best to parallelize grakn queries with Python?

I run Windows 10, Python 3.7, and have a 6-core CPU. A single Python thread on my machine submits 1,000 inserts per second to grakn. I'd like to parallelize my code to insert and match even faster. How are people doing this?
My only experience with parallelization is on another project, where I submit a custom function to a dask distributed client to generate thousands of tasks. Right now, this same approach fails whenever the custom function receives or generates a grakn transaction object/handle. I get errors like:
Traceback (most recent call last):
File "C:\Users\dvyd\.conda\envs\activefiction\lib\site-packages\distributed\protocol\pickle.py", line 41, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
...
File "stringsource", line 2, in grpc._cython.cygrpc.Channel.__reduce_cython__
TypeError: no default __reduce__ due to non-trivial __cinit__
I've never used Python's multiprocessing module directly. What are other people doing to parallelize their queries to grakn?
The easiest approach that I've found to execute a batch of queries is to pass a Grakn session to each thread in a ThreadPool. Within each thread you can manage transactions and of course do some more complex logic:
from grakn.client import GraknClient
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial


def write_query_batch(session, batch):
    tx = session.transaction().write()
    for query in batch:
        tx.query(query)
    tx.commit()


def multi_thread_write_query_batches(session, query_batches, num_threads=8):
    pool = ThreadPool(num_threads)
    pool.map(partial(write_query_batch, session), query_batches)
    pool.close()
    pool.join()


def generate_query_batches(my_data_entries_list, batch_size):
    batch = []
    for index, data_entry in enumerate(my_data_entries_list):
        batch.append(data_entry)
        if index % batch_size == 0 and index != 0:
            yield batch
            batch = []
    if batch:
        yield batch


# (Part 2) Somewhere in your application open a client and a session
client = GraknClient(uri="localhost:48555")
session = client.session(keyspace="grakn")

query_batches_iterator = generate_query_batches(my_data_entries_list, batch_size)
multi_thread_write_query_batches(session, query_batches_iterator, num_threads=8)

session.close()
client.close()
The above is a generic method. As a concrete example, you can use the above (omitting part 2) to parallelise batches of insert statements from two files. Appending this to the above should work:
import time

files = [
    {
        "file_path": f"/path/to/your/file.gql",
    },
    {
        "file_path": f"/path/to/your/file2.gql",
    }
]

KEYSPACE = "grakn"
URI = "localhost:48555"
BATCH_SIZE = 10
NUM_BATCHES = 1000


# Entry point where migration starts
def migrate_graql_files():
    start_time = time.time()

    for file in files:
        print('==================================================')
        print(f'Loading from {file["file_path"]}')
        print('==================================================')

        open_file = open(file["file_path"], "r")  # Here we are assuming you have 1 Graql query per line!
        batches = generate_query_batches(open_file.readlines(), BATCH_SIZE)

        with GraknClient(uri=URI) as client:  # Using `with` auto-closes the client
            with client.session(KEYSPACE) as session:  # Using `with` auto-closes the session
                multi_thread_write_query_batches(session, batches, num_threads=16)  # Pick `num_threads` according to your machine

        elapsed = time.time() - start_time
        print(f'Time elapsed {elapsed:.1f} seconds')

    elapsed = time.time() - start_time
    print(f'Time elapsed {elapsed:.1f} seconds')


if __name__ == "__main__":
    migrate_graql_files()
You should also be able to see how you can load from a CSV or any other file type in this way, taking the values you find in that file and substituting them into Graql query string templates. Take a look at the migration example in the docs for more on that.
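As a rough illustration of that templating idea (the CSV columns id and name and the person schema here are made up for the example), each CSV row can be turned into one insert string and then fed through generate_query_batches as above:

import csv


def csv_to_insert_queries(csv_path):
    # hypothetical CSV with columns: id, name
    queries = []
    with open(csv_path, newline='') as f:
        for row in csv.DictReader(f):
            queries.append(
                f'insert $p isa person, has person-id "{row["id"]}", has name "{row["name"]}";'
            )
    return queries


# queries = csv_to_insert_queries("people.csv")
# batches = generate_query_batches(queries, BATCH_SIZE)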
An alternative approach using multi-processing instead of multi-threading follows below.
We empirically found that multi-threading doesn't yield particularly large performance gains, compared to multi-processing. This is probably due to Python's GIL.
This piece of code assumes a file enumerating TypeQL queries that are independent of each other, so they can be parallelised freely.
from typedb.client import TypeDB, TypeDBClient, SessionType, TransactionType
import multiprocessing as mp
import queue


def batch_writer(database, kill_event, batch_queue):
    client = TypeDB.core_client("localhost:1729")
    session = client.session(database, SessionType.DATA)
    while not kill_event.is_set():
        try:
            batch = batch_queue.get(block=True, timeout=1)
            with session.transaction(TransactionType.WRITE) as tx:
                for query in batch:
                    tx.query().insert(query)
                tx.commit()
        except queue.Empty:
            continue
    print("Received kill event, exiting worker.")


def start_writers(database, kill_event, batch_queue, parallelism=4):
    processes = []
    for _ in range(parallelism):
        proc = mp.Process(target=batch_writer, args=(database, kill_event, batch_queue))
        processes.append(proc)
        proc.start()
    return processes


def batch(iterable, n=1000):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]


if __name__ == '__main__':
    batch_size = 100
    parallelism = 1
    database = "<database name>"
    file_path = "<PATH TO QUERIES FILE - ONE QUERY PER NEW LINE>"

    with open(file_path, "r") as file:
        statements = file.read().splitlines()[:]

    batch_statements = batch(statements, n=batch_size)
    total_batches = int(len(statements) / batch_size)
    if len(statements) % batch_size > 0:
        total_batches += 1

    batch_queue = mp.Queue(parallelism * 4)
    kill_event = mp.Event()
    writers = start_writers(database, kill_event, batch_queue, parallelism=parallelism)

    for i, batch in enumerate(batch_statements):
        batch_queue.put(batch, block=True)
        if i * batch_size % 10000 == 0:
            print("Loaded: {0}/{1}".format(i * batch_size, total_batches * batch_size))

    kill_event.set()
    batch_queue.close()
    batch_queue.join_thread()
    for proc in writers:
        proc.join()
    print("Done loading")

Split big file in multiple files in python3.x

I want to split the output into multiple files once the size of the file being written (file_write) exceeds 20 MB.
In random_function, I open big_file.txt, remove noise using remove_noise(), and write the clean lines to outfile.
I am not sure how to split the file based on size in my current implementation. Please find the code below:
(Apologies for not providing a proper implementation with an example, because it is really complicated.)
I have gone through the example at this link: Split large text file(around 50GB) into multiple files
import os


def parses(lines, my_date_list):
    for line in reversed(list(lines)):
        line = line.strip()
        if not line:
            continue
        date_string = "2019-11-01"  # assumption
        yield date_string, line


def remove_noise(line):
    """Dummy function."""
    return line


def random_function(path, output, cutoff="2019-10-31"):
    my_date_list = []
    if os.path.exists(path):
        with open(path) as f:
            lines = parses(f, my_date_list)
            for date, line in lines:
                if cutoff <= date:
                    results = remove_noise(line)
                    output.write(results + '\n')
                    continue
                else:
                    break
While writing lines to output, I need to check the size. Once the size reaches 20 MB, I want to start writing to a second file (maybe output_2), and so on.
if __name__ == '__main__':
    path = "./big_file.txt"
    file_write = "./write_file.txt"
    with open(file_write, 'w') as outfile:
        random_function(path=path, output=outfile)
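One way to handle the 20 MB limit (a sketch only; the RollingWriter class and the 20 * 1024 * 1024 threshold are illustrative, not part of the original code) is to wrap the output in a small file-like object that reopens a new numbered file once the byte count passes the threshold, and pass that object to random_function as output:

import os


class RollingWriter:
    """Write-only file-like object that rolls to write_file_2.txt, write_file_3.txt, ... past max_bytes."""

    def __init__(self, base_path, max_bytes=20 * 1024 * 1024):
        self.base, self.ext = os.path.splitext(base_path)
        self.max_bytes = max_bytes
        self.index = 1
        self._file = open(base_path, 'w')

    def write(self, text):
        # roll over to the next numbered file once the current one is big enough
        if self._file.tell() >= self.max_bytes:
            self._file.close()
            self.index += 1
            self._file = open('{}_{}{}'.format(self.base, self.index, self.ext), 'w')
        self._file.write(text)

    def close(self):
        self._file.close()


# usage: random_function(path=path, output=RollingWriter('./write_file.txt'))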

Automation via python

Completely new to python so forgive me if this a dumb question.
Part of my working tasks is to upgrade the IOS on various Cisco routers and switches.
The most mind-numbing part of this is comparing the pre-change config with the post-change config.
I use ExamDiff for this, but with up to 100 devices each night it gets soul-destroying.
Is it possible to get Python to open ExamDiff and automatically compare the pre and post checks, saving the differences to a file for each device?
I know I can use the os module to launch ExamDiff, but I have no idea how to get ExamDiff to do the comparison automatically.
Can someone point me in the right direction?
Thanks
I got this, and it works pretty well:
#!/usr/bin/python
import os

path = input("Enter the files location: ")


def nexus():
    rootdir = path + os.sep
    filelist = os.listdir(rootdir)
    if filelist:
        for file in filelist:
            if 'pre' in file:
                prefile = file
                postfile = file.replace('pre', 'post')
                resultfile = file.replace('pre', 'report')
                if postfile in filelist:
                    prefile = rootdir + prefile
                    postfile = rootdir + postfile
                    resultfile = rootdir + resultfile
                    compare(prefile, postfile, resultfile)
                else:
                    print('No corresponding "post"-file to {0}.'.format(prefile))
    else:
        print('No files found.')


def compare(file1loc, file2loc, comparefileloc):
    with open(file1loc, 'r') as file1:
        file1lines = file1.readlines()
        file1lines = [x.strip() for x in file1lines]  # getting rid of whitespace and breaks
    with open(file2loc, 'r') as file2:
        file2lines = file2.readlines()
        file2lines = [x.strip() for x in file2lines]  # getting rid of whitespace and breaks
    with open(comparefileloc, 'w') as comparefile:
        comparefile.write('===== IN FILE 1 BUT NOT FILE 2 =====\r\n')
        for file1line in file1lines:
            if file1line not in file2lines:
                comparefile.write(file1line + '\r\n')
        comparefile.write('\r\n')
        comparefile.write('===== IN FILE 2 BUT NOT FILE 1 =====\r\n')
        for file2line in file2lines:
            if file2line not in file1lines:
                comparefile.write(file2line + '\r\n')


if __name__ == '__main__':
    nexus()
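If a proper line-by-line diff is ever preferred over the set-membership report above, Python's standard difflib can produce a unified diff with no external tool. A small sketch (the file names in the usage comment are placeholders):

import difflib


def write_unified_diff(prefile, postfile, reportfile):
    with open(prefile) as f1, open(postfile) as f2:
        pre_lines = f1.readlines()
        post_lines = f2.readlines()
    # unified_diff yields diff lines ready to be written out
    diff = difflib.unified_diff(pre_lines, post_lines, fromfile=prefile, tofile=postfile)
    with open(reportfile, 'w') as out:
        out.writelines(diff)


# write_unified_diff('router1_pre.txt', 'router1_post.txt', 'router1_report.txt')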

Save CSV for every functioncall with another name

At the moment I am able to create one CSV file with all the content I get at once.
Now I would like to create a list with different names in it.
How can I produce a different CSV file name for every function call? I thought about looping over a list, but I really just want a +1 increment on each call, i.e. saving some state and using it in the next call. Every time I initialize my counter with 0, so I never get past 1. I think I could do it with Python function parameters, but I have no idea how to use them. Can someone give me a little tip or example? If there are better ideas (maybe my idea is completely off), please help.
The comments in the code shall represent my imagination.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from tenable.sc import SecurityCenter as SC
import os.path
import sys
import getpass
import csv

SC_HOST = '...'


def parse_entry(entry):
    split_after_path = ''
    ip = entry.get('ip', None)
    pluginText = entry.get('pluginText', None)
    if 'Path : ' in pluginText:
        for line in pluginText.splitlines(0):
            if 'Path : ' in line:
                split_after_path = line.split("Path : ", 1)[1]
                # place = ['place1', 'place2', 'place3', 'place4', 'place5']
                # i = 0
                # i = i+1
                file_exists = os.path.isfile('testfile_path.csv')
                # file_exists = os.path.isfile('testfile_path_'+place[i]+'.csv')
                data = open('testfile_path.csv', 'a')
                # data = open('testfile_path_'+place[i]+'.csv', 'a')
                with data as csvfile:
                    header = ['IP Address', 'Path']
                    writer = csv.DictWriter(csvfile, lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC, fieldnames=header)
                    if not file_exists:
                        writer.writeheader()
                    writer.writerow({'IP Address': ip, 'Path': split_after_path})
                data.close()


def main():
    sc_user = input('[<<] username: ')
    sc_pass = getpass.getpass('[<<] password: ')
    sc = SC(SC_HOST)
    sc.login(sc_user, sc_pass)

    # Query API for data
    # asset = [12,13,14,25,29]
    # i = 0
    # assetid = asset[i]
    # vuln = sc.analysis.vulns(('pluginID', '=', '25072')('asset','=','assetid'))
    # i = i+1
    vuln = sc.analysis.vulns(('pluginID', '=', '25072'), ('asset', '=', '11'))
    for entry in vuln:
        parse_entry(entry)

    sc.logout()
    return 0


if __name__ == '__main__':
    sys.exit(main())
The simplest and most obvious solution is to pass the full file path to your parse_entry function, ie:
def parse_entry(entry, filepath):
    # ...
    if 'Path : ' in pluginText:
        for line in pluginText.splitlines(0):
            if 'Path : ' in line:
                # ...
                file_exists = os.path.isfile(filepath)
                with open(filepath, 'a') as csvfile:
                    # ...
Then in main() use enumerate() to build sequential filenames:
def main():
    # ...
    for i, entry in enumerate(vuln):
        path = "testfile_path_{}.csv".format(i)
        parse_entry(entry, path)
You can use a function attribute to keep track of the number of times the function has been called.
def parse_entry(entry):
    parse_entry.i += 1
    # ...

# outside the function you have to initialize the attribute
parse_entry.i = 0
Or you can look at other ways to initialize the function attribute in this post.
Alternatively, you can use glob to get the current number of files.
from glob import glob
i = len(glob('testfile_path_*.csv'))
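Combining the glob count with the asker's naming scheme (the helper name next_csv_path is illustrative, not from the original code) would look roughly like this:

from glob import glob


def next_csv_path():
    # count the existing output files and name the next one accordingly
    i = len(glob('testfile_path_*.csv'))
    return 'testfile_path_{}.csv'.format(i)


# e.g. inside parse_entry: filepath = next_csv_path()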
