Read multiple files multiprocessing - python-3.x

I have a simple function that scans files for a special string, but as these files are on a slow remote file storage, I need to scan them parallel.
I guess I need to use multiprocessing, but I am not sure how to do that correctly.
Here is my function:
from fnmatch import fnmatch
import os
from shutil import copy
from pprint import pprint
def getFailedFile(directory_name, folder_to_write):
for file in os.listdir(directory_name):
if fnmatch(file, '*Response.txt'):
filename = directory_name + file
try:
with open(filename, 'r', encoding='utf-8') as myfile:
data = myfile.read()
if data.find('Exception') != -1:
try:
requestFile = directory_name + file.replace('Response', 'Request')
copy(requestFile, os.getcwd() + folder_to_write)
except FileNotFoundError:
print('no such file - ', requestFile)
except UnicodeDecodeError:
print('error unicode decode -', filename)
directory_name = 'some folder'
folder_to_write = 'some folder_to_write'
getFailedFile(directory_name=directory_name, folder_to_write)
Please help. Currently it takes about 4 hours due to number of files in the destination folder.

Finally figured out how to do that:
from fnmatch import fnmatch
import os
from shutil import copy
from multiprocessing import Pool
import time
import logging
def process_file(file):
directory_name = 'directory with files'
if fnmatch(file, '*Response.txt'):
filename = directory_name + file
try:
with open(filename, 'r', encoding='utf-8') as myfile:
data = myfile.read()
if data.find('xception') != -1:
try:
requestFile = directory_name + file.replace('Response', 'Request')
responseFile = directory_name + file
try:
copy(requestFile, 'directory to write')
copy(responseFile, 'directory to write')
except Exception as e:
logging.info(str(e) + '\n')
print(str(e))
except FileNotFoundError:
print('no such file - ', requestFile)
logging.info('no such file - ' + str(requestFile) + '\n')
except UnicodeDecodeError:
print('error unicode decode -', filename)
logging.info('error unicode decode -' + str(filename) + '\n')
if __name__ == '__main__':
try:
directory_name = 'directory with files'
number_of_processes = 50
logging.info('\n' + 'Number of processes - ' + str(number_of_processes))
logging.info('Directory to scan ' + directory_name)
pool = Pool(number_of_processes)
start_time = time.time()
pool.map(process_file, os.listdir(directory_name))
pool.close()
elapsed_time = time.time() - start_time
logging.info('Elapsed time - ' + str(elapsed_time / 60) + '\n')
except Exception as e:
logging.info(str(e) + '\n')
I know that the code is not so pretty, but it works 27 minutes instead of previous elapsed time.

Related

Python, selecting a specific file

I need to select a specific file, after specifying the number of this file in "if"...
Instead of "MTS.txt " we need the full path to the selected file using a digit!
I have not found any information about this on the Internet. Perhaps he made up his request badly!
from email import message
from click import clear
import pywhatkit
import pyfiglet
import os
import time
directory = 'MTS/'
# files = os.listdir(directory)
f = True
while (f==True):
def main():
os.system('CLS')
# print(files)
mtstxt = pyfiglet.figlet_format("Wa Helper", font = "smslant")
print(mtstxt)
def send_message_inst():
mobile = '+7' + input(' Номер: +7')
with os.scandir(directory) as files:
for i, file in enumerate(files):
print(' ', i + 1, file.name)
nof = os.listdir(path="MTS/")
# print(len(nof))
while True:
tarif = int(input(' Вариант: '))
if tarif >= 1 and tarif <= int(len(nof)):
msg = open("MTS.txt", encoding='UTF-8')
message = msg.read()
msg.close()
break
else:
os.system('CLS')
print(mtstxt)
print(' Номер:', mobile)
with os.scandir(directory) as files:
for i, file in enumerate(files):
print(' ', i + 1, file.name)
print("\033[31m{}\033[0m".format(" Ошибка - неверный вариант!"))
pywhatkit.sendwhatmsg_instantly(phone_no=mobile, message=message)
send_message_inst()
os.system('CLS')
sndtxt = pyfiglet.figlet_format("Sent!", font = "smslant")
print(sndtxt)
time.sleep(5)
if __name__ == '__main__':
main()
else:
break

Chose which function to run in script

I'm trying to make it so that the user chooses which function to run using if.
import os
import csv
import collections
import datetime
import pandas as pd
import time
import string
import re
import glob, os
folder_path = 'C:/ProgramData/WebPort/system/tags'
folder2_path = 'C:/ProgramData/WebPort/system'
search2_str = '"Prefix"'
print("Choices:\n 1 - Read from CSV\n 2 - Read from WPP")
x = input("Please enter your choice:\n")
x = int(x)
if x == 1:
csv_file_list = glob.glob(folder_path + '/*.csv')
with open("csv.txt", 'w') as wf:
for file in csv_file_list:
print(glob.glob(folder_path + '/*.csv'))
with open(file) as rf:
for line in rf:
if line.strip(): # if line is not empty
if not line.endswith("\n"):
line+="\n"
wf.write(line)
print('Reading from .csv')
elif x == 2:
for root, dirs, files in os.walk(folder2_path):
for file in files:
if file.endswith(".wpp"):
print(os.path.join(root, file))
with open(os.path.join(root, file), 'r') as fr, open ("wpp.txt",'a', encoding='utf-8') as fw:
for i,line in enumerate(fr):
if line.find(search2_str) != -1:
fw.write(line)
print('Reading from .wpp')
else:
print('wrong choice')
Getting Invalid syntax in line 34 using this.

Best way to check the PDF file is corrupt using python

I try to check the PDF files are corrupted in windows environment and come up with following python code.
Just want to check is it the best way to check corrupted PDF files or is there any other easy way?
Note: C:\Temp\python\sample-map (1).pdf is the corrupted PDF file
Here is the sample code
import os
import subprocess
import re
from subprocess import Popen, PIPE
def checkFile(fullfile):
proc=subprocess.Popen(["file", "-b", fullfile], shell=True, stdout=PIPE, stderr=PIPE, bufsize=0)
# -b, --brief : do not prepend filenames to output lines
out, err = proc.communicate()
exitcode = proc.returncode
return exitcode, out, err
def searchFiles(dirpath):
pwdpath=os.path.dirname(os.path.realpath(__file__))
print("running path : %s" %pwdpath )
if os.access(dirpath, os.R_OK):
print("Path %s validation OK \n" %dirpath)
listfiles=os.listdir(dirpath)
for files in listfiles:
fullfile=os.path.join(dirpath, files)
if os.access(fullfile, os.R_OK):
code, out, error = checkFile(fullfile)
if str(code) !="0" or str(error, "utf-8") != "" or re.search("^(?!PDF(\s)).*", str(out,'utf-8')):
print("ERROR " + fullfile+"\n################")
else:
print("OK " + fullfile+"\n################")
else:
print("$s : File not readable" %fullfile)
else:
print("Path is not valid")
if __name__ == "__main__":
searchFiles('C:\Temp\python')
sample output :
$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK
OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################
I think you can use PyPDF2 module.
pip install pypdf2
The code is as follows.
from PyPDF2 import PdfFileReader
import os
def checkFile(fullfile):
with open(fullfile, 'rb') as f:
try:
pdf = PdfFileReader(f)
info = pdf.getDocumentInfo()
if info:
return True
else:
return False
except:
return False
def searchFiles(dirpath):
pwdpath = os.path.dirname(os.path.realpath(__file__))
print("running path : %s" %pwdpath )
if os.access(dirpath, os.R_OK):
print("Path %s validation OK \n" %dirpath)
listfiles = os.listdir(dirpath)
for f in listfiles:
fullfile = os.path.join(dirpath, f)
if checkFile(fullfile):
print("OK " + fullfile + "\n################")
else:
print("ERROR " + fullfile + "\n################")
else:
print("Path is not valid")
if __name__ == "__main__":
searchFiles('C:\Temp\python')
I tried to match your coding style.
I think this code can also be used on MacOS or Linux.

how to convert a directory of image on png to jpg in python

from PIL import Image
from os import listdir
from os.path import splitext
import cv2
target_directory = r"E:\pre\png"
target = '.jpg'
jpg_folder_path = r"E:\pre\jpeg"
for file in listdir(target_directory):
filename, extension = splitext(file)
try:
if extension not in ['.py', target]:
im = Image.open(filename + extension)
#im.save((os.path.join(jpg_folder_path, im))filename + target)
cv2.imwrite(os.path.join(jpg_folder_path , filename + target), im)
except OSError:
print('Cannot convert %s' % file)
OUTPUT
Cannot convert 000c1434d8d7.png
Cannot convert 00a8624548a9.png
..
Use pathlib for the filesystem access. That is more pythonic way to do.
from pathlib import Path
from PIL import Image
inputPath = Path("E:/pre/png")
inputFiles = inputPath.glob("**/*.png")
outputPath = Path("E:/pre/jpeg")
for f in inputFiles:
outputFile = outputPath / Path(f.stem + ".jpg")
im = Image.open(f)
im.save(outputFile)
This worked like this:
import cv2, os
def tif_to_jpeg_converter(filePath):
base_path = filePath
new_path = filePath
for infile in os.listdir(base_path):
# print("file : " + infile)
read = cv2.imread(base_path + infile)
outfile = infile.split('.')[0] + '.jpg'
cv2.imwrite(new_path + outfile, read, [int(cv2.IMWRITE_JPEG_QUALITY), 200])
# Deleting the .tiff file after converting
if infile[-3:] == "tif":
print(infile)
os.remove(filePath + '/' + infile)
# check if file exists or not
if __name__ == "__main__":
print("cleaning the files")
tif_to_jpeg_converter("images/")

Python permission error unskippable

I'm trying to make an antivirus in python, but I keep getting a permissions denied error when i get its hash and check even when I make it ignore the error and try and move on to the next file to hash and check. I'm fine with the error, but I'm trying it to make it move to the next file and nothing I try works. I'm using windows.
AV.py
#!/usr/bin/python3.5
import os
import sys
import time
import glob
import simple_hasher
from colorama import *
init()
#Begin code
#variable_set
virlfile = "ListFile.hash"
vircount = 0
virusdel = []
hashtype = "MD5"
#variable_end
#fileloadhash
if not os.path.isfile(virlfile):
try:
with open(virlfile, "w") as f:
f.close
except IOError as err:
print("Error, Something went WRONG!")
print("Error as reported:")
print("{}".format(err))
with open(virlfile) as f:
data = f.read()
#scanner
filecount = 0
try:
for file in glob.glob("./**/*", recursive = True):
if os.path.isfile(file):
file_hash = simple_hasher.get_hash(file, hashtype)
if file_hash in data:
print("{} | {}".format(file_hash, Fore.CYAN + file + " (!)ALERT" + Style.RESET_ALL))
vircount += 1
virusdel.append(file)
filecount += 1
else:
print("{} | {}".format(file_hash, file))
filecount += 1
except OSError:
pass
if vircount > 0:
print("\n{} Viruses Detected. Delete? Y/N".format(vircount))
try:
choice = input(">> ").lower().split(" ")
except KeyboardInterrupt:
print("SCAN ABORTED!")
try:
if choice[0] == "y":
for file in virusdel:
try:
os.remove(file)
print("File Deleted - {}".format(os.path.abspath(file)))
except Exception as err:
print ("Unable to remove file.\n{}".format(err))
print("\n(+) All Viruses Removed.\n")
else:
sys.exit()
except Exception as err:
print("Error: {}".format(err))
else:
print("\n(+)No viruses\n")
sys.exit()
I also have a custom module being imported:
simple_hasher.py
import hashlib
import os
import sys
def get_hash(file, ver):
if ver.lower() == "md5":
h = hashlib.md5()
elif ver.lower() == "sha1":
h = hashlib.sha1()
elif ver.lower() == "sha256":
h = hashlib.sha256()
else:
h = hashlib.sha1()
while not False:
try:
with open(file, "rb") as f:
while True:
data = f.read(2 ** 20)
if not data: break
h.update(data)
return h.hexdigest()
except Exception as err:
print("[Debug] [{}] - Message: {}".format(os.path.split(__file__)[1], err))
pass
The error:
[Debug] [simple_hasher.py] - Message: [Errno 13] Permission denied: '.\\hiberfil.sys'

Resources