How to distinguish a PDF file from other files? - python-3.x

I have to deal with a lot of files. How can I distinguish which ones are PDF files and which are not? I am running Python on Windows. Thanks for any help.

If you want to rely on the file extension, you can use the following code:
#!python3
import os
def isPDFfile(fname):
    """Return True when *fname* ends with a ``.pdf`` extension (any case)."""
    extension = os.path.splitext(fname)[1]
    return extension.lower() == '.pdf'
if __name__ == '__main__':
    # Classify every entry of the current directory by its extension.
    for entry in os.listdir('.'):
        verdict = 'is PDF file.' if isPDFfile(entry) else 'is not PDF file.'
        print(entry, verdict)
If you want to be sure that the name is not a directory, you can add the test:
def isPDFfile(fname):
    """Return True when *fname* is an existing regular file named ``*.pdf``.

    Directories and missing paths yield False even if their name ends
    with ``.pdf``; the extension comparison ignores case.
    """
    extension = os.path.splitext(fname)[1]
    return os.path.isfile(fname) and extension.lower() == '.pdf'
There is also os.walk() function that iterates through the files in a directory. If you want to find all PDF files inside a directory, you can write your own specialized walk that will return only PDF files:
def walkPDFfiles(directory):
    """Yield the path of every ``.pdf`` file found below *directory*.

    The tree is traversed with ``os.walk``; the extension comparison
    ignores case.
    """
    for folder, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if os.path.splitext(filename)[1].lower() != '.pdf':
                continue
            yield os.path.join(folder, filename)
And you can use it in the loop like that:
# Report every PDF found below the current directory.
for pdf_path in walkPDFfiles('.'):
    print(pdf_path, 'is PDF file.')

If you don't trust the extension of the file name, you can read the first few bytes of the file and test whether they start with %PDF-
Like so:
with open(fn, 'rb') as fin:
    # Only the leading bytes are needed to inspect the signature.
    line = fin.read(20)
# The file is read in binary mode, so the signature has to be compared
# as bytes; comparing against a str raises TypeError in Python 3.
if line.startswith(b'%PDF-'):
    # it is a PDF file...
    # you can parse the PDF version from the "x.x" that follows "%PDF-"
    pass
else:
    # it is not a PDF file
    pass

Related

Download every file and every subdirectory from an ftp server [duplicate]

This will not download the contents of sub-directories; how can I do so?
import ftplib
import configparser
import os
# Names of the remote sub-directories collected by add_directory().
directories = []


def add_directory(line):
    """Record the directory name carried by one ``LIST`` output line.

    Meant to be used as the callback of
    ``ftp.retrlines('LIST', add_directory)``.  A line is treated as a
    directory entry when it starts with ``d``; the name is taken from the
    ninth whitespace-separated field (assumes the usual ``ls -l`` style
    layout -- confirm for the server in use).  Lines that are not
    directory entries, or that have fewer than nine fields, are ignored.
    """
    if not line.startswith('d'):
        return
    # Split at most 8 times so the ninth field keeps any spaces that
    # belong to the directory name.
    bits = line.split(None, 8)
    if len(bits) < 9:
        return
    directories.append(bits[8])
def makeDir(archiveTo):
    """Create a local folder under *archiveTo* for each collected directory.

    Iterates the module-level ``directories`` list; folders that already
    exist are reported and left untouched.
    """
    for dirname in directories:
        target = os.path.join(archiveTo, dirname)
        if not os.path.isdir(target):
            os.mkdir(target)
            continue
        print("Directory \"" + dirname + "\" already exists!")
def getFiles(archiveTo, ftp):
    """Download every non-directory entry of the FTP working directory.

    :param archiveTo: local folder that receives the files
    :param ftp: a connected ``ftplib.FTP`` instance

    Names present in the module-level ``directories`` list are skipped.
    """
    for filename in ftp.nlst():
        if filename in directories:
            continue
        local_path = os.path.join(archiveTo, filename)
        # The with-block closes the local file even if the transfer fails.
        with open(local_path, 'wb') as local_file:
            ftp.retrbinary('RETR %s' % filename, local_file.write)
def runBackups():
    """Back up the configured folder of every server in ``connections.ini``.

    Each section of the INI file must provide ``uri``, ``username``,
    ``password``, ``backuppath`` and ``archiveto``.  Only the top level of
    ``backuppath`` is downloaded; sub-directories are created locally but
    their contents are not fetched.
    """
    # Load INI (SafeConfigParser was removed in Python 3.12; ConfigParser
    # is its replacement).
    config = configparser.ConfigParser()
    config.read('connections.ini')
    for section in config.sections():
        # Load Settings
        uri = config.get(section, "uri")
        username = config.get(section, "username")
        password = config.get(section, "password")
        backupPath = config.get(section, "backuppath")
        archiveTo = config.get(section, "archiveto")
        # Start Back-ups; the with-block ends the session even on errors.
        with ftplib.FTP(uri) as ftp:
            ftp.login(username, password)
            ftp.cwd(backupPath)
            # Forget the directories collected for the previous server.
            directories.clear()
            # Map Directory Tree
            ftp.retrlines('LIST', add_directory)
            # Make Directories Locally
            makeDir(archiveTo)
            # Gather Files
            getFiles(archiveTo, ftp)
    print()
    print("Back-ups complete.")
    print()
this should do the trick :)
import sys
import ftplib
import os
from ftplib import FTP
ftp = FTP("ftp address")
ftp.login("user", "password")


def downloadFiles(path, destination):
    """Recursively mirror the remote folder *path* below *destination*.

    Both arguments are strings of the form "/dir/folder/something/";
    *path* must be the absolute remote path of the root folder of the tree
    to download.  Uses the module-level ``ftp`` connection and ends the
    program when *path* cannot be entered.
    """
    # Local counterpart of the remote folder: destination without its
    # trailing slash, followed by the absolute remote path.
    local_dir = destination[:-1] + path
    try:
        ftp.cwd(path)
    except ftplib.error_perm:
        # invalid entry (ensure input form: "/dir/folder/something/")
        print("error: could not change to " + path)
        sys.exit("ending session")
    # clone path to destination, creating missing parent folders as well
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
        print(local_dir + " built")
    # list children:
    for file in ftp.nlst():
        try:
            # this will check if file is folder:
            ftp.cwd(path + file + "/")
        except ftplib.error_perm:
            # not a folder with accessible content: download it.
            # The absolute remote name is used because the FTP working
            # directory may have moved while exploring a sub-folder.
            with open(os.path.join(local_dir, file), "wb") as f:
                ftp.retrbinary("RETR " + path + file, f.write)
            print(file + " downloaded")
        else:
            # if so, explore it:
            downloadFiles(path + file + "/", destination)
    return


source = "/ftproot/folder_i_want/"
dest = "/systemroot/where_i_want_it/"
downloadFiles(source, dest)
This is a very old question, but I had a similar need that I wanted to satisfy in a very general manner. I ended up writing my own solution that works very well for me. I've placed it on Gist here https://gist.github.com/Jwely/ad8eb800bacef9e34dd775f9b3aad987
and pasted it below in case I ever take the gist offline.
Example usage:
import ftplib
# mysite, username, password, remote_dir and local_dir are placeholders
# that are not defined in this snippet; supply your own values.
ftp = ftplib.FTP(mysite, username, password)
download_ftp_tree(ftp, remote_dir, local_dir)
The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the directory and its entire contents into the "local_dir".
It invokes the script below.
import ftplib
import os
def _is_ftp_dir(ftp_handle, name, guess_by_extension=True):
""" simply determines if an item listed on the ftp server is a valid directory or not """
# if the name has a "." in the fourth to last position, its probably a file extension
# this is MUCH faster than trying to set every file to a working directory, and will work 99% of time.
if guess_by_extension is True:
if name[-4] == '.':
return False
original_cwd = ftp_handle.pwd() # remember the current working directory
try:
ftp_handle.cwd(name) # try to set directory to new name
ftp_handle.cwd(original_cwd) # set it back to what it was
return True
except:
return False
def _make_parent_dir(fpath):
""" ensures the parent directory of a filepath exists """
dirname = os.path.dirname(fpath)
while not os.path.exists(dirname):
try:
os.mkdir(dirname)
print("created {0}".format(dirname))
except:
_make_parent_dir(dirname)
def _download_ftp_file(ftp_handle, name, dest, overwrite):
    """ Fetch the remote file *name* into the local path *dest*.

    The parent folder of *dest* is created first.  An existing *dest* is
    kept unless *overwrite* is True.
    """
    _make_parent_dir(dest)
    if os.path.exists(dest) and overwrite is not True:
        print("already exists: {0}".format(dest))
        return
    with open(dest, 'wb') as f:
        ftp_handle.retrbinary("RETR {0}".format(name), f.write)
    print("downloaded: {0}".format(dest))
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension):
    """ Replicate a directory on an ftp server recursively.

    Every item listed by ``nlst(name)`` is either descended into (when it
    is a directory) or downloaded to a local path equal to its remote
    name.
    """
    for item in ftp_handle.nlst(name):
        # Forward the caller's choice; otherwise the default is always used.
        if _is_ftp_dir(ftp_handle, item, guess_by_extension):
            _mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension)
        else:
            _download_ftp_file(ftp_handle, item, item, overwrite)
def download_ftp_tree(ftp_handle, path, destination, overwrite=False, guess_by_extension=True):
    """
    Downloads an entire directory tree from an ftp server to the local destination

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param path: the folder on the ftp server to download
    :param destination: the local directory to store the copied folder
    :param overwrite: set to True to force re-download of all files, even if they appear to exist already
    :param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
        If this flag is set to True, any name ending with a three character extension ".???" is assumed to be
        a file and not a directory. Set to False if some folders may have a "." in the fourth to last position.

    Files are written relative to *destination* by temporarily changing the
    process working directory; the previous working directory is restored
    afterwards, even when the download fails.
    """
    original_directory = os.getcwd()
    os.chdir(destination)
    try:
        _mirror_ftp_dir(ftp_handle, path, overwrite, guess_by_extension)
    finally:
        os.chdir(original_directory)
This is an alternative: you can try using the ftputil package. You can then use it to walk the remote directories and get your files.
Using ftp.mlsd() instead of ftp.nlst():
import sys
import ftplib
import os
from ftplib import FTP
def fetchFiles(ftp, path, destination, overwrite=True):
    '''Fetch a whole folder from ftp. \n
    Parameters
    ----------
    ftp : ftplib.FTP object
    path : string ('/dir/folder/')
    destination : string ('D:/dir/folder/') folder where the files will be saved
    overwrite : bool - Overwrite file if already exists.
    '''
    # Local counterpart of the remote folder.
    local_dir = destination[:-1] + path
    try:
        ftp.cwd(path)
    except ftplib.error_perm:
        # invalid entry (ensure input form: "/dir/folder/")
        print("error: could not change to " + path)
        sys.exit("ending session")
    # makedirs also creates missing parent folders
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
        print('New folder made: ' + local_dir)
    print('Current folder: ' + path)
    # list children (read the whole listing before sending other commands):
    for name, facts in list(ftp.mlsd()):
        kind = facts.get('type')
        if kind in ('cdir', 'pdir'):
            # entries for the current / parent directory, not real children
            continue
        if kind == 'file':
            fullpath = os.path.join(local_dir, name)
            if not overwrite and os.path.isfile(fullpath):
                continue
            # The absolute remote name is used because recursing into a
            # sub-folder changes the FTP working directory.
            with open(fullpath, 'wb') as f:
                ftp.retrbinary('RETR ' + path + name, f.write)
            print(name + ' downloaded')
        elif kind == 'dir':
            fetchFiles(ftp, path + name + '/', destination, overwrite)
        else:
            print('Unknown type: ' + str(kind))
if __name__ == "__main__":
    # Log in, mirror the remote folder into the local one, end the session.
    source = r'/Folder/'
    dest = r'D:/Data/'
    ftp = FTP('ftp address')
    ftp.login('user', 'password')
    fetchFiles(ftp, source, dest, overwrite=True)
    ftp.quit()
Using ftputil, a fast solution could be:
def download(folder):
    """Copy the remote tree rooted at *folder* to the same relative local paths.

    Relies on a module-level ``ftp`` object that provides ``walk``,
    ``path.join`` and ``download`` (presumably an ``ftputil.FTPHost`` --
    confirm against the caller).
    """
    for remote_dir, subdirs, files in ftp.walk(folder):
        print("Creating dir " + remote_dir)
        # exist_ok lets the download be re-run over an existing tree
        os.makedirs(remote_dir, exist_ok=True)
        for subdir in subdirs:
            print("Subdirs " + subdir)
        for file in files:
            print(r"Copying File {0} \ {1}".format(remote_dir, file))
            ftp.download(ftp.path.join(remote_dir, file), os.path.join(remote_dir, file))
It is non-trivial, at least. In the simplest case, you assume you only have files and directories. This isn't always the case: there are soft links, hard links and Windows-style shortcuts. Soft links and directory shortcuts are particularly problematic since they make recursive directories possible, which would confuse a naively implemented FTP grabber.
How you handle such recursive directories depends on your needs; you might simply not follow soft links, or you might try to detect recursive links. Detecting recursive links is inherently tricky; you cannot do it reliably.

Python 3 will not read appended data to file

I am trying to write binary data to a file. The program first checks whether the file exists. If the file does not exist, the program creates the file and writes data into it; if it does exist, the data is appended to the file. Yet whenever I try to read the file, I cannot read the appended data, only the data written when the file was first created.
def getText(self):
    """Return the content of the input box encrypted with Fernet."""
    self.readKey()
    plain_text = self.inBox.get('1.0', 'end')
    cipher = Fernet(self.readKey())
    return cipher.encrypt(plain_text.encode())
def writeFile(self):
    """Append the encrypted input-box text to data.txt, one token per line."""
    # 'ab' creates the file when it is missing, so no existence check is
    # needed.  The newline separates the tokens: each encrypted token has
    # to be decrypted on its own, so they must not be glued together.
    with open('data.txt', mode='ab') as file:
        file.write(self.getText() + b'\n')
    self.inBox.delete('1.0', 'end')

def openFile(self):
    """Decrypt every token stored in data.txt and show it in the output box."""
    self.outBox.delete('1.0', 'end')
    fen = Fernet(self.readKey())
    try:
        f = open("data.txt", mode='rb')
    except OSError:
        alert_popup(self, 'Error', 'No File Exists')
        # Nothing to show; without this return, f would be unbound below.
        return
    with f:
        # Tokens are whitespace separated (one per line).
        for token in f.read().split():
            self.outBox.insert(tk.END, fen.decrypt(token))
Use mode 'a'.
# 'ab' appends in binary mode and creates the file when it is missing, so
# no existence check is needed; binary mode is required because the
# encrypted text is bytes.
with open('data.txt', mode='ab') as file:
    file.write(self.getText())

Using os.walk to create a filelist for each directory

I am attempting to use os.walk to create a list of files per subdirectory, and, execute a function to merge all pdf's in each directory list. The current script appends subsequent directories to the existing list with each loop. So, pdfs in directory1 are merged successfully, but, the list for directory2 includes the pdfs from directory1 etc. I want it to refresh the list of files for each directory. Here is the script I am using currently:
import PyPDF2
import os
import sys
if len(sys.argv) > 1:
    SearchDirectory = sys.argv[1]
    print("I'm looking for PDF's in ", SearchDirectory)
else:
    print("Please tell me the directory to look in")
    sys.exit()

for root, dirs, files in os.walk(SearchDirectory):
    # Sorting in place makes os.walk visit the sub-directories in order.
    dirs.sort()
    # Start from an empty list for every directory so that PDFs found in
    # earlier directories are not merged again.
    pdfFiles = []
    if os.path.basename(root) == "frames":
        for file in sorted(files):
            if file.endswith('.pdf'):
                print("Discovered this pdf: ", os.path.join(root, file))
                pdfFiles.append(os.path.join(root, file))
    if not pdfFiles:
        print("No pdfs found in this directory:", root)
        continue
    # One writer per directory: a shared writer would keep the pages of
    # every directory processed before.
    pdfWriter = PyPDF2.PdfFileWriter()
    openInputs = []
    try:
        for file in pdfFiles:
            # The inputs stay open until the merged file has been written;
            # the writer may still read page data from them at that point.
            pdfFileObj = open(file, 'rb')
            openInputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(0, pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))
            print("The following pdf has been successfully appended:", file)
        # The merged file is named after the parent of the "frames" folder.
        with open(os.path.split(os.path.realpath(root))[0] + ".pdf", "wb") as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in openInputs:
            pdfFileObj.close()
The os.walk loop iterates once per directory. So you want to create a new PDFWriter for every directory.
It's also a good idea to use continue to bail out of the loop as soon as possible, this keeps the nesting flat.
By convention (PEP 8), names that start with a capital letter are reserved for classes, so it should be searchDirectory, written with a small s.
Finally, take advantage of with blocks for handling I/O - they automatically call .close() for you.
I'm not going to install PyPDF2 just for this question, but this approach looks reasonable:
for root, dirs, files in os.walk(searchDirectory):
    if not os.path.basename(root) == "frames":
        continue
    # Only PDF files can be merged; other files in the folder are skipped.
    pdfFiles = [os.path.join(root, file) for file in sorted(files) if file.endswith('.pdf')]
    if not pdfFiles:
        continue
    # A fresh writer for every directory keeps the outputs independent.
    pdfWriter = PyPDF2.PdfFileWriter()
    outputFile = os.path.split(os.path.realpath(root))[0] + ".pdf"
    openInputs = []
    try:
        for file in pdfFiles:
            print("Discovered this pdf:", file)
            # The inputs stay open until the merged file has been written;
            # the writer may still read page data from them at that point.
            pdfInput = open(file, 'rb')
            openInputs.append(pdfInput)
            pdfReader = PyPDF2.PdfFileReader(pdfInput)
            for page in pdfReader.pages:
                pdfWriter.addPage(page)
        with open(outputFile, "wb") as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfInput in openInputs:
            pdfInput.close()
    print("%s files appended to %s" % (len(pdfFiles), outputFile))

Downloading entire directories from ftp server with python3 [duplicate]

This will not download the contents of sub-directories; how can I do so?
import ftplib
import configparser
import os
# Names of the remote sub-directories collected by add_directory().
directories = []


def add_directory(line):
    """Record the directory name carried by one ``LIST`` output line.

    Meant to be used as the callback of
    ``ftp.retrlines('LIST', add_directory)``.  A line is treated as a
    directory entry when it starts with ``d``; the name is taken from the
    ninth whitespace-separated field (assumes the usual ``ls -l`` style
    layout -- confirm for the server in use).  Lines that are not
    directory entries, or that have fewer than nine fields, are ignored.
    """
    if not line.startswith('d'):
        return
    # Split at most 8 times so the ninth field keeps any spaces that
    # belong to the directory name.
    bits = line.split(None, 8)
    if len(bits) < 9:
        return
    directories.append(bits[8])
def makeDir(archiveTo):
    """Create a local folder under *archiveTo* for each collected directory.

    Iterates the module-level ``directories`` list; folders that already
    exist are reported and left untouched.
    """
    for dirname in directories:
        target = os.path.join(archiveTo, dirname)
        if not os.path.isdir(target):
            os.mkdir(target)
            continue
        print("Directory \"" + dirname + "\" already exists!")
def getFiles(archiveTo, ftp):
    """Download every non-directory entry of the FTP working directory.

    :param archiveTo: local folder that receives the files
    :param ftp: a connected ``ftplib.FTP`` instance

    Names present in the module-level ``directories`` list are skipped.
    """
    for filename in ftp.nlst():
        if filename in directories:
            continue
        local_path = os.path.join(archiveTo, filename)
        # The with-block closes the local file even if the transfer fails.
        with open(local_path, 'wb') as local_file:
            ftp.retrbinary('RETR %s' % filename, local_file.write)
def runBackups():
    """Back up the configured folder of every server in ``connections.ini``.

    Each section of the INI file must provide ``uri``, ``username``,
    ``password``, ``backuppath`` and ``archiveto``.  Only the top level of
    ``backuppath`` is downloaded; sub-directories are created locally but
    their contents are not fetched.
    """
    # Load INI (SafeConfigParser was removed in Python 3.12; ConfigParser
    # is its replacement).
    config = configparser.ConfigParser()
    config.read('connections.ini')
    for section in config.sections():
        # Load Settings
        uri = config.get(section, "uri")
        username = config.get(section, "username")
        password = config.get(section, "password")
        backupPath = config.get(section, "backuppath")
        archiveTo = config.get(section, "archiveto")
        # Start Back-ups; the with-block ends the session even on errors.
        with ftplib.FTP(uri) as ftp:
            ftp.login(username, password)
            ftp.cwd(backupPath)
            # Forget the directories collected for the previous server.
            directories.clear()
            # Map Directory Tree
            ftp.retrlines('LIST', add_directory)
            # Make Directories Locally
            makeDir(archiveTo)
            # Gather Files
            getFiles(archiveTo, ftp)
    print()
    print("Back-ups complete.")
    print()
this should do the trick :)
import sys
import ftplib
import os
from ftplib import FTP
ftp = FTP("ftp address")
ftp.login("user", "password")


def downloadFiles(path, destination):
    """Recursively mirror the remote folder *path* below *destination*.

    Both arguments are strings of the form "/dir/folder/something/";
    *path* must be the absolute remote path of the root folder of the tree
    to download.  Uses the module-level ``ftp`` connection and ends the
    program when *path* cannot be entered.
    """
    # Local counterpart of the remote folder: destination without its
    # trailing slash, followed by the absolute remote path.
    local_dir = destination[:-1] + path
    try:
        ftp.cwd(path)
    except ftplib.error_perm:
        # invalid entry (ensure input form: "/dir/folder/something/")
        print("error: could not change to " + path)
        sys.exit("ending session")
    # clone path to destination, creating missing parent folders as well
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
        print(local_dir + " built")
    # list children:
    for file in ftp.nlst():
        try:
            # this will check if file is folder:
            ftp.cwd(path + file + "/")
        except ftplib.error_perm:
            # not a folder with accessible content: download it.
            # The absolute remote name is used because the FTP working
            # directory may have moved while exploring a sub-folder.
            with open(os.path.join(local_dir, file), "wb") as f:
                ftp.retrbinary("RETR " + path + file, f.write)
            print(file + " downloaded")
        else:
            # if so, explore it:
            downloadFiles(path + file + "/", destination)
    return


source = "/ftproot/folder_i_want/"
dest = "/systemroot/where_i_want_it/"
downloadFiles(source, dest)
This is a very old question, but I had a similar need that I wanted to satisfy in a very general manner. I ended up writing my own solution that works very well for me. I've placed it on Gist here https://gist.github.com/Jwely/ad8eb800bacef9e34dd775f9b3aad987
and pasted it below in case I ever take the gist offline.
Example usage:
import ftplib
# mysite, username, password, remote_dir and local_dir are placeholders
# that are not defined in this snippet; supply your own values.
ftp = ftplib.FTP(mysite, username, password)
download_ftp_tree(ftp, remote_dir, local_dir)
The code above will look for a directory called "remote_dir" on the ftp host, and then duplicate the directory and its entire contents into the "local_dir".
It invokes the script below.
import ftplib
import os
def _is_ftp_dir(ftp_handle, name, guess_by_extension=True):
""" simply determines if an item listed on the ftp server is a valid directory or not """
# if the name has a "." in the fourth to last position, its probably a file extension
# this is MUCH faster than trying to set every file to a working directory, and will work 99% of time.
if guess_by_extension is True:
if name[-4] == '.':
return False
original_cwd = ftp_handle.pwd() # remember the current working directory
try:
ftp_handle.cwd(name) # try to set directory to new name
ftp_handle.cwd(original_cwd) # set it back to what it was
return True
except:
return False
def _make_parent_dir(fpath):
""" ensures the parent directory of a filepath exists """
dirname = os.path.dirname(fpath)
while not os.path.exists(dirname):
try:
os.mkdir(dirname)
print("created {0}".format(dirname))
except:
_make_parent_dir(dirname)
def _download_ftp_file(ftp_handle, name, dest, overwrite):
    """ Fetch the remote file *name* into the local path *dest*.

    The parent folder of *dest* is created first.  An existing *dest* is
    kept unless *overwrite* is True.
    """
    _make_parent_dir(dest)
    if os.path.exists(dest) and overwrite is not True:
        print("already exists: {0}".format(dest))
        return
    with open(dest, 'wb') as f:
        ftp_handle.retrbinary("RETR {0}".format(name), f.write)
    print("downloaded: {0}".format(dest))
def _mirror_ftp_dir(ftp_handle, name, overwrite, guess_by_extension):
    """ Replicate a directory on an ftp server recursively.

    Every item listed by ``nlst(name)`` is either descended into (when it
    is a directory) or downloaded to a local path equal to its remote
    name.
    """
    for item in ftp_handle.nlst(name):
        # Forward the caller's choice; otherwise the default is always used.
        if _is_ftp_dir(ftp_handle, item, guess_by_extension):
            _mirror_ftp_dir(ftp_handle, item, overwrite, guess_by_extension)
        else:
            _download_ftp_file(ftp_handle, item, item, overwrite)
def download_ftp_tree(ftp_handle, path, destination, overwrite=False, guess_by_extension=True):
    """
    Downloads an entire directory tree from an ftp server to the local destination

    :param ftp_handle: an authenticated ftplib.FTP instance
    :param path: the folder on the ftp server to download
    :param destination: the local directory to store the copied folder
    :param overwrite: set to True to force re-download of all files, even if they appear to exist already
    :param guess_by_extension: It takes a while to explicitly check if every item is a directory or a file.
        If this flag is set to True, any name ending with a three character extension ".???" is assumed to be
        a file and not a directory. Set to False if some folders may have a "." in the fourth to last position.

    Files are written relative to *destination* by temporarily changing the
    process working directory; the previous working directory is restored
    afterwards, even when the download fails.
    """
    original_directory = os.getcwd()
    os.chdir(destination)
    try:
        _mirror_ftp_dir(ftp_handle, path, overwrite, guess_by_extension)
    finally:
        os.chdir(original_directory)
This is an alternative: you can try using the ftputil package. You can then use it to walk the remote directories and get your files.
Using ftp.mlsd() instead of ftp.nlst():
import sys
import ftplib
import os
from ftplib import FTP
def fetchFiles(ftp, path, destination, overwrite=True):
    '''Fetch a whole folder from ftp. \n
    Parameters
    ----------
    ftp : ftplib.FTP object
    path : string ('/dir/folder/')
    destination : string ('D:/dir/folder/') folder where the files will be saved
    overwrite : bool - Overwrite file if already exists.
    '''
    # Local counterpart of the remote folder.
    local_dir = destination[:-1] + path
    try:
        ftp.cwd(path)
    except ftplib.error_perm:
        # invalid entry (ensure input form: "/dir/folder/")
        print("error: could not change to " + path)
        sys.exit("ending session")
    # makedirs also creates missing parent folders
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)
        print('New folder made: ' + local_dir)
    print('Current folder: ' + path)
    # list children (read the whole listing before sending other commands):
    for name, facts in list(ftp.mlsd()):
        kind = facts.get('type')
        if kind in ('cdir', 'pdir'):
            # entries for the current / parent directory, not real children
            continue
        if kind == 'file':
            fullpath = os.path.join(local_dir, name)
            if not overwrite and os.path.isfile(fullpath):
                continue
            # The absolute remote name is used because recursing into a
            # sub-folder changes the FTP working directory.
            with open(fullpath, 'wb') as f:
                ftp.retrbinary('RETR ' + path + name, f.write)
            print(name + ' downloaded')
        elif kind == 'dir':
            fetchFiles(ftp, path + name + '/', destination, overwrite)
        else:
            print('Unknown type: ' + str(kind))
if __name__ == "__main__":
    # Log in, mirror the remote folder into the local one, end the session.
    source = r'/Folder/'
    dest = r'D:/Data/'
    ftp = FTP('ftp address')
    ftp.login('user', 'password')
    fetchFiles(ftp, source, dest, overwrite=True)
    ftp.quit()
Using ftputil, a fast solution could be:
def download(folder):
    """Copy the remote tree rooted at *folder* to the same relative local paths.

    Relies on a module-level ``ftp`` object that provides ``walk``,
    ``path.join`` and ``download`` (presumably an ``ftputil.FTPHost`` --
    confirm against the caller).
    """
    for remote_dir, subdirs, files in ftp.walk(folder):
        print("Creating dir " + remote_dir)
        # exist_ok lets the download be re-run over an existing tree
        os.makedirs(remote_dir, exist_ok=True)
        for subdir in subdirs:
            print("Subdirs " + subdir)
        for file in files:
            print(r"Copying File {0} \ {1}".format(remote_dir, file))
            ftp.download(ftp.path.join(remote_dir, file), os.path.join(remote_dir, file))
It is non-trivial, at least. In the simplest case, you assume you only have files and directories. This isn't always the case: there are soft links, hard links and Windows-style shortcuts. Soft links and directory shortcuts are particularly problematic since they make recursive directories possible, which would confuse a naively implemented FTP grabber.
How you handle such recursive directories depends on your needs; you might simply not follow soft links, or you might try to detect recursive links. Detecting recursive links is inherently tricky; you cannot do it reliably.

Extract attachments from EML files

I currently use this code to extract attachments from EML files, and I would like to know whether I can link each attachment to its mail (EML file) -- that is, add the EML file name as a prefix to the attachment name,
so that I can tell which mail each attachment belongs to.
Thank you.
import os, re
import email
import argparse
import olefile
def _saveAttachment(payload, eml_files, output_path):
    """Report and write one message part if it carries an attachment name."""
    # get the attachment file name
    filename = payload.get_filename()
    if filename is None:
        return
    try:
        magic = payload.get_payload(decode=True)[:4]
    except TypeError:
        magic = "None"
    # Print the magic header and the filename for reference.
    printIT(eml_files, magic, filename)
    # Prefix the attachment with the name of the EML file it came from so
    # the saved file can be traced back to its mail.
    prefixed = os.path.basename(eml_files) + "--" + os.path.basename(filename)
    # Write the payload out.
    writeFile(prefixed, payload, output_path)


def extractAttachment(msg, eml_files, output_path):
    """Extract the attachments of one parsed message into *output_path*.

    :param msg: message object as returned by the ``email`` parsers
    :param eml_files: path of the EML file *msg* was read from
    :param output_path: folder prefix the attachments are written to

    A string payload longer than two characters is handed to
    ``extractOLEFormat``.  For a multipart payload, every part is examined
    when there are more than two parts, only the second part when there
    are exactly two, and the second sub-part of the only part when there
    is one.  Parts without a file name are skipped.
    """
    body = msg.get_payload()
    if isinstance(body, str):
        # Not a multipart message: try the OLE (Outlook) route instead.
        if len(body) > 2:
            try:
                extractOLEFormat(eml_files, output_path)
            except IOError:
                pass
        return
    if len(body) > 2:
        for payload in body:
            _saveAttachment(payload, eml_files, output_path)
    elif len(body) == 2:
        _saveAttachment(body[1], eml_files, output_path)
    elif len(body) == 1:
        nested = body[0].get_payload()
        # Only a nested multipart with at least two parts has a second part.
        if isinstance(nested, list) and len(nested) > 1:
            _saveAttachment(nested[1], eml_files, output_path)
def extractOLEFormat(eml_files, output_path):
    """Extract the attachments of an OLE container file into *output_path*.

    Every top-level storage whose name starts with ``__attach`` is treated
    as one attachment: its name is read from the ``__substg1.0_3707001F``
    stream and its content from the ``__substg1.0_37010102`` stream.
    """
    data = '__substg1.0_37010102'
    msg = olefile.OleFileIO(eml_files)
    try:
        attachmentDirs = []
        for directories in msg.listdir():
            if directories[0].startswith('__attach') and directories[0] not in attachmentDirs:
                attachmentDirs.append(directories[0])
        for dir in attachmentDirs:
            name_bytes = msg.openstream(dir + '/' + '__substg1.0_3707001F').read()
            # The stream is read as bytes; presumably it holds a UTF-16LE
            # string (the NUL removal suggests so) -- confirm against the
            # file format.  Decode first, then drop any leftover NULs.
            filename = name_bytes.decode('utf-16-le', errors='replace').replace('\000', '')
            payload = msg.openstream(dir + '/' + data).read()
            magic = payload[:4]
            # Print the magic header and the filename for reference.
            printIT(eml_files, magic, filename)
            # Write the payload out.
            writeOLE(filename, payload, output_path)
    finally:
        msg.close()
#filename = str(eml_files)+"--"+str(filename)
def printIT(eml_files, magic, filename):
    """Print the email path, the magic header and the prefixed attachment name."""
    saved_as = str(eml_files) + "--" + str(filename)
    report = 'Email Name: %s\n\tMagic: %s\n\tSaved File as: %s\n' % (eml_files, magic, saved_as)
    print(report)
def writeFile(filename, payload, output_path, eml_files=None):
    """Write the decoded content of one message part into *output_path*.

    :param filename: attachment name; ``None`` means there is nothing to write
    :param payload: message part providing ``get_payload(decode=True)``
    :param output_path: folder the attachment is written to
    :param eml_files: optional path of the source EML file; when given, its
        base name followed by ``--`` is prepended to the attachment name

    Only the last path component of *filename* is used, so a name coming
    from the mail cannot place the file outside *output_path*.  Failures
    are reported on stdout and do not stop the caller.
    """
    if filename is None:
        return
    safe_name = os.path.basename(str(filename).replace('\\', '/'))
    if eml_files is not None:
        safe_name = os.path.basename(str(eml_files)) + "--" + safe_name
    file_location = os.path.join(output_path, safe_name)
    try:
        content = payload.get_payload(decode=True)
        if content is None:
            # A multipart part has no decodable content of its own.
            print('Nothing to write for %s' % file_location)
            return
        with open(file_location, 'wb') as out:
            out.write(content)
    except (TypeError, IOError) as err:
        print('Could not write %s: %s' % (file_location, err))
def writeOLE(filename, payload, output_path):
    """Write the raw bytes *payload* to the file *filename* in *output_path*."""
    # The with-block writes the payload and closes the file.
    with open(os.path.join(output_path, filename), 'wb') as out:
        out.write(payload)
def main():
    """Parse the command line and extract the attachments of every file found.

    Walks the input path recursively, parses each file as an email message
    and hands it to ``extractAttachment``.
    """
    parser = argparse.ArgumentParser(description='Attempt to parse the attachment from EML messages.')
    parser.add_argument('-p', '--path',default='C:\\Users\\hamd\\Desktop\\TEX\\emails' ,help='eml')#Path to EML files
    parser.add_argument('-o', '--out', default='C:\\Users\\hamd\\Desktop\\TEX\\PJ\\eml_files\\',help='pj')#Path to write attachments to.
    args = parser.parse_args()
    if args.path:
        input_path = args.path
    else:
        print ("You need to specify a path to your EML files.")
        raise SystemExit(0)
    if args.out:
        output_path = args.out
    else:
        print ("You need to specify a path to write your attachments to.")
        raise SystemExit(0)
    for root, subdirs, files in os.walk(input_path):
        for file_names in files:
            eml_files = os.path.join(root, file_names)
            # Parse from bytes so the result does not depend on the
            # platform's default text encoding; the with-block closes
            # the file.
            with open(eml_files, 'rb') as eml_handle:
                msg = email.message_from_binary_file(eml_handle)
            extractAttachment(msg, eml_files, output_path)


if __name__ == "__main__":
    main()
I tried to write this as a comment but it was too long. I won't give a full-blown solution, but I'll explain the idea.
A possible solution would be to create a hard link to the extracted attachment, giving the hard link the same name as the EML file. You can append an incremental suffix if you have more than one attachment in the same EML file:
whatever.eml (original email file)
whatever_001.attch (hard link to first extracted attachment)
whatever_002.attch (hard link to second extracted attachment)
...
This way:
you are free to move the extracted attachment anywhere else (but in the same disk, because hard links by definition work only on the same disk)
you can keep a copy of the attachment (the hard link) together with the EML file without consuming disk space
in case the extracted file is deleted you have a backup copy of the attachment (the hard links) without consuming disk space
In Python you can create a hard link simply with:
import os
# existing_target_file and new_link_name are placeholders to be replaced
# with real paths.
os.link(existing_target_file, new_link_name)

Resources