Parsing Attached .MSG Files with Python3 - python-3.x

I'm trying to monitor a phishing inbox that could receive both normal emails (i.e. HTML/text based with potential attachments) as well as emails that have a .MSG file attached to it.
The goal is to have users send emails to phishing#company.com and once I parse out the various links (potentially malicious) as well as attachments (also potentially malicious, I'll perform some analysis on them.
The issue I'm running into is the body of the .msg file that is attached.
With the code below, I'm able to pull the to, from, subject, and all links within the original email. It also pulls down any attachments with the .msg file (i.e. on my test I was able to pull down a PDF within the .msg). However, I cannot get any of the to, from, subject, or body of the .msg file.
When I print it out as raw I get some of it in a very ugly format, but apparently with the multi-parts, I'm doing something wrong to get that piece of information.
I'm fairly new to Python so any help would be greatly appreciated.
import imaplib
import base64
import os
import email
from bs4 import BeautifulSoup
server = 'mail.server.com'
email_user = 'phishing#company.com'
email_pass = 'XXXXXXXXXXXX'
output_dir = '/tmp/attachments/'
body = ""
def get_body(msg):
if msg.is_multipart():
return get_body(msg.get_payload(0))
else:
return msg.get_payload(None, True)
def get_attachments(msg):
for part in msg.walk():
if part.get_content_maintype()=='multipart':
continue
if part.get('Content-Disposition') is None:
continue
fileName = part.get_filename()
if bool(fileName):
filePath = os.path.join(output_dir, fileName)
with open(filePath,'wb') as f:
f.write(part.get_payload(decode=True))
mail = imaplib.IMAP4_SSL(server)
mail.login(email_user, email_pass)
mail.select('INBOX')
result, data = mail.search(None, 'UNSEEN')
mail_ids = data[0]
id_list = mail_ids.split()
print(id_list)
for emailid in id_list:
result, email_data = mail.fetch(emailid, '(RFC822)')
raw_email = email_data[0][1]
raw_email_string = raw_email.decode('utf-8')
email_message = email.message_from_string(raw_email_string)
email_from = str(email.header.make_header(email.header.decode_header(email_message['From'])))
email_to = str(email.header.make_header(email.header.decode_header(email_message['To'])))
subject = str(email.header.make_header(email.header.decode_header(email_message['Subject'])))
print('From: ' + email_from)
print('To: ' + email_to)
print('Subject: ' + subject)
get_attachments(raw_email)
for part in email_message.walk():
body = part.get_payload(0)
content = body.get_payload(decode=True)
soup = BeautifulSoup(content, 'html.parser')
for link in soup.find_all('a'):
print('Link: ' + link.get('href'))
break

I got this working with the following code. I basically had to do multiple for loops within the .msg walk and then only pull out the relevant information within the text/html sections.
for emailid in id_list:
result, data = mail.fetch(emailid, '(RFC822)')
raw = email.message_from_bytes(data[0][1])
get_attachments(raw)
#print(raw)
header_from = mail.fetch(emailid, "(BODY[HEADER.FIELDS (FROM)])")
header_from_str = str(header_from)
mail_from = re.search('From:\s.+<(\S+)>', header_from_str)
header_subject = mail.fetch(emailid, "(BODY[HEADER.FIELDS (SUBJECT)])")
header_subject_str = str(header_subject)
mail_subject = re.search('Subject:\s(.+)\'\)', header_subject_str)
#mail_body = mail.fetch(emailid, "(BODY[TEXT])")
print(mail_from.group(1))
print(mail_subject.group(1))
for part in raw.walk():
if part.get_content_type() == 'message/rfc822':
part_string = str(part)
original_from = re.search('From:\s.+<(\S+)>\n', part_string)
original_to = re.search('To:\s.+<(\S+)>\n', part_string)
original_subject = re.search('Subject:\s(.+)\n', part_string)
print(original_from.group(1))
print(original_to.group(1))
print(original_subject.group(1))
if part.get_content_type() == 'text/html':
content = part.get_payload(decode=True)
#print(content)
soup = BeautifulSoup(content, 'html.parser')
for link in soup.find_all('a'):
print('Link: ' + link.get('href'))

Related

Looping through a file list to email attachments

The section of Python code below is working for me to send a single attachment to an email when FileList = "/Users/jamescook/Documents/MailTest/MC70165.pdf".
with open(FileList, "rb") as attachment:
part = MIMEBase("application", "octet-stream")
part.set_payload(attachment.read())
encoders.encode_base64(part)
part.add_header(
"Content-Disposition",
f"attachment; filename= {FileList}",
)
msg.attach(part)
text = message.as_string()
But I need to sometimes have multiple attachments. Between indenting and where a loop would end, I've been unable to successfully loop through when FileList = '/Users/jamescook/Documents/MailTest/MC70165.pdf', '/Users/jamescook/Documents/MailTest/MT40125.pdf','/Users/jamescook/Documents/MailTest/ReadMe.txt'.

How can I return a string from a Google BigQuery row iterator object?

My task is to write a Python script that can take results from BigQuery and email them out. I've written a code that can successfully send an email, but I am having trouble including the results of the BigQuery script in the actual email. The query results are correct, but the actual object I am returning from the query (results) always returns as a Nonetype.
For example, the email should look like this:
Hello,
You have the following issues that have been "open" for more than 7 days:
-List issues here from bigquery code
Thanks.
The code reads in contacts from a contacts.txt file, and it reads in the email message template from a message.txt file. I tried to make the bigquery object into a string, but it still results in an error.
from google.cloud import bigquery
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from string import Template
def query_emailtest():
client = bigquery.Client(project=("analytics-merch-svcs-thd"))
query_job = client.query("""
select dept, project_name, reset, tier, project_status, IssueStatus, division, store_number, top_category,
DATE_DIFF(CURRENT_DATE(), in_review, DAY) as days_in_review
from `analytics-merch-svcs-thd.MPC.RESET_DETAILS`
where in_review IS NOT NULL
AND IssueStatus = "In Review"
AND DATE_DIFF(CURRENT_DATE(), in_review, DAY) > 7
AND ready_for_execution IS NULL
AND project_status = "Active"
AND program_name <> "Capital"
AND program_name <> "SSI - Capital"
LIMIT 50
""")
results = query_job.result() # Waits for job to complete.
return results #THIS IS A NONETYPE
def get_queryresults(results): #created new method to put query results into a for loop and store it in a variable
for i,row in enumerate(results,1):
bq_data = (i , '. ' + str(row.dept) + " " + row.project_name + ", Reset #: " + str(row.reset) + ", Store #: " + str(row.store_number) + ", " + row.IssueStatus + " for " + str(row.days_in_review)+ " days")
print (bq_data)
def get_contacts(filename):
names = []
emails = []
with open(filename, mode='r', encoding='utf-8') as contacts_file:
for a_contact in contacts_file:
names.append(a_contact.split()[0])
emails.append(a_contact.split()[1])
return names, emails
def read_template(filename):
with open(filename, 'r', encoding='utf-8') as template_file:
template_file_content = template_file.read()
return Template(template_file_content)
names, emails = get_contacts('mycontacts.txt') # read contacts
message_template = read_template('message.txt')
results = query_emailtest()
bq_results = get_queryresults(query_emailtest())
import smtplib
# set up the SMTP server
s = smtplib.SMTP(host='smtp-mail.outlook.com', port=587)
s.starttls()
s.login('email', 'password')
# For each contact, send the email:
for name, email in zip(names, emails):
msg = MIMEMultipart() # create a message
# bq_data = get_queryresults(query_emailtest())
# add in the actual person name to the message template
message = message_template.substitute(PERSON_NAME=name.title())
message = message_template.substitute(QUERY_RESULTS=bq_results) #SUBSTITUTE QUERY RESULTS IN MESSAGE TEMPLATE. This is where I am having trouble because the Row Iterator object results in Nonetype.
# setup the parameters of the message
msg['From']='email'
msg['To']='email'
msg['Subject']="This is TEST"
# body = str(get_queryresults(query_emailtest())) #get query results from method to put into message body
# add in the message body
# body = MIMEText(body)
#msg.attach(body)
msg.attach(MIMEText(message, 'plain'))
# query_emailtest()
# get_queryresults(query_emailtest())
# send the message via the server set up earlier.
s.send_message(msg)
del msg
Message template:
Dear ${PERSON_NAME},
Hope you are doing well. Please find the following alert for Issues that have been "In Review" for greater than 7 days.
${QUERY_RESULTS}
If you would like more information, please visit this link that contains a complete dashboard view of the alert.
ISE Services
The BQ result() function returns a generator, so I think you need to change your return to yield from.
I'm far from a python expert, but the following pared-down code worked for me.
from google.cloud import bigquery
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")
def query_emailtest():
client = bigquery.Client(project=("my_project"))
query_job = client.query("""
select field1, field2 from `my_dataset.my_table` limit 5
""")
results = query_job.result()
yield from results # NOTE THE CHANGE HERE
results = query_emailtest()
for row in results:
print(row.field1, row.field2)

Download one specific Outlook attachment from an email with multiple attachments

Working on a Python 3 win32com.client script that only searches for emails from a specific sender and downloads 1 out of multiple attachments.
The issue I have is that in the instances where an email has two attachments, it tries to download and rename both, which overwrites the first file that I want with the second attachment.
The file attachment has a specific filename but there is another attachment with a similar name.
So far I have:
import win32com.client
import os
mydesktop = os.path.expanduser('~') + '/Desktop/'
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
# Select main Inbox
inbox = outlook.GetDefaultFolder(6)
messages = inbox.Items
sender = 'mysender#domain'
MyDailyfolder = mydesktop + 'My Daily Data/'
try:
for message in messages:
msg_date = message.SentOn.strftime('%Y-%m-%d')
try:
s = message.sender
s = str(s)
if s == sender:
for att in message.Attachments:
if "Dashboard2_dashboard2" in att.FileName: #<---This doesn't work.
outfile_name2 = 'MycustomName' + msg_date + '.csv'
outfile_path2 = MyDailyfolder + outfile_name2
if not os.path.exists(MyDailyfolder): os.makedirs(MyDailyfolder)
# save file
att.SaveASFile(outfile_path2)
print('Saved file:', outfile_name2)
except:
x=1
except:
x=1
The result downloads all of the attached files in an email to a new folder on my desktop but it overwrites each copy. I'm trying to select only the attachment that contains "Dashboard2dashboard2" on it. I think I have to use "for part in msg.walk():" but have never used that command before. Any ideas?
Ah the issue is a typo in my filename search. An extra underscore. Added an extra print in each step to make sure each part is valid.
sender = 'mysender#domain'
MyDailyfolder = mydesktop + 'My Daily Data/'
try:
for message in messages:
msg_date = message.SentOn.strftime('%Y-%m-%d')
try:
s = message.sender
s = str(s)
if s == sender:
print('Sender:' , message.sender)
for att in message.Attachments:
if "dashboard_2.csv" in att.FileName:
outfile_name = msg_date + att.FileName
print ('Match search confirmed')
# Backup test, print filename and watch spelling
print (att.FileName)
# Create a folder and copy/paste attachment there
outfile_path = MyDailyfolder + outfile_name
if not os.path.exists(MyDailyfolder): os.makedirs(MyDailyfolder)
# save file
att.SaveASFile(outfile_path)
except:
pass
except:
pass

Python's Email Message library output not getting accepted by Outlook 365 when i have a named attachments from

I've created a sample function to test sending emails with an attached html file, which i intend to use for reporting on automated test runs in the future (replacing an existing external powershell script). Note that I'm attaching the html file, not using the html as inline text in the body. I'm using our company's mailgun smtp account service to send the email.
I seem to have an issue with Outlook 365 (web hosted - uses the outlook.office.com domain) either rejecting or blocking the sent email, but interestingly the same email is received and accepted by my personal hotmail address (outlook.live.com domain). I've found Outlook 365 blocks or does not accept the email when I attempt to name the file in the email message object. But if I don't name it, it will come through (with a default name of "ATT00001.htm" ).
My code for this is below but they key line seems to be
msg.add_attachment(open_file.read(), maintype='text', subtype='html', filename=filename)
If I drop the filename key it works (but with a default assigned filename) e.g.
msg.add_attachment(open_file.read(), maintype='text', subtype='html')
I have a suspicion there is something in the attachment's header or Content-disposition that Outlook 365 doesn't agree with, but i'm not sure what it is or how to work around.
I'm using the following (Python 3.6.5, on Windows 10 machine, smtplib and email.message seem to be built in)
Here is the code:
import smtplib
from email.message import EmailMessage
import os
def send_mail():
MAILGUN_SMTP_LOGIN = "<my company's mailgun login>"
MAILGUN_SMTP_PASSWORD = "<my company's mailgun password>"
fromaddr = "muppet#sharklasers.com" # the from address seems to be inconsequential
toaddr = ['me#mycompanysdomainusingoffice365.com.au', 'me#hotmail.com']
msg = EmailMessage()
msg.preamble = 'This is preamble. Not sure where it should show in the email'
msg['From'] = fromaddr
msg['To'] = ', '.join(toaddr)
msg['Subject'] = 'Testing attached html results send'
msg.set_content(""" This is a test of attached html """)
filename = 'api_automatedtests_20180903_1341.html'
filepath = os.path.abspath('D:/work/temp/api_automatedtests_20180903_1341.html')
open_file = open(filepath, "rb")
# msg.make_mixed()
msg.add_attachment(open_file.read(), maintype='text', subtype='html', filename=filename)
# msg.add_attachment(open_file.read(), maintype='text', subtype='html')
server = smtplib.SMTP(host="smtp.mailgun.org", port=587)
server.ehlo()
server.starttls()
server.login(MAILGUN_SMTP_LOGIN, MAILGUN_SMTP_PASSWORD)
server.set_debuglevel(1)
server.send_message(msg)
server.quit()
if __name__ == "__main__":
send_mail()
What I've tried
Tried sending with the same code using a textfile (with appropriate types). e.g.
msg.add_attachment(open_file.read(), maintype='text', subtype='plain', filename=filename)
Result: This works as expected (comes through with the given name - the filename is a string variable e.g. testfile.txt)
adding msg.make_mixed() to make sure it is identified as a multipart message. Result: No effect
Turning on the smtp debug level 1, Result: Mailgun says that everything has worked fine (and the messages do appear as expected in my hotmail account)
Not using the filename key in the msg.add_attachment call.
Result: This works the attachment comes through at ATT00001.htm
Interestingly the default name is *.htm while the filename I'm trying to use is *.html
Tried using a filename with *.htm and a subtype of 'htm' (instead of html)
Result: Same as for html (received on hotmail but not on outlook 365)
Tried using the generic types of maintype=''application', subtype='octet-stream'.
e.g. msg.add_attachment(open_file.read(), maintype='application', subtype='octet-stream', filename=filename)
Result: Same as for html (received on hotmail but not on outlook 365)
Tried using mimetypes.guess as shown in this link
https://docs.python.org/3.6/library/email.examples.html
ctype, encoding = mimetypes.guess_type(path)
if ctype is None or encoding is not None:
# No guess could be made, or the file is encoded (compressed), so
# use a generic bag-of-bits type.
ctype = 'application/octet-stream'
maintype, subtype = ctype.split('/', 1)
with open(path, 'rb') as fp:
msg.add_attachment(fp.read(),
maintype=maintype,
subtype=subtype,
filename=filename)
Result: It's determined as maintype='text', subtype='html' and I get the same result as with my original code (ie arrives in hotmail but blocked by 365).
Checking my spam and clutter folders - was not there
Any suggestions on why the use of filename would be breaking it?
Update
After sending to a other email addresses with various providers I discovered:
1) muppet#sharklasers.com was not a trusted sender (can change this)
2) I discovered the attachment was being flagged as unsafe. The html file comes from pytest's html report with the single file option. It contains javascript for row expanders. Gmail warns the attachment may not be safe (office 365 just straight out blocks the email altogether).
Not sure how to work around 2). I can email the same file to myself between outlook 365 and gmail and vice versa and the file doesn't get blocked. It only get's blocked when I use the above script using python's libraries and Mailgun SMTP. I suspect there is something I need to change in the email header to get around this. But I don't know what.
There seems to be some connection between trying to add the filename and the attachment being marked as unsafe
Okay I figured it out. The problem was the content-type needed to include "name=filename" in it's value.
Also I needed to use maintype='multipart', subtype='mixed'.
I have 2 solutions.
solution 1
import smtplib
from email.message import EmailMessage
import os
def send_mail(body_text, fromaddr, recipient_list, smtp_login, smtp_pass, file_path):
msg = EmailMessage()
msg.preamble = 'This is preamble. Not sure where it should show'
msg['From'] = fromaddr
msg['To'] = ', '.join(recipient_list)
msg['Subject'] = 'API Testing results'
msg.set_content(body_text)
filename = os.path.basename(file_path)
open_file = open(file_path, "rb")
msg.add_attachment(open_file.read(), maintype='multipart', subtype='mixed; name=%s' % filename, filename=filename)
server = smtplib.SMTP(host="smtp.mailgun.org", port=587)
server.ehlo()
server.starttls()
server.login(smtp_login, smtp_pass)
server.send_message(msg)
server.quit()
if __name__ == "__main__":
smtp_login = "<my smtp login>"
smtp_pass = "<my smtp password>"
recipient_list = ['user1#mycompany.com.au', 'user2#mycompany.com.au']
file_path = os.path.abspath('D:/work/temp/api_automatedtests_20180903_1341.html')
body_text = "test results for 03/09/2018 "
fromaddr = 'autotesting#mycompany.com.au'
send_mail(body_text=body_text, recipient_list=recipient_list, smtp_login=smtp_login, smtp_pass=smtp_pass,
file_path=file_path)
solution 2 (according to the documentation using the email.mime libraries is a legacy solution and the EmailMessage method is supposed to be used in preference.
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
import os
def send_mail(body_text, fromaddr, recipient_list, smtp_login, smtp_pass, file_path):
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['To'] = ', '.join(recipient_list)
msg['Subject'] = "Sending API test results"
msg.attach(MIMEText(body_text, 'plain'))
filename = os.path.basename(file_path)
attachment = open(file_path, "rb")
part = MIMEBase('multipart', 'mixed; name=%s' % filename)
part.set_payload(attachment.read())
encoders.encode_base64(part)
part.add_header('Content-Disposition', "attachment; filename= %s" % filename)
msg.attach(part)
server = smtplib.SMTP(host="smtp.mailgun.org", port=587)
server.starttls()
server.login(smtp_login, smtp_pass)
text = msg.as_string()
server.set_debuglevel(1)
server.sendmail(fromaddr, recipient_list, text)
server.quit()
if __name__ == '__main__':
smtp_login = "<my smtp login>"
smtp_pass = "<my smtp password>"
recipient_list = ['user1#mycompany.com.au', 'user2#mycompany.com.au']
file_path = os.path.abspath('D:/work/temp/api_automatedtests_20180903_1341.html')
body_text = " Api test results for 03/09/2018 "
fromaddr = "autotest#mycompany.com.au"
send_mail(body_text=body_text, fromaddr=fromaddr, recipient_list=recipient_list, smtp_login=smtp_login, smtp_pass=smtp_pass,
file_path=file_path)

Python 3 - How to extract only email body

I have this code to extract email body, but the output show the message and some encrypted information. I need help to get only the message.
Last version i've try the lib imaplib, but i don´t have sucess because all message it´s encrypted, so i change to poplib.
As future updates i want to add Subject, date and sender
#!/usr/bin/env python3
# -- coding: utf-8 --
import email
import poplib
login = input('Email: ')
password = input('Password: ')
pop_server = 'pop-mail.outlook.com'
pop_port = 995
mail_box = poplib.POP3_SSL(pop_server, pop_port)
mail_box.user(login)
mailbox.pass_(password)
numMessages = len(mail_box.list()[1])
if numMessages > 15:
numMessages = 15
for i in range(15):
(server_msg, body, octets) = mail_box.retr(i+1)
for j in body:
try:
msg = email.message_from_string(j.decode("utf-8"))
strtext = msg.get_payload()
print(strtext)
except:
pass
If a is the raw email string:
msg = email.message_from_string(a)
if msg.is_multipart():
for part in msg.walk():
payload = part.get_payload(decode=True) #returns a bytes object
strtext = payload.decode() #utf-8 is default
print(strtext)
else:
payload = msg.get_payload(decode=True)
strtext = payload.decode()
print(strtext)

Resources