How to parse an eml file and extract meta-data information - python-3.x

I have an eml file with some attachments. I want to read the text content of the eml file and extract meta-data information such as sender, from, cc, bcc, and subject. I also want to download the attachments. With the code below I am only able to extract the text content of the email body.
import email
from email import policy
from email.parser import BytesParser
import glob

file_list = glob.glob('*.eml')  # returns list of .eml files in the current directory
with open(file_list[2], 'rb') as fp:  # select a specific email file from the list
    msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain',)).get_content()
print(text)
There was a module named emaildata, available for Python 2, which did the job.
Extracting Meta-data Information
import email
from emaildata.metadata import MetaData

message = email.message_from_file(open('message.eml'))
extractor = MetaData(message)
data = extractor.to_dict()
print data.keys()
Extracting Attachment Information
import email
from emaildata.attachment import Attachment

message = email.message_from_file(open('message.eml'))
for content, filename, mimetype, message in Attachment.extract(message):
    print filename
    with open(filename, 'w') as stream:
        stream.write(content)
    # If message is not None then it is an instance of email.message.Message
    if message:
        print "The file {0} is a message with attachments.".format(filename)
But this library is now deprecated and of no use. Is there any other library that can extract the meta-data and attachment-related information?

Meta-data information can be accessed using the code below in Python 3.x:
from email import policy
from email.parser import BytesParser

with open(eml_file, 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

print('To:', msg['to'])
print('From:', msg['from'])
print('Subject:', msg['subject'])
The remaining header information can be accessed using msg.keys().
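For example, a minimal sketch that dumps every header present in the parsed message, reusing the msg object from above:

for name, value in msg.items():  # msg.keys() gives just the header names
    print(name, ':', value)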
For downloading attachments from an eml file you can use the code below:
import os
import os.path
from email.parser import Parser

eml_mail = 'your eml file'
output_dir = 'mention the directory where you want the files to be downloaded'

def parse_message(filename):
    with open(filename) as f:
        return Parser().parse(f)

def find_attachments(message):
    """
    Return a tuple of parsed content-disposition dict, message object
    for each attachment found.
    """
    found = []
    for part in message.walk():
        if 'content-disposition' not in part:
            continue
        cdisp = part['content-disposition'].split(';')
        cdisp = [x.strip() for x in cdisp]
        if cdisp[0].lower() != 'attachment':
            continue
        parsed = {}
        for kv in cdisp[1:]:
            key, val = kv.split('=', 1)  # split only on the first '=' in case the value contains one
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            parsed[key] = val
        found.append((parsed, part))
    return found

def run(eml_filename, output_dir):
    msg = parse_message(eml_filename)
    attachments = find_attachments(msg)
    print("Found {0} attachments...".format(len(attachments)))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for cdisp, part in attachments:
        cdisp_filename = os.path.normpath(cdisp['filename'])
        # prevent path traversal via a malicious attachment filename
        if os.path.isabs(cdisp_filename):
            cdisp_filename = os.path.basename(cdisp_filename)
        towrite = os.path.join(output_dir, cdisp_filename)
        print("Writing " + towrite)
        with open(towrite, 'wb') as fp:
            data = part.get_payload(decode=True)
            fp.write(data)

run(eml_mail, output_dir)
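As an alternative sketch (assuming the same eml_mail and output_dir variables): when the message is parsed with policy.default, as in the metadata snippet above, Python 3's EmailMessage can list attachments directly via iter_attachments():

import os
from email import policy
from email.parser import BytesParser

with open(eml_mail, 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

for part in msg.iter_attachments():  # yields each attachment part
    filename = part.get_filename()
    if filename:
        # basename() guards against path tricks in the attachment filename
        with open(os.path.join(output_dir, os.path.basename(filename)), 'wb') as out:
            out.write(part.get_payload(decode=True))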

Have a look at ParsEML: it bulk-extracts attachments from all eml files in a directory (originally from Stephan Hügel). And I used a modified version of MeIOC to easily extract all metadata in JSON format; if you want I can share that too.

Related

How to extract combined email ids with same domain from pdf file using python

I want to extract all email ids from a pdf file. I used the code below and it works fine for the first email id and for separately written addresses.
from io import StringIO
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfpage import PDFPage
import re

pagenums = set()
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)

infile = open("/home/administrator/Desktop/email.pdf", 'rb')
for page in PDFPage.get_pages(infile, pagenums):
    interpreter.process_page(page)
infile.close()
converter.close()
text = output.getvalue()
output.close()

match = re.search(r'[\w\.-]+@[a-z0-9\.-]+', text)
# match = re.search(r'[\w\.-]+@[\w\.-]+', text)
email = match.group(0)
print(email)
But I want to extract all email ids, including the combined format like {abc,xyz}@gmail.com. How can I extract this format as well as all the individual ids?
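One possible approach (a minimal sketch, assuming text holds the extracted PDF text as above): expand the brace-grouped addresses first, then collect the remaining plain ones with re.findall:

import re

# expand brace-grouped ids like {abc,xyz}@gmail.com into abc@gmail.com, xyz@gmail.com
emails = ['{0}@{1}'.format(user.strip(), domain)
          for users, domain in re.findall(r'\{([^}]+)\}@([\w.-]+)', text)
          for user in users.split(',')]
# blank out the braced spans so their tail is not re-matched as a plain address
remaining = re.sub(r'\{[^}]+\}@[\w.-]+', ' ', text)
emails += re.findall(r'[\w.-]+@[\w.-]+', remaining)
print(emails)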

Get text to csv format using python

I am able to get the data from pdf to text, but now I need the data in csv format with the table structure. I tried to get the table structure but it didn't work. I am also able to generate the output as json. Is there a way to get the result into a table csv format? Any inputs? Below is the code I have used.
import boto3
import time

# Document
s3BucketName = "textractanalysisexample"
documentName = "sheet_example.pdf"

def startJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def isJobComplete(jobId):
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    #print("Job status: {}".format(status))
    while status == "IN_PROGRESS":
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        #print("Job status: {}".format(status))
    return status

def getJobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if 'NextToken' in response:
        nextToken = response['NextToken']
    while nextToken:
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        #print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if 'NextToken' in response:
            nextToken = response['NextToken']
    return pages

def lambda_handler(event, context):
    jobId = startJob(s3BucketName, documentName)
    #print("Started job with id: {}".format(jobId))
    if isJobComplete(jobId):
        response = getJobResults(jobId)
        # Print detected text
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "LINE":
                    print(item["Text"])
You can use the csv module to write a csv file like so:
import csv

with open('my_pdf.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split(",") for line in stripped if line)
    with open('my_pdf.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('title', 'intro'))
        writer.writerows(lines)
You can just put in the rows you need; this splits your data into comma-separated values. You can find more information on csv.writer (and the csv module in general) in the Python docs.
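If you need the actual table structure rather than raw LINE blocks, note that Textract's asynchronous analysis API (as opposed to plain text detection) can also return TABLE and CELL blocks; a sketch of starting such a job, reusing s3BucketName and documentName from the code above:

import boto3

client = boto3.client('textract')
# document analysis returns TABLE and CELL blocks in addition to LINE/WORD
response = client.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': s3BucketName, 'Name': documentName}},
    FeatureTypes=['TABLES'])
# poll with client.get_document_analysis(JobId=response['JobId']) just like
# isJobComplete/getJobResults above; each CELL block carries RowIndex and
# ColumnIndex, which map directly onto csv rows and columns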

How to download all attachments of a mail using python IMAP

I need to download all the attachments from a particular mail in outlook.
The code below works fine if there is a single attachment, but when the mail has multiple attachments it only downloads one.
Can anyone help me regarding this? Thanks.
I'm running this on python 3.7.
import imaplib
import email
import os

server = imaplib.IMAP4_SSL('outlook.office365.com', 993)
server.login('Email id', 'Password')
server.select()
typ, data = server.search(None, '(SUBJECT "Subject name")')
mail_ids = data[0]
id_list = mail_ids.split()
for num in data[0].split():
    typ, data = server.fetch(num, '(RFC822)')
    raw_email = data[0][1]
    raw_email_string = raw_email.decode('utf-8')
    email_message = email.message_from_string(raw_email_string)
    for part in email_message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        if bool(fileName):
            filePath = os.path.join('C:\\Users\\lokesing\\', fileName)
            if not os.path.isfile(filePath):
                fp = open(filePath, 'wb')
                fp.write(part.get_payload(decode=True))
                fp.close()
server.logout()
print("Attachment downloaded from mail")
The output should be all attachments downloaded to my system at the defined path.
You may use the imap_tools package: https://pypi.org/project/imap-tools/
from imap_tools import MailBox

# get all attachments for each email from the INBOX folder
with MailBox('imap.mail.com').login('test@mail.com', 'password') as mailbox:
    for msg in mailbox.fetch():
        for att in msg.attachments:
            print(att.filename, att.content_type)
            with open('C:/1/{}'.format(att.filename), 'wb') as f:
                f.write(att.payload)
Here is an example of Downloading Attachments from Outlook Emails using Python.
I used a library called exchangelib.
https://medium.com/@theamazingexposure/accessing-shared-mailbox-using-exchangelib-python-f020e71a96ab
Here is the Code Snippet:
from exchangelib import Credentials, Account, FileAttachment
import os.path

credentials = Credentials('Firstname.Lastname@someenterprise.com', 'Your_Password_Here')
account = Account('shared_mail_box_name@someenterprise.com', credentials=credentials, autodiscover=True)
filtered_items = account.inbox.filter(subject__contains='Data is ready for')
print("Getting latest email...")
for item in account.inbox.filter(subject__contains='Data is ready for').order_by('-datetime_received')[:1]:
    print(item.subject, item.sender, item.datetime_received)
    for attachment in item.attachments:
        if isinstance(attachment, FileAttachment):
            filepath = os.path.join('C:\\path\\to\\your\\directory', attachment.name)  # this part downloads the attachment to the local file system
            with open(filepath, 'wb') as f:
                f.write(attachment.content)
            print('Saved attachment to:', filepath)

Unicode characters to Turkish characters

(Edit: my original question is posted here, but the issue has been resolved and the code below is correct.) I am looking for advice on how to convert Unicode characters to Turkish characters. The following code (posted online) scrapes tweets for an individual user and outputs a csv file, but the Turkish characters come out as Unicode escape sequences, e.g. \xc4. I am using Python 3 on a Mac.
import sys
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
import tweepy  # https://github.com/tweepy/tweepy
import csv
import string

#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

def get_all_tweets(screen_name):
    #Twitter only allows access to a users most recent 3240 tweets with this method
    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    #initialize a list to hold all the tweepy Tweets
    alltweets = []
    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    #save most recent tweets
    alltweets.extend(new_tweets)
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        #print "getting tweets before %s" % (oldest)
        #all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        #save most recent tweets
        alltweets.extend(new_tweets)
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
    #transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]
    #write the csv
    with open('%s_tweets.csv' % screen_name, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "created_at", "text"])
        writer.writerows(outtweets)
    pass

if __name__ == '__main__':
    #pass in the username of the account you want to download
    get_all_tweets("")
The csv module docs recommend you specify the encoding when you open the file (and also that you use newline='' so the csv module can do its own newline handling). Don't encode Unicode strings yourself when writing rows.
import csv

with open('test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'created_at', 'text'])
    writer.writerows([[123, 456, 'Äβç']])

How to convert an mbox to a JSON structure?

I am trying to convert an mbox to a JSON structure suitable for import into MongoDB. I am using the mailbox chapter from Mining the Social Web, Second Edition, but it's not working properly.
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse

MBOX = 'resources/ch06-mailboxes/data/enron.mbox'
OUT_FILE = MBOX + '.json'

def cleanContent(msg):
    # Decode message from "quoted printable" format, but first
    # re-encode, since decodestring will try to do a decode of its own
    msg = quopri.decodestring(msg.encode('utf-8'))
    # Strip out HTML tags, if any are present.
    # Bail on unknown encodings if errors happen in BeautifulSoup.
    try:
        soup = BeautifulSoup(msg)
    except:
        return ''
    return ''.join(soup.findAll(text=True))

# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
    def default(self, o): return list(o)

# The generator itself...
def gen_json_msgs(mb):
    while 1:
        msg = mb.next()
        if msg is None:
            break
        yield jsonifyMessage(msg)

def jsonifyMessage(msg):
    json_msg = {'parts': []}
    for (k, v) in msg.items():
        json_msg[k] = v.decode('utf-8', 'ignore')
    # The To, Cc, and Bcc fields, if present, could have multiple items.
    # Note that not all of these fields are necessarily defined.
    for k in ['To', 'Cc', 'Bcc']:
        if not json_msg.get(k):
            continue
        json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
                                 .replace(' ', '').decode('utf-8', 'ignore').split(',')
    for part in msg.walk():
        json_part = {}
        if part.get_content_maintype() != 'text':
            print >> sys.stderr, "Skipping MIME content in JSONification ({0})".format(part.get_content_maintype())
            continue
        json_part['contentType'] = part.get_content_type()
        content = part.get_payload(decode=False).decode('utf-8', 'ignore')
        json_part['content'] = cleanContent(content)
        json_msg['parts'].append(json_part)
    # Finally, convert date from asctime to milliseconds since epoch using the
    # $date descriptor so it imports "natively" as an ISODate object in MongoDB
    then = parse(json_msg['Date'])
    millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
    json_msg['Date'] = {'$date': millis}
    return json_msg

mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)

# Write each message out as a JSON object on a separate line
# for easy import into MongoDB via mongoimport
f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
    if msg != None:
        f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()
print "All done"
getting error:
80 # for easy import into MongoDB via mongoimport
81
---> 82 f = open(OUT_FILE, 'w')
83 for msg in gen_json_msgs(mbox):
84 if msg != None:
IOError: [Errno 13] Permission denied: 'resources/ch06-mailboxes/data/enron.mbox.json'
The code you mentioned became obsolete in the Third Edition of Mining the Social Web.
I made a workable script that not only converts MBOX to JSON but also extracts the attachments to usable formats.
Link to the repo: https://github.com/PS1607/mbox-to-json
Read the README file for usage instructions.
It seems that your problem is related to user permissions rather than Python. Line 82 tries to open a file in the "data" folder, but permission was denied. You could try executing your script using the sudo command from a terminal:
sudo python3 <your script name>
This should take care of the error you pointed out.
PS: Python 3 uses print as a function; line 88 should read
print('All done')
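For reference, the same conversion is much shorter on Python 3 with the standard mailbox module; a minimal sketch (headers plus text parts only, assuming the same MBOX and OUT_FILE paths):

import json
import mailbox

mbox = mailbox.mbox(MBOX)
with open(OUT_FILE, 'w') as f:
    for msg in mbox:
        doc = dict(msg.items())  # header name -> value
        parts = []
        for part in msg.walk():
            if part.get_content_maintype() == 'text':
                payload = part.get_payload(decode=True)
                if payload:
                    parts.append(payload.decode('utf-8', 'ignore'))
        doc['parts'] = parts
        # one JSON object per line, ready for mongoimport
        f.write(json.dumps(doc) + '\n')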
