How to read a PDF attachment from an email in Python - python-3.x

I am trying to extract a PDF attachment from my email and print it, but the output is unreadable. Is there any way I can extract only the text and numbers from it?
Here's my code:
import imaplib
import email
import os
import io
# Directory intended for saved attachments (not used further in this snippet).
svdir = 'P:\\'

# Connect to Gmail over IMAP/SSL and open the inbox.
mail = imaplib.IMAP4_SSL('imap.gmail.com', 993)
mail.login("example#gmail.com", "examplepassword")
mail.select("Inbox")

# Collect the ids of every message matching the subject search.
typ, msgs = mail.search(None, '(SUBJECT "samplepdf")')
msgs = msgs[0].split()
print(msgs)

for emailid in msgs:
    resp, data = mail.fetch(emailid, "(RFC822)")
    m = email.message_from_bytes(data[0][1])
    # Messages that are not multipart are skipped entirely.
    if m.get_content_maintype() == 'multipart':
        for part in m.walk():
            is_container = part.get_content_maintype() == 'multipart'
            # Only leaf parts carrying a Content-Disposition header are handled.
            if is_container or part.get('Content-Disposition') is None:
                continue
            filename = part.get_filename()
            # Decoded payload bytes of the part; printed as-is.
            fg = part.get_payload(decode=True)
            print(fg)
I'm sorry I can't post the output here: it is really long, and I can't even copy it because it makes my PC lag.
Thanks in advance.

Python doesn't have any built in packages to read PDF files. You'll need to use a PDF parsing library, for example https://pypi.org/project/PyPDF2/.
Hope this helps.

Related

Why are my PDF files not working when downloading them as email attachments?

I'm trying to download PDF attachments in specific emails that I receive using imaplib in Python. The code itself seems to be working fine. All the documents I want are being downloaded just the way I want them to, but all the downloaded PDF documents simply don't work. I can't open them, and they don't seem to work with PdfReader (Python module). They are also larger in file size than they should be. Why is this happening, and how can I fix it?
Here is the code I have:
import imaplib
import base64
import os
import email
# Account credentials and download target (placeholders).
user = 'my email'
passwd = 'my password'
directory = 'directory I want my downloaded files to go to'

# Log in to Gmail over IMAP/SSL and open the inbox.
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(user, passwd)
mail.select('INBOX')

# Find the messages matching the sender and subject filter.
res, data = mail.search(None, 'FROM "email I want to download attachments from" SUBJECT "subject I want"')
mail_ids = data[0]
id_list = mail_ids.split()

for num in id_list:
    res, data = mail.fetch(num, '(RFC822)')
    # NOTE: the raw message bytes are decoded as UTF-8 text before parsing;
    # this is the step the follow-up identifies as corrupting the PDFs.
    email_message = email.message_from_string(data[0][1].decode('utf-8'))
    for part in email_message.walk():
        # Consider only leaf parts that carry a Content-Disposition header.
        if part.get_content_maintype() != 'multipart' and part.get('Content-Disposition') is not None:
            fileName = part.get_filename()
            if fileName:
                filePath = os.path.join(directory, fileName)
                # Files that already exist are left untouched.
                if not os.path.isfile(filePath):
                    fp = open(filePath, 'wb')
                    fp.write(part.get_payload(decode=True))
                    fp.close()
Whenever I try to open any of the PDF files (using Zathura) I get the following messages in my terminal:
error: cannot recognize version marker
warning: trying to repair broken xref
warning: repairing PDF document
warning: name is too long
warning: ...repeated 1376 times...
error: no objects found
error: could not open document
Also, opening the PDF files in a browser does not work either. I tried opening them with Google Chrome and I simply get a message that says:
Error
Failed to load PDF document.
Any help would be greatly appreciated.
I managed to get it working: my PDF documents can now be opened, and they also work with PdfReader. Here is what the working code looks like:
import imaplib
import os
import email
# Account credentials and download target (placeholders).
user = 'email'
passwd = 'password'
directory = 'path to directory I want my downloaded files to go to'

# Log in to Gmail over IMAP/SSL and open the inbox.
mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login(user, passwd)
mail.select('INBOX')

# Find the messages matching the sender and subject filter.
res, data = mail.search(None, 'FROM "email I want to download attachments from" SUBJECT "Subject filter"')
mail_ids = data[0]
id_list = mail_ids.split()

for num in id_list:
    res, data = mail.fetch(num, '(RFC822)')
    # Parse the raw bytes directly; decoding them to text first is what
    # corrupted the binary attachments in the earlier version.
    raw_email = email.message_from_bytes(data[0][1])
    for part in raw_email.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue
        fileName = part.get_filename()
        if not fileName:
            continue
        # The filename comes from the email itself: keep only its last path
        # component so a crafted name such as "../../x" cannot escape
        # `directory`.
        filePath = os.path.join(directory, os.path.basename(fileName))
        if os.path.isfile(filePath):
            # Never overwrite a file that was already downloaded.
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            continue
        # The context manager closes the file even if the write fails.
        with open(filePath, 'wb') as fp:
            fp.write(payload)
It turns out the base64 module was not even needed: part of what broke my PDF files was decoding the raw message bytes as UTF-8 before parsing them.

Python - How can I send email with attachments in Python using gmail?

As you may well know, some mail addresses need Gmail's security restriction on "less secure apps" turned off.
With that restriction turned off, smtplib and attachments work like a charm, but with it left on, nothing works at all.
Then I discovered an API-based way using OAuth 2.0 in the ezgmail module. It sends emails very easily, but the attachments are not attached properly: they seem to have an encoding problem, because the documents do not display correctly.
The code I ended up with is:
import ezgmail
import os, sys, glob
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
# Folder that holds the PDF files to send (placeholder value).
pathFile = '<path>'
# Working directory captured at import time; send_email() changes back to it
# after it has scanned pathFile.
folder = os.getcwd()
def compose():
    """Return the email's subject and body as the list ``[subject, body]``."""
    subject = 'Hello,'
    body = '''Here I put the text I want.'''
    return [subject, body]
def send_email(email_to):
    """Send the composed email to *email_to* with the newest PDF from pathFile.

    The newest ``*.pdf`` (by ctime) in ``pathFile`` is read, wrapped in a
    base64-encoded ``application/octet-stream`` MIME part, and the serialized
    part is handed to ``ezgmail.send`` in the attachment position.
    """
    subject, body = compose()
    ezgmail.EMAIL_ADDRESS = '<my email>'  # sender address

    # Work inside the PDF folder so the glob pattern and ctime lookups resolve.
    os.chdir(pathFile)
    newest_pdf = max(glob.glob('*.pdf'), key=os.path.getctime)

    # This is the part meant to do the encoding of the attachment.
    pdf_handle = open(newest_pdf, 'rb')
    mime_part = MIMEBase('application', 'octet-stream')
    mime_part.set_payload(pdf_handle.read())
    encoders.encode_base64(mime_part)

    # Go back to the directory the script started in before sending.
    os.chdir(folder)
    ezgmail.send(email_to, subject, body, mime_part.as_string())
    print("Mail sent")
# The recipient must be a quoted string; unquoted, everything after "#" is
# parsed as a comment and the call is left unterminated.
send_email('to@example.com')
And the question is: how do I properly attach documents (PDF, Word, Excel) with ezgmail?
The attachment(s) need to be a string (or list of strings) representing the path(s) to the document(s):
def send_email(email_to):
    """Send the composed email to *email_to*, attaching the newest PDF.

    The newest ``*.pdf`` file (by ctime) found in ``pathFile`` is passed to
    ``ezgmail.send`` as a path string, which is the form the attachment
    argument expects.

    The folder is searched by path instead of via ``os.chdir``, so the
    process-wide working directory is never changed (and cannot be left
    pointing at ``pathFile`` if an error occurs midway). A relative
    ``pathFile`` is therefore resolved against the working directory at call
    time.

    Raises:
        ValueError: if ``pathFile`` contains no PDF files.
    """
    subject, body = compose()
    ezgmail.EMAIL_ADDRESS = '<my email>'  # email_from
    # glob.escape keeps characters such as "[" in the folder name from being
    # interpreted as wildcard syntax.
    list_pdfs = glob.glob(os.path.join(glob.escape(pathFile), '*.pdf'))
    if not list_pdfs:
        raise ValueError(f"no PDF files found in {pathFile!r}")
    file1 = max(list_pdfs, key=os.path.getctime)  # select latest file
    ezgmail.send(email_to, subject, body, file1)
    print("Mail sent")
The encoding is a separate issue.
The author says:
EZGmail isn't meant to be comprehensive and do everything the Gmail API lets
you do, it's meant to make the simple things simple
It does attempt to do the MIME conversion, but there is nothing specific for pdf files
the following code needs to be inserted into the EZgmail module in the _createMessageWithAttachments method:
elif sub_type == 'pdf':
mimePart = MIMEBase('application','octet-stream')
mimePart.set_payload((fp).read())
encoders.encode_base64(mimePart)
and you need to import encoders from email

Search for an email and then fetch its content(s) with Python 3+ imaplib module

I'm trying to make something that finds an email by its title and then searches its content for something.
import email, imaplib, re, quopri
# Yahoo account credentials (left blank in the post).
username = ""
password = ""

# Connect over IMAP/SSL and log in.
mail = imaplib.IMAP4_SSL("imap.mail.yahoo.com", port=993)
mail.login(username, password)
mail.list()

# Look up messages by subject and examine only the first hit.
success, emai = mail.search(None, '(SUBJECT "check this out")')
for msg_num in emai[0].split():
    mai, dat = mail.fetch(msg_num, "(RFC822)")
    raw_text = dat[0][1].decode()
    decoded = quopri.decodestring(raw_text)
    email_msg = email.message_from_string(decoded)
    break

# Search the stringified message for the text between the two markers.
link = re.search(r'test1 (.*?) test2', str(email_msg))
print(link)
Obviously my code is messy, but this sample should show everything that's needed to help me. For example, I sent myself an email named "check this out" with "test1 hit test2" as its content. However I've been struggling to figure stuff out, therefore I'm asking you guys for help. How do I search for an email and then fetch its content?
Thank you!
for x in emai[0].split():
    mai, dat = mail.fetch(x, "(RFC822)")
    # fetch() returns the message as bytes, so it must be parsed with
    # message_from_bytes rather than message_from_string.
    email_msg = email.message_from_bytes(dat[0][1])
    # walk() yields the message itself when it is not multipart, and every
    # nested part when it is, so one loop covers both cases.
    for part in email_msg.walk():
        if part.is_multipart():
            # Containers have no content of their own;
            # get_payload(decode=True) returns None for them.
            continue
        raw = part.get_payload(decode=True)
        if raw is None:
            continue
        # The decoded payload is bytes; turn it into text so the str pattern
        # can be applied to it.
        text = raw.decode(part.get_content_charset() or 'utf-8', errors='replace')
        link = re.search(r'test1 (.*?) test2', text)
        print(link)
    # Only the first matching message is examined.
    break

Python3 - Web Scraper w/ BeautifulSoup, Email Price Alert

I'm attempting to set up a script to email me when a phone meets my price and condition threshold on swappa.com. I've gotten to the point where I can email out the latest post that meets my criteria, but I've hit a bit of a wall. I need it to email me only when a new post meets my criteria. Currently, it emails me each time the script is run, whether or not the listing is new.
I'm very new to Python and spent the better part of yesterday and today cobbling together this script, but any insight or information that would help accomplish my goal would be much appreciated!
from bs4 import BeautifulSoup
import re
import requests
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
# Download the listing page (sorted by date) and parse it.
page_url = 'https://swappa.com/buy/google-pixel-2-xl-unlocked?sort=date_min'
page_content = requests.get(page_url)
swappa = BeautifulSoup(page_content.text, 'html.parser')

# The page heading holds the phone name.
phone_name = swappa.h1

# Look up the detail elements of a listing by their CSS class.
condition = swappa.find(class_='condition_label')
color = swappa.find(class_='color_label')
storage = swappa.find(class_='storage_label')
price = swappa.find(class_='price')

# Gather the href of every anchor whose link starts with /listing.
listing_anchors = swappa.findAll('a', attrs={'href': re.compile("^/listing")})
links = [anchor.get('href') for anchor in listing_anchors]

# Summary of the listing: name, condition, color, storage, price and URL.
listing_detail = [
    phone_name.contents[0],
    condition.contents[0],
    color.contents[0],
    storage.contents[0],
    '$' + price.contents[1],
    'https://swappa.com' + links[0],
]
def listing_result():
    """Return ``listing_detail`` if the scraped listing meets the criteria.

    The listing qualifies when its price is at most 420 and its condition is
    ``'Good'`` or ``'Mint'``. Reads the module-level ``price``, ``condition``
    and ``listing_detail`` values.

    Returns:
        ``listing_detail`` when the listing qualifies, otherwise ``None``
        (also when the price text cannot be parsed as a number).
    """
    # Compare numerically: as strings, '1000' <= '420' is True because the
    # comparison is lexicographic.
    try:
        asking_price = float(str(price.contents[1]).replace(',', '').strip())
    except ValueError:
        return None
    if asking_price <= 420 and condition.contents[0] in ('Good', 'Mint'):
        return listing_detail
    return None
# Write the current result to disk, replacing whatever was stored before.
with open("result.txt", "w+") as result_file:
    result_file.write(str(listing_result()))

# Read the stored result back as a single line.
with open('result.txt', 'r') as result_file_read:
    result_data = result_file_read.read().replace('\n', '')

# NOTE: this compares the text that was read back with the file object from
# the first `with` block, not with a previously stored result, so the two
# always differ and the email is sent on every run.
if result_data != result_file:
    # Open an SSL connection to Gmail's SMTP server and authenticate.
    server = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    server.login("redacted", "redacted")

    # Build the message headers.
    msg = MIMEMultipart()
    msg['From'] = 'redacted'
    msg['To'] = 'redacted'
    msg['Subject'] = 'Swappa - New Pixel 2 XL Listing'

    # The body is the stringified listing, wrapped as a MIME text part.
    body = MIMEText(str(listing_result()))
    msg.attach(body)

    server.send_message(msg)
    del msg
My idea was to have the latest listing be sent to a text file, then have the contents of that text file saved to a variable so that I could compare it to the text being created. However, I'm at a loss for how to accomplish that.

How to get the body text of email with imaplib?

I am in python3.4 .
import imaplib
import email
# Gmail credentials (placeholders).
user = "XXXX"
password = "YYYY"

# Connect over IMAP/SSL, log in and open the inbox.
con = imaplib.IMAP4_SSL('imap.gmail.com')
con.login(user, password)
con.list()
con.select("INBOX")

# Fetch message number 1 in full and parse its raw bytes into a Message.
result, data = con.fetch(b'1', '(RFC822)')
raw = email.message_from_bytes(data[0][1])
>>> raw["From"]
'xxxx'
>>> raw["To"]
'python-list#python.org'
>>> raw["Subject"]
'Re:get the min date from a list'
When I run 'print(raw)' there are many lines of the email body,
but I can't get it with raw[TEXT], raw['TEXT'] or raw['BODY'].
How can I get the body text of the email?
You're asking it for a header named TEXT or BODY, and obviously there is no such thing. I think you're mixing up IMAP4 part names (the things you pass in con.fetch) and RFC2822 header names (the things you use in an email.message.Message).
As the email.message documentation explains, a Message consists of headers and a payload. The payload is either a string (for non-multipart messages) or a list of sub-Messages (for multipart). Either way, what you want here is raw.get_payload().
If you want to handle both, you can either first check raw.is_multipart(), or you can check the type returned from get_payload(). Of course you have to decide what you want to do in the case of a multipart message; what counts as "the body" when there are three parts? Do you want the first? The first text/plain? The first text/*? The first text/plain if there is one, the first text/* if not, and the first of anything if even that doesn't exist? Or all of them concatenated together?
Let's assume you just want the first one. To do that:
def get_text(msg):
    """Return the decoded payload of the first leaf part of *msg*.

    For a multipart message the first sub-message is followed repeatedly
    until a non-multipart part is reached; that part's payload is returned
    with its Content-Transfer-Encoding undone (``get_payload(decode=True)``).

    Args:
        msg: an ``email.message.Message``.

    Returns:
        The decoded payload as ``bytes``, or ``None`` when a multipart
        container along the way holds no parts at all.
    """
    while msg.is_multipart():
        parts = msg.get_payload()
        if not parts:
            # An empty container has no "first part" to descend into.
            return None
        msg = parts[0]
    return msg.get_payload(decode=True)
If you want something different, hopefully you can figure out how to do it yourself. (See the get_content_type and/or get_content_maintype methods on Message.)
Following up using Python 3.8 - parses all the parts that have an associated encoding and turns it into a single HTML page
import imaplib
import email
import webbrowser
import tempfile
import webbrowser
def email_to_html(parsed):
    """Concatenate the decoded text of every charset-bearing part of *parsed*.

    ``walk()`` already visits every nested part exactly once, so each leaf is
    decoded here without any extra recursion; recursing into sub-parts on top
    of ``walk()`` would emit every leaf more than once.

    Args:
        parsed: an ``email.message.Message``.

    Returns:
        A single ``str`` holding the decoded payloads, in document order, of
        all non-multipart parts that declare a charset. Parts without a
        charset are skipped.
    """
    all_parts = []
    for part in parsed.walk():
        if part.is_multipart():
            # Containers carry no content of their own.
            continue
        if not (encoding := part.get_content_charset()):
            continue
        raw = part.get_payload(decode=True)
        if raw is None:
            continue
        try:
            all_parts.append(raw.decode(encoding, errors='replace'))
        except LookupError:
            # The declared charset is unknown to Python; fall back to UTF-8
            # rather than aborting the whole conversion.
            all_parts.append(raw.decode('utf-8', errors='replace'))
    return ''.join(all_parts)
# Login
# Connect to Gmail over IMAP/SSL and authenticate.
imap = imaplib.IMAP4_SSL("imap.gmail.com")
result = imap.login("username", "password")

# Open the inbox and ask for the unseen messages only.
status, resp = imap.select('INBOX')
status, response = imap.search(None, '(UNSEEN)')
unread_msg_nums = response[0].split()

email_bodies = []
for idx in unread_msg_nums:
    msg_id = str(int(idx))
    _, msg = imap.fetch(msg_id, "(RFC822)")
    for response in msg:
        # Only tuple entries are processed; their second item holds the raw
        # message bytes.
        if not isinstance(response, tuple):
            continue
        raw_email = response[1]
        parsed = email.message_from_bytes(raw_email)
        email_bodies.append(email_to_html(parsed))
# If you want to view/check the emails in your browser
def display(html):
    """Write *html* to a temporary ``.html`` file and open it in the browser.

    The file is created with ``delete=False`` so that it still exists when
    the browser loads it; it is not removed afterwards.

    Args:
        html: the page content as a ``str``.
    """
    from pathlib import Path  # local import: only needed to build the URL

    # An explicit encoding avoids UnicodeEncodeError on platforms whose
    # default locale encoding cannot represent the email's characters.
    with tempfile.NamedTemporaryFile('w', delete=False, suffix='.html',
                                     encoding='utf-8') as f:
        f.write(html)
    # The file is closed (and therefore flushed) before the browser opens it.
    # as_uri() builds a valid file:// URL on every platform, unlike plain
    # string concatenation with a Windows path.
    webbrowser.open(Path(f.name).as_uri())


# Open every collected email body in the browser.
for body in email_bodies:
    display(body)

Resources