Unable to return email message body - python-3.x

I have created a class to read emails and convert them to a dataframe. This works for all HEADER data, but I am unable to parse the message content and have tried numerous methods. I am following a tutorial from here: http://beneathdata.com/how-to/email-behavior-analysis/
I have tried amending the fetch_and_parse function in the code to select the message content, but nothing seems to be returned. I have also tried amending the FETCH query, but I'm lost.
from imaplib import IMAP4_SSL
import email as em
from email.utils import parsedate, parsedate_tz
from email.parser import HeaderParser

class OutlookAccount(object):
    def __init__(self, username=None, password=None, folder=None):
        self.username = username
        self.password = password
        self.folder = folder

    def login(self):
        self.conn = IMAP4_SSL('outlook.office365.com')
        response = self.conn.login(self.username, self.password)
        return response

    def search(self, query, folder=None, readonly=False):
        ff = self.folder if self.folder else folder
        self.conn.select(ff, readonly)
        resp, data = self.conn.search(None, query)
        return data

    def fetch(self, uids, query):
        uid_arr = b','.join(uids[0].split())
        resp, data = self.conn.fetch(uid_arr, query)
        return data

    def fetch_and_parse(self, uids, query):
        data = self.fetch(uids, query)
        parser = HeaderParser()
        emails = []
        for email in data:
            if len(email) < 2:
                continue
            msg = em.message_from_bytes(email[1]).as_string()
            emails.append(parser.parsestr(msg))
        return emails

    def load_parse_query(self, search_query, fetch_query, folder=None, readonly=False):
        '''Perform search and fetch on an imap Gmail account. After fetching relevant info
        from fetch query, parse into a dict-like email object, return list of emails.'''
        uids = self.search(search_query, folder, readonly)
        return self.fetch_and_parse(uids, fetch_query)
import numpy as np
import pandas as pd
import getpass
#import matplotlib.pyplot as plt
#import matplotlib.dates as dates
#import matplotlib.gridspec as gridspec
from datetime import timedelta, datetime, date

imap_password = getpass.getpass()
outlook = OutlookAccount(username='some#email.com', password=imap_password)
outlook.login()

daysback = 6000  # ~10yrs...make this whatever ya like
notsince = 0     # since now.
since = (date.today() - timedelta(daysback)).strftime("%d-%b-%Y")
before = (date.today() - timedelta(notsince)).strftime("%d-%b-%Y")

SEARCH = '(SENTSINCE {si} SENTBEFORE {bf})'.format(si=since, bf=before)
ALL_HEADERS = '(BODY.PEEK[HEADER])'

# Search and fetch emails!
received = outlook.load_parse_query(search_query=SEARCH,
                                    fetch_query=ALL_HEADERS,
                                    folder='"INBOX"')

# create function to convert to dataframe
def scrub_email(headers):
    # IMAP sometimes returns fields with varying capitalization. Lowercase each header name.
    return dict([(title.lower(), value) for title, value in headers])

df = pd.DataFrame([scrub_email(email._headers) for email in received])
I want the dataframe to include all header data plus a field that contains the email message content/body.

The body needed to be selected as part of fetch_and_parse using something like:
body = ''  # accumulate the decoded text of each part; mime_msg is the parsed message object
if mime_msg.is_multipart():
    for part in mime_msg.walk():
        if part.is_multipart():
            for subpart in part.get_payload():
                if subpart.is_multipart():
                    for subsubpart in subpart.get_payload():
                        body = body + str(subsubpart.get_payload(decode=True)) + '\n'
                else:
                    body = body + str(subpart.get_payload(decode=True)) + '\n'
        else:
            body = body + str(part.get_payload(decode=True)) + '\n'
else:
    body = body + str(mime_msg.get_payload(decode=True)) + '\n'
body = bytes(body, 'utf-8').decode('unicode-escape')
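For completeness, here is a rough sketch of how that extraction could be folded back into fetch_and_parse so the dataframe gets a body column alongside the headers. It assumes the fetch query is changed to pull full messages (for example '(RFC822)' instead of '(BODY.PEEK[HEADER])'); the get_body helper and the 'body' key are illustrative, not part of the original tutorial:

def get_body(mime_msg):
    # Concatenate the decoded text/plain parts of a parsed email.message.Message.
    body = ''
    if mime_msg.is_multipart():
        for part in mime_msg.walk():
            if part.get_content_type() == 'text/plain':
                payload = part.get_payload(decode=True)
                if payload:
                    body += payload.decode('utf-8', errors='replace') + '\n'
    else:
        payload = mime_msg.get_payload(decode=True)
        if payload:
            body = payload.decode('utf-8', errors='replace')
    return body

def fetch_and_parse(self, uids, query):
    data = self.fetch(uids, query)
    emails = []
    for item in data:
        if len(item) < 2:
            continue
        mime_msg = em.message_from_bytes(item[1])
        record = {title.lower(): value for title, value in mime_msg.items()}
        record['body'] = get_body(mime_msg)
        emails.append(record)
    return emails

With records shaped like this, the dataframe step becomes simply df = pd.DataFrame(received), and scrub_email is no longer needed.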

Related

Trying to retrieve video IDs from a channel with the YouTube Data API and their comments

I am trying to take the list of video IDs and extract the comments for those IDs into a list. I am having trouble figuring out a way to loop through all of the video IDs (I have been able to get one video, but it stops there).
Currently, I get an error saying that it can't find the videoId (it is not being passed).
import os
import googleapiclient.discovery
import sys
import csv
import re
import json
import pandas as pd
from datetime import datetime
from datetime import timedelta

api_service_name = "youtube"
api_version = "v3"
DEVELOPER_KEY = "CENSORED"

youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey=DEVELOPER_KEY)

# Calculating timestamp for 2 weeks before this script is run
now = datetime.now()
weeks_calculated = now + timedelta(weeks=-2)

# Passing parameters to API to retrieve video ids for the past 2 weeks
def get_id(youtube, channelId):
    response = youtube.search().list(
        part="snippet",
        type='video',
        channelId=channelId,
        publishedAfter='{}'.format(weeks_calculated.isoformat("T") + "Z")
        #publishedAfter="2021-08-10T00:00:00Z"
        #order="time",
        #pageToken=pageToken
        #maxResults=maxResults
    ).execute()
    return response

# Passing parameters to API to retrieve comments by video id
def get_comments(youtube, videoId):
    response = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=videoId,
        order="time",
        #pageToken=pageToken,
        textFormat="plainText",
        moderationStatus="published",
        #maxResults=maxResults
    ).execute()
    return response

comment_text = []
video_id = []

def get_channel_data():
    channels = [{"id": "UCQlVOYJyQp64rA12ac0mv6g"}]
    for channel in channels:
        video_data = get_id(youtube, channel['id'])
        for i in video_data['items']:
            videoData = i['id']['videoId']
            video_id.append(videoData)
    return video_id

def get_comment_data():
    videoId = get_channel_data()
    videoId = videoId
    response = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=videoId,
        order="time",
        textFormat="plainText",
        moderationStatus="published"
        #maxResults=maxResults
    ).execute()
    while response:
        for videoId[0] in response:
            for item in response['items']:
                original = item['snippet']['topLevelComment']['snippet']['textOriginal']
                comment_text.append(original)
        return comment_text

get_comment_data()
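No answer was posted for this one, but the usual fix is to call the comments endpoint once per video ID instead of passing the whole list at once. A minimal sketch of that loop, reusing the get_comments helper above; the HttpError handling is an assumption (some videos have comments disabled):

from googleapiclient.errors import HttpError

def get_comment_data():
    all_comments = []
    for vid in get_channel_data():                  # list of video IDs gathered above
        try:
            response = get_comments(youtube, vid)   # one API call per video ID
        except HttpError as err:
            print('Skipping {}: {}'.format(vid, err))
            continue
        for item in response.get('items', []):
            original = item['snippet']['topLevelComment']['snippet']['textOriginal']
            all_comments.append({'videoId': vid, 'comment': original})
    return all_comments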

Get text to CSV format using Python

I am able to get the data from the PDF as text, but now I need to get it in CSV format with the table structure.
I tried to get the table structure but it didn't work. I'm also able to generate the output as JSON.
Is there a way to get the result into a table/CSV format? Any inputs?
Below is the code I have used.
import boto3
import time

# Document
s3BucketName = "textractanalysisexample"
documentName = "sheet_example.pdf"

def startJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def isJobComplete(jobId):
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    #print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        #print("Job status: {}".format(status))
    return status

def getJobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    while(nextToken):
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        #print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    return pages

def lambda_handler(event, context):
    jobId = startJob(s3BucketName, documentName)
    #print("Started job with id: {}".format(jobId))
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
        # Print detected text
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "LINE":
                    print(item["Text"])
You can use the csv module to write to a CSV file like so:
import csv

with open('my_pdf.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split(",") for line in stripped if line)
    with open('my_pdf.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('title', 'intro'))
        writer.writerows(lines)
You can just put in the rows you need, and this splits your data into comma-separated values. You can see more information on csv.writer (and the csv module in general) in the Python docs.
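If the goal is to get the Textract output itself into a CSV rather than re-splitting a text file, one option is to write the LINE blocks returned by getJobResults directly with csv.writer. A rough sketch, assuming one detected line per row; note this does not recover the original table's columns, which would need Textract's document-analysis API with the TABLES feature:

import csv

def blocks_to_csv(pages, out_path='textract_lines.csv'):
    # Write each detected LINE of text as one CSV row.
    with open(out_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['text'])
        for page in pages:
            for block in page['Blocks']:
                if block['BlockType'] == 'LINE':
                    writer.writerow([block['Text']])

# e.g. inside lambda_handler, after the job completes:
# blocks_to_csv(getJobResults(jobId))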

Getting Interactive Brokers API into Pandas

New to Python and the IB API and stuck on this simple thing. This application works correctly and prints the IB server reply. However, I cannot figure out how to get this data into a pandas dataframe or any other variable, for that matter. How do you "get the data out"? Thanks!
Nothing on forums, in documentation, or on YouTube that I can find has a useful example. I think the answer must be to return accountSummary to a pd.Series, but I have no idea how.
Expected output would be a data series or variable that can be manipulated outside of the application.
from ibapi import wrapper
from ibapi.client import EClient
from ibapi.utils import iswrapper  # just for decorator
from ibapi.common import *
import pandas as pd

class TestApp(wrapper.EWrapper, EClient):
    def __init__(self):
        wrapper.EWrapper.__init__(self)
        EClient.__init__(self, wrapper=self)

    @iswrapper
    def nextValidId(self, orderId: int):
        print("setting nextValidOrderId: %d", orderId)
        self.nextValidOrderId = orderId
        # here is where you start using api
        self.reqAccountSummary(9002, "All", "$LEDGER")

    @iswrapper
    def error(self, reqId: TickerId, errorCode: int, errorString: str):
        print("Error. Id: ", reqId, " Code: ", errorCode, " Msg: ", errorString)

    @iswrapper
    def accountSummary(self, reqId: int, account: str, tag: str, value: str, currency: str):
        print("Acct Summary. ReqId:", reqId, "Acct:", account,
              "Tag: ", tag, "Value:", value, "Currency:", currency)
        # IB API data returns here, how to pass it to a variable or pd.series

    @iswrapper
    def accountSummaryEnd(self, reqId: int):
        print("AccountSummaryEnd. Req Id: ", reqId)
        # now we can disconnect
        self.disconnect()

def main():
    app = TestApp()
    app.connect("127.0.0.1", 4001, clientId=123)
    test = app.accountSummary
    app.run()

if __name__ == "__main__":
    main()
I had the same problem and collections did it for me. Here is my code for CFD data; maybe it will help somebody. You will have your data in app.df. Any suggestions for improvement are more than welcome.
import collections
import datetime as dt
from threading import Timer
from ibapi.client import EClient
from ibapi.wrapper import EWrapper
from ibapi.contract import Contract
import pandas as pd

# get yesterday and put it to correct format yyyymmdd{space}{space}hh:mm:dd
yesterday = str(dt.datetime.today() - dt.timedelta(1))
yesterday = yesterday.replace('-', '')

IP = '127.0.0.1'
PORT = 7497

class App(EClient, EWrapper):
    def __init__(self):
        super().__init__(self)
        self.data = collections.defaultdict(list)

    def error(self, reqId, errorCode, errorString):
        print(f'Error {reqId}, {errorCode}, {errorString}')

    def historicalData(self, reqId, bar):
        self.data['date'].append(bar.date)
        self.data['open'].append(bar.open)
        self.data['high'].append(bar.high)
        self.data['low'].append(bar.low)
        self.data['close'].append(bar.close)
        self.data['volume'].append(bar.volume)
        self.df = pd.DataFrame.from_dict(self.data)

    def stop(self):
        self.done = True
        self.disconnect()

# create App object
app = App()
print('App created...')
app.connect(IP, PORT, 0)
print('App connected...')

# create contract
contract = Contract()
contract.symbol = 'IBDE30'
contract.secType = 'CFD'
contract.exchange = 'SMART'
contract.currency = 'EUR'
print('Contract created...')

# request historical data for contract
app.reqHistoricalData(reqId=1,
                      contract=contract,
                      endDateTime=yesterday,
                      durationStr='1 W',
                      barSizeSetting='15 mins',
                      whatToShow='ASK',
                      useRTH=0,
                      formatDate=1,
                      keepUpToDate=False,
                      chartOptions=[])

Timer(4, app.stop).start()
app.run()
I'd store the data to a dictionary, create a dataframe from the dictionary, and append the new dataframe to the main dataframe using the concat function. Here's an example:
def accountSummary(self, reqId: int, account: str, tag: str, value: str, currency: str):
    acct_dict = {"account": account, "value": value, "currency": currency}
    acct_df = pd.DataFrame([acct_dict], columns=acct_dict.keys())
    main_df = pd.concat([main_df, acct_df], axis=0).reset_index()
For more information, you might like Algorithmic Trading with Interactive Brokers
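As written above, main_df has to exist before the callback fires. A minimal sketch of how the same idea might be wired into the TestApp class from the question, keeping the frame on the instance (the attribute name acct_df is illustrative, not part of the IB API):

class TestApp(wrapper.EWrapper, EClient):
    def __init__(self):
        wrapper.EWrapper.__init__(self)
        EClient.__init__(self, wrapper=self)
        # Start with an empty frame; rows are appended as callbacks arrive.
        self.acct_df = pd.DataFrame(columns=["account", "tag", "value", "currency"])

    def accountSummary(self, reqId: int, account: str, tag: str, value: str, currency: str):
        row = pd.DataFrame([{"account": account, "tag": tag,
                             "value": value, "currency": currency}])
        self.acct_df = pd.concat([self.acct_df, row], ignore_index=True)

    def accountSummaryEnd(self, reqId: int):
        self.disconnect()   # app.run() returns here; read app.acct_df afterwards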

How can I make the PhantomJS webdriver wait until a specific HTML element is loaded and then return the page.source?

I have developed the code below for a web crawling object.
It takes two dates as inputs, creates a list of dates between those two dates, and attaches each one to a webpage URL containing weather information for a location. It then converts the HTML tables of data into a dataframe and stores the data as a CSV file (the base link is https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3 and, as you can see, the date in this example is 2019-1-3):
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens link and visits page and returns html source code of page
        self.driver.get(link)
        html = self.driver.page_source
        return html

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
This is the way I want to use the WebCrawler object:
date1 = date(2018, 12, 29)
date2 = date(2019, 1, 1)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('PROCESSING : ', day)
    link = crawler.create_link(day)
    print('WAITING... ')
    time.sleep(3)
    print('VISIT WEBPAGE ... ')
    html = crawler.open_link(link)
    print('DATA RETRIEVED ... ')
    df = crawler.table_to_df(html)
    print(df.head(3))
    crawler.to_csv(day, df)
    print('DATA SAVED ...')
The problem is that the first iteration of the loop runs perfectly, but the second one stops with an error saying No tables were found (it occurs in the table = soup.find("table",{"class":"tablesaw-sortable"}) line). That's because the page source is returned by WebCrawler.open_link before the webpage has fully loaded its contents, including the table containing the weather information. There is also a chance that the website rejects the request because it is making the servers too busy.
Is there any way to build a loop that keeps trying to open the link until it can find the table, or that at least waits until the table has loaded and then returns it?
You can have Selenium wait for a specific element. In your case it will be the table with the class name "tablesaw-sortable". I highly recommend that you use CSS selectors to find this element, as it's fast and less error-prone than getting all table elements.
Here is the CSS selector, premade for you: table.tablesaw-sortable. Set Selenium to wait until that element has loaded.
Source: https://stackoverflow.com/a/26567563/4159473
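A rough sketch of what that wait might look like inside open_link, using the CSS selector above (the 10-second timeout is arbitrary):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def open_link(self, link):
    self.driver.get(link)
    # Block until the weather table is present, or raise TimeoutException.
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'table.tablesaw-sortable')))
    return self.driver.page_source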
I rewrote the code using the https://stackoverflow.com/a/26567563/4159473 solution suggested by @mildmelon, and I also added some delays between sending each request to the server and asking for the page source:
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.delay_for_page = 7
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens link and visits page and returns html source code of page
        self.driver.get(link)
        myElem = WebDriverWait(self.driver, self.delay_for_page)\
            .until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable')))

    def table_to_df(self, html):
        # Finds table of weather data and converts it into pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
date1 = date(2019, 2, 1)
date2 = date(2019, 3, 5)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('DATE : ', day)
    link = crawler.create_link(day)
    print('WAITING ....')
    print('')
    time.sleep(12)
    print('OPENING LINK ... ')
    try:
        crawler.open_link(link)
        html = crawler.driver.page_source
        print("DATA IS FETCHED")
        df = crawler.table_to_df(html)
        print(df.head(3))
        crawler.to_csv(day, df)
        print('DATA SAVED ...')
    except TimeoutException:
        print("NOT FETCHED ...!!!")
The weather information is now fetched without problems. I guess the delays between requests resulted in better performance. The line myElem = WebDriverWait(self.driver, self.delay_for_page).until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable'))) has also improved speed.
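The original question also asked for a loop that keeps retrying until the table appears. A sketch of how that could wrap open_link, assuming a few retries with a pause are acceptable (the retry count and pause are arbitrary):

def open_link_with_retries(crawler, link, retries=3, pause=10):
    # Try the page several times; give up only after repeated timeouts.
    for attempt in range(retries):
        try:
            crawler.open_link(link)              # waits for the table internally
            return crawler.driver.page_source
        except TimeoutException:
            print('Attempt {} failed, retrying in {}s ...'.format(attempt + 1, pause))
            time.sleep(pause)
    return None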

PYTHON - How can I check that I'm calling this API correctly?

Not really sure what's going wrong with my code. I'm trying to call the openrates.io API and pass in two variables, the date and the base currency, both of which are accepted parameters; they change the URL used when calling the API, e.g. the desired URL is:
http://api.openrates.io/2018-11-22?base=EUR
I then try to pass the result of that URL, which is JSON output, into a class and then extract individual key:value pairs from other functions.
This is my code so far:
import datetime
import requests
import sys
import json

class APIError(Exception):
    """
    APIError represents an error returned by the OpenRates API.
    """
    pass

class Currency(object):
    """
    Currency object will store all currency related data such as present rates and historical rates
    """
    # def __init__(self, today_rate, yesterday_rate, lastweek_rate):
    def __init__(self, exchange_rates):
        # Store the raw data from the exchange rate API for today's rates - this will be used in the average rate calculations
        self.currency_data = exchange_rates

class Time(object):
    """
    Here are all the time related elements we will need for this program to work
    """
    def __init__(self):
        # Call the API here for use in the Currency object later
        self.url = "http://api.openrates.io/"

    def call_api(self, **kwargs):
        """
        Openrates API takes in Base Currency, Dates and Destination Currency as parameters, individually, however,
        not altogether
        """
        # Call the API by requesting the URL. Use `json()` to decode the raw JSON data.
        response_data = requests.get(self.url, kwargs).json()
        # Check for an error and throw an exception if needed.
        if "Error" in response_data:
            raise APIError(response_data["Error"])
        # Return the decoded data.
        return response_data

    def get_today(self, base_curr):
        # Call today's exchange rate API
        # date = datetime.date.today()
        # today_rate = self.call_api(str(date), base=base_curr)
        # today_rate = self.call_api(str(date))
        self.url = "http://api.openrates.io/latest"
        today_rate = self.call_api(base=base_curr)
        return Currency(today_rate)

    def get_yesterday(self, base_curr):
        # Call yesterday's exchange rate API
        date = datetime.date.today() - datetime.timedelta(days=1)
        self.url = "http://api.openrates.io/" + str(date)
        yesterday_rate = self.call_api(base=base_curr)
        # print(yesterday_rate)
        # yesterday_rate = self.call_api(str(date),
        return Currency(yesterday_rate)

    def get_last_week(self, base_curr):
        # Return last week's datetime
        date = datetime.date.today() - datetime.timedelta(days=7)
        self.url = "http://api.openrates.io/" + str(date)
        lastweek_rate = self.call_api(base=base_curr)
        return Currency(lastweek_rate)

def home():
    # Get the conversion amount from the website.
    conv_amt = input("conv_amt")
    print("Amount to change is:", conv_amt)
    base = input("base_curr")
    print("Changing from:", base)
    dest = input("dest_curr")
    print("Changing to:", dest)

    # Create Time client
    time = Time()

    if base:
        try:
            pony = time.get_today(base)
            today_amt = conv_amt * pony.currency_data["rates"][dest]
        except APIError:
            pony = "(error)"
        yesterpony = time.get_yesterday(base)
        yesterday_amt = conv_amt * yesterpony.currency_data["rates"][dest]
        lastweekpony = time.get_last_week(base)
        lastweek_amt = conv_amt * lastweekpony.currency_data["rates"][dest]
        return today_amt, yesterday_amt, lastweek_amt

if __name__ == "__main__":
    home()
If I try running this, I get an error:
    today_amt = conv_amt * pony.currency_data["rates"][dest]
TypeError: 'NoneType' object is not subscriptable
Does anyone have any idea where the issue is? Is it not able to call the API, or is something wrong with the way I call my class object?
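No answer was recorded here, but a quick way to check what is actually being sent is to let requests build the URL and then inspect it, along with the raw response, before decoding. A small sketch; the openrates endpoint and the 'rates' key come from the question, everything else is generic requests usage:

import requests

def debug_call(url, **params):
    # Show exactly which URL requests built and what came back.
    response = requests.get(url, params=params)
    print("Requested URL:", response.url)        # e.g. http://api.openrates.io/2018-11-22?base=EUR
    print("Status code: ", response.status_code)
    data = response.json()
    if "rates" not in data:
        print("Unexpected payload:", data)       # API errors show up here instead of rates
    return data

# debug_call("http://api.openrates.io/2018-11-22", base="EUR")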
