Send xlsx file using SES in AWS lambda function - python-3.x

SITUATION
I have created a lambda function the output of which is a an Excel file that gets saved to an S3 bucket. This part of the function works as expected.
As part of the functions operation I would also like to be able to email the generated Excel file to selected recipients.
CODE:
#IMPORT MODULES
import boto3
import pandas as pd
import io
from io import BytesIO
from io import StringIO
from datetime import date
import email
import email.header
import email.policy
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
def lambda_handler(event, context):
# GENERATE CURRENT DATE TO APPEND TO FILE
today = date.today()
date_val = today.strftime("%B %d, %Y")
# CREATE DATAFRAME
df = pd.DataFrame({'Data': [10, 22, 31, 43, 57, 99, 65, 74, 88]})
# EVALUATE VARIABLES AS ODD OR EVEN INTEGERS
even = df.loc[df['Data']%2 == 0]
odd = df.loc[df['Data']%2 != 0]
# SPECIFY BUKCET NAME AND OUTPUT FILE PATH
bucket = 'my-bucket'
filepath = 'output/My_Excel_File_{}.xlsx'.format(date_val)
# EXPORT MULTI-SHEET EXCEL FILE AND SEND TO S3 BUCKET
with io.BytesIO() as output:
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
even.to_excel(writer, sheet_name = 'Even')
odd.to_excel(writer, sheet_name = 'Odd')
data = output.getvalue()
s3 = boto3.resource('s3')
s3.Bucket(bucket).put_object(Key=filepath, Body=data)
WHAT I HAVE TRIED
I have tried to achieve my aim by appending the following code to my function by referring to various documentation, however this does not achieve the desired result.
# EXPORT MULTI-SHEET EXCEL FILE AND SEND TO S3 BUCKET
message = MIMEMultipart()
message['Subject'] = 'Email subject'
message['From'] = 'sender.email#domain.com'
message['To'] = 'recipient.email#domain.com')
# MESSAGE BODY
part = MIMEText('Thus is the email body string', 'html')
message.attach(part)
# ATTACHEMENT
if attachment_string: # if bytestring available
part = MIMEApplication(str.encode('attachment_string'))
else: # if file provided
part = MIMEApplication(s3.get_object(Bucket='my-bucket', Key=My_Excel_File_{}.xlsx'.format(date_val)).read())
part.add_header('Content-Disposition', 'attachment', filename='My_Excel_File_{}.xlsx'.format(date_val)')
message.attach(part)
response = client.send_raw_email(
Source=message['From'],
Destinations=['recipient.email#domain.com'],
RawMessage={
'Data': message.as_string()
}
)

There are AWS examples that dynamically create excel docs and email them. In this use cases, they are implemented in Java and the app is a web app. See this:
Creating the DynamoDB web application item tracker
Although this example uses the AWS SDK for Java V2, it will give you an idea and hopefully you can port to the programming language you are using.

Related

Header is repeating when merging multiple files using Python shell in AWS Glue

I am new to Python and AWS Glue.
I am trying to merge few excel files in a S3 source bucket and generate 1 output file (csv) in a target S3 bucket. I am able to read and generate the output file with merged data but the only problem is that the header is repeating from each file.
Can someone help to debug to remove the repeating headers?
Below is my code:
import pandas as pd
import glob
import xlrd
import openpyxl
import boto3
import io
import json
import os
from io import StringIO
import numpy as np
s3 = boto3.resource('s3')
bucket = s3.Bucket('test bucket')
prefix_objs = bucket.objects.filter(Prefix='source/file')
prefix_df = []
for obj in prefix_objs:
key = obj.key
print(key)
temp = pd.read_excel(obj.get()['Body'], encoding='utf8')
prefix_df.append(temp)
bucket = 'test bucket'
csv_buffer = StringIO()
for current_df in prefix_df:
current_df.to_csv(csv_buffer, index = None)
print(current_df)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'merge.csv').put(Body=csv_buffer.getvalue())
Please help!
Regards,
Vijay
Change this line and add the parameter header.
temp = pd.read_excel(obj.get()['Body'], encoding='utf8')
to
temp = pd.read_excel(obj.get()['Body'], encoding='utf8', header=1)
or
temp = pd.read_excel(obj.get()['Body'], encoding='utf8', skiprows=1)
You need to test the header value, because sometimes the header starts not in the first row.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

How to parse eml file and extract meta-data informations

I have an eml file with some attachments. I want to read text content in eml file and I want to extract meta-data information like(sender, from, cc, bcc, subject). Also I want to download the attachments as well. With the help of the below code I am only able to extract information/ text content in the body of the email.
import email
from email import policy
from email.parser import BytesParser
import glob
file_list = glob.glob('*.eml') # returns list of files
with open(file_list[2], 'rb') as fp: # select a specific email file from the list
msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain')).get_content()
print(text)
There was module name emaildata which was available for Python 2 did the job.
Extracting MetaData Informations
import email
from emaildata.metadata import MetaData
message = email.message_from_file(open('message.eml'))
extractor = MetaData(message)
data = extractor.to_dict()
print data.keys()
Extracting Attachment Information
import email
from emaildata.attachment import Attachment
message = email.message_from_file(open('message.eml'))
for content, filename, mimetype, message in Attachment.extract(message):
print filename
with open(filename, 'w') as stream:
stream.write(content)
# If message is not None then it is an instance of email.message.Message
if message:
print "The file {0} is a message with attachments.".format(filename)
But this library is now deprecated and is of now use. Is there any other library that could extract the meta-data and attachment related information?
Meta-data information could be accessed using below code in Python 3.x
from email import policy
from email.parser import BytesParser
with open(eml_file, 'rb') as fp:
msg = BytesParser(policy=policy.default).parse(fp)
print('To:', msg['to'])
print('From:', msg['from'])
print('Subject:', msg['subject'])
Remaining header informations could be accessed using msg.keys()
For downloading attachments from an eml file you can use the below code:
import sys
import os
import os.path
from collections import defaultdict
from email.parser import Parser
eml_mail = 'your eml file'
output_dir = 'mention the directory where you want the files to be download'
def parse_message(filename):
with open(filename) as f:
return Parser().parse(f)
def find_attachments(message):
"""
Return a tuple of parsed content-disposition dict, message object
for each attachment found.
"""
found = []
for part in message.walk():
if 'content-disposition' not in part:
continue
cdisp = part['content-disposition'].split(';')
cdisp = [x.strip() for x in cdisp]
if cdisp[0].lower() != 'attachment':
continue
parsed = {}
for kv in cdisp[1:]:
key, val = kv.split('=')
if val.startswith('"'):
val = val.strip('"')
elif val.startswith("'"):
val = val.strip("'")
parsed[key] = val
found.append((parsed, part))
return found
def run(eml_filename, output_dir):
msg = parse_message(eml_filename)
attachments = find_attachments(msg)
print ("Found {0} attachments...".format(len(attachments)))
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
for cdisp, part in attachments:
cdisp_filename = os.path.normpath(cdisp['filename'])
# prevent malicious crap
if os.path.isabs(cdisp_filename):
cdisp_filename = os.path.basename(cdisp_filename)
towrite = os.path.join(output_dir, cdisp_filename)
print( "Writing " + towrite)
with open(towrite, 'wb') as fp:
data = part.get_payload(decode=True)
fp.write(data)
run(eml_mail, output_dir)
Have a look at: ParsEML it bulk extracts attachments from all eml files in a directory (originally from Stephan Hügel). And i used a modified version of MeIOC to easily extract all metadata in json format; if you want i can share that to.

how to download files from s3 bucket based on files modified date?

I want to download files from a particular s3 bucket based on files Last modified date.
I have researched on how to connect boto3 and there is plenty of code and documentation available for downloading the file without any conditions. I made a pseudo code
def download_file_s3(bucket_name,modified_date)
# connect to reseource s3
s3 = boto3.resource('s3',aws_access_key_id='demo', aws_secret_access_key='demo')
# connect to the desired bucket
my_bucket = s3.Bucket(bucket_name)
# Get files
for file in my_bucket.objects.all():
I want to complete this function, basically, passing a modified date the function returns the files in the s3 bucket for that particular modified date.
I have a Better solution or a function which could do this automatically. Just pass In the Bucket name and Download path name.
from boto3.session import Session
from datetime import date, timedelta
import boto3
import re
def Download_pdf_specifc_date_subfolder(bucket_name,download_path)
ACCESS_KEY = 'XYZ'
SECRET_KEY = 'ABC'
Bucket_name=bucket_name
# code to create a session
session = Session(aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
bucket = s3.Bucket(Bucket_name)
# code to get the yesterdays date
yesterday = date.today() - timedelta(days=1)
x=yesterday.strftime('20%y-%m-%d')
print(x)
#code to add the files to a list which needs to be downloaded
files_to_downloaded = []
#code to take all the files from s3 under a specific bucket
for fileObject in bucket.objects.all():
file_name = str(fileObject.key)
last_modified=str(fileObject.last_modified)
last_modified=last_modified.split()
if last_modified[0]==x:
# Enter the specific bucketname in the regex in place of Airports to filter only the particluar subfolder
if re.findall(r"Airports/[a-zA-Z]+", file_name):
files_to_downloaded.append(file_name)
# code to Download into a specific Folder
for fileObject in bucket.objects.all():
file_name = str(fileObject.key)
if file_name in files_to_downloaded:
print(file_name)
d_path=download_path + file_name
print(d_path)
bucket.download_file(file_name,d_path)
Download_pdf_specifc_date_subfolder(bucket_name,download_path)
Ultimately the function will give the results in the specific Folder with the files to be downloaded.
Here is my test code and it will print the last_modified datetime of objects which have the datetime after what I set.
import boto3
from datetime import datetime
from datetime import timezone
s3 = boto3.resource('s3')
response = s3.Bucket('<bucket name>').objects.all()
for item in response:
obj = s3.Object(item.bucket_name, item.key)
if obj.last_modified > datetime(2019, 8, 1, 0, 0, 0, tzinfo=timezone.utc):
print(obj.last_modified)
If you have a specific date, then
import boto3
from datetime import datetime, timezone
s3 = boto3.resource('s3')
response = s3.Bucket('<bucket name>').objects.all()
date = '20190827' # input('Insert Date as a form YYYYmmdd')
for item in response:
obj = s3.Object(item.bucket_name, item.key)
if obj.last_modified.strftime('%Y%m%d') == date:
print(obj.last_modified)
will give the results as follows.
2019-08-27 07:13:04+00:00
2019-08-27 07:13:36+00:00
2019-08-27 07:13:39+00:00
If edited this answer to download all files after a certain timestamp and then write the current time to a file for use in the next iteration. You can easily adapt this to only download files of a specific date, month, year, yesterday, etc.
import os
import boto3
import datetime
import pandas as pd
### Load AWS Key, Secret and Region
# ....
###
# Open file to read last download time and update file with current time
latesttime_file = "latest request.txt"
with open(latesttime_file, 'r') as f:
latest_download = pd.to_datetime(f.read(), utc=True)
with open(latesttime_file, 'w') as f:
f.write(str(datetime.datetime.utcnow()))
# Initialize S3-client
s3_client = boto3.client('s3',
region_name=AWS_REGION,
aws_access_key_id=AWS_KEY_ID,
aws_secret_access_key=AWS_SECRET)
def download_dir(prefix, local, bucket, timestamp, client=s3_client):
"""
params:
- prefix: pattern to match in s3
- local: local path to folder in which to place files
- bucket: s3 bucket with target contents
- client: initialized s3 client object
"""
keys = []
dirs = []
next_token = ''
base_kwargs = {
'Bucket':bucket,
'Prefix':prefix,
}
while next_token is not None:
kwargs = base_kwargs.copy()
if next_token != '':
kwargs.update({'ContinuationToken': next_token})
results = client.list_objects_v2(**kwargs)
contents = results.get('Contents')
for i in contents:
k = i.get('Key')
t = i.get('LastModified')
if k[-1] != '/':
if t > timestamp:
keys.append(k)
else:
dirs.append(k)
next_token = results.get('NextContinuationToken')
for d in dirs:
dest_pathname = os.path.join(local, d)
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
for k in keys:
dest_pathname = os.path.join(local, k)
if not os.path.exists(os.path.dirname(dest_pathname)):
os.makedirs(os.path.dirname(dest_pathname))
client.download_file(bucket, k, dest_pathname)
download_dir(<prefix or ''>, <local folder to download to>, <bucketname>, latest_download)

Write Pandas Dataframe to_csv StringIO instead of file

Objective of this code is to read an existing CSV file from a specified S3 bucket into a Dataframe, filter the dataframe for desired columns, and then write the filtered Dataframe to a CSV object using StringIO that I can upload to a different S3 bucket.
Everything works right now except the code block for the function "prepare_file_for_upload". Below is the full code block:
from io import StringIO
import io #unsued at the moment
import logging
import pandas as pd
import boto3
from botocore.exceptions import ClientError
FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
#S3 parameters
source_bucket = 'REPLACE'
source_folder = 'REPLACE/'
dest_bucket = 'REPLACE'
dest_folder = 'REPLACE'
output_name = 'REPLACE'
def get_file_name():
try:
s3 = boto3.client("s3")
logging.info(f'Determining filename from: {source_bucket}/{source_folder}')
bucket_path = s3.list_objects(Bucket=source_bucket, Prefix=source_folder)
file_name =[key['Key'] for key in bucket_path['Contents']][1]
logging.info(file_name)
return file_name
except ClientError as e:
logging.info(f'Unable to determine file name from bucket {source_bucket}/{source_folder}')
logging.info(e)
def get_file_data(file_name):
try:
s3 = boto3.client("s3")
logging.info(f'file name from get data: {file_name}')
obj = s3.get_object(Bucket=source_bucket, Key=file_name)
body = obj['Body']
body_string = body.read().decode('utf-8')
file_data = pd.read_csv(StringIO(body_string))
#logging.info(file_data)
return file_data
except ClientError as e:
logging.info(f'Unable to read {file_name} into datafame')
logging.info(e)
def filter_file_data(file_data):
try:
all_columns = list(file_data.columns)
columns_used = ('col_1', 'col_2', 'col_3')
desired_columns = [x for x in all_columns if x in columns_used]
filtered_data = file_data[desired_columns]
logging.info(type(filtered_data)) #for testing
return filtered_data
except Exception as e:
logging.info('Unable to filter file')
logging.info(e)
The block below is where I am attempting to write the existing DF that was passed to the function using "to_csv" method with StringIO instead of creating a local file. to_csv will write to a local file but does not work with buffer (yes, I tried putting the buffer cursor to start position after and still nothing)
def prepare_file_for_upload(filtered_data): #this is the function block where I am stuck
try:
buffer = StringIO()
output_name = 'FILE_NAME.csv'
#code below is writing to file but can not get to write to buffer
output_file = filtered_data.to_csv(buffer, sep=',')
df = pd.DataFrame(buffer) #for testing
logging.info(df) #for testing
return output_file
except Exception as e:
logging.info(f'Unable to prepare {output_name} for upload')
logging.info(e)
def upload_file(adjusted_file):
try:
#dest_key = f'{dest_folder}/{output_name}'
dest_key = f'{output_name}'
s3 = boto3.resource('s3')
s3.meta.client.upload_file(adjusted_file, dest_bucket, dest_key)
except ClientError as e:
logging.info(f'Unable to upload {output_name} to {dest_key}')
logging.info(e)
def execute_program():
file_name = get_file_name()
file_data = get_file_data(file_name)
filtered_data = filter_file_data(file_data)
adjusted_file = prepare_file_for_upload(filtered_data)
upload_file = upload_file(adjusted_file)
if __name__ == '__main__':
execute_program()
Following solution worked for me:
csv_buffer = StringIO()
output_file = filtered_data.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object(dest_bucket, output_name).put(Body=csv_buffer.getvalue())
When working with a BytesIO object, pay careful attention to the order of operations. In your code, you instantiate the BytesIO object and then fill it via a call to to_csv(). So far so good. But one thing to manage when working with a BytesIO object that is different from a file workflow is the stream position.
After writing data to the stream, the stream position is at the end of the stream. If you try to write from that position, you will likely write nothing! The operation will complete leaving you scratching your head why no results are written to S3. Add a call to seek() with the argument 0 to your function. Here is a demo program that demonstrates:
from io import BytesIO
import boto3
import pandas
from pandas import util
df = util.testing.makeMixedDataFrame()
s3_resource = boto3.resource("s3")
buffer = BytesIO()
df.to_csv(buffer, sep=",", index=False, mode="wb", encoding="UTF-8")
# The following call to `tell()` returns the stream position. 0 is the beginning of the file.
df.tell()
>> 134
# Reposition stream to the beginning by calling `seek(0)` before uploading
df.seek(0)
s3_r.Object("test-bucket", "test_df_from_resource.csv").put(Body=buffer.getvalue())
You should get a response similar to the following (with actual values)
>> {'ResponseMetadata': {'RequestId': 'request-id-value',
'HostId': '###########',
'HTTPStatusCode': 200,
'HTTPHeaders': {'x-amz-id-2': '############',
'x-amz-request-id': '00000',
'date': 'Tue, 31 Aug 2021 00:00:00 GMT',
'x-amz-server-side-encryption': 'value',
'etag': '"xxxx"',
'server': 'AmazonS3',
'content-length': '0'},
'RetryAttempts': 0},
'ETag': '"xxxx"',
'ServerSideEncryption': 'value'}
Changing the code to move the stream position should solve the issues you were facing. It is also worth mentioning, Pandas had a bug that caused unexpected behavior when writing to a bytes object. It was fixed and the sample I provided assumes you are running a version of Python greater than 3.8 and a version of Pandas greater than 1.3.2. Further information on IO can be found in the python documentation.

How to import xlsx into dynamodb using python and boto3

Trying to use LinuxAcademy posting of how to import Excel data into DynamoDB but the code posting is two years old and does not work. Any tips or suggestions would be very helpful.
Sorry I'm new to stackoverflow.
I was trying to take an excel spreadsheet and convert it to json then upload to DynamoDB like the posting on LinuxAcademy. The instructions are old and they use three scripts to upload one file.
Here is the code I used to create a lambda AWS python function.
The only problem is that it reads in the excel file and converts it to json and the file is too big to ingest into DynamoDB before the 5 minute timeout. I will probably convert it to step functions but this worked for me.
import boto3
import os
import sys
import uuid
import pandas as pd
s3_client = boto3.client('s3')
bucket = "serverless-record-storage-lambda"
def upload_to_dynamodb(report):
df=pd.read_excel(report)
df.columns=["APPLICATION", "FORM_NUMBER", "FILE_DATE", "STATUS_DATE", "STATUS", "STATUS_CODE", "EXPIRATION_DATE", "ESTIMATED COST", "REVISED_COST", "EXISTING_USE", "EXISTING_UNITS", "PROPOSED_USE","PROPOSED_UNITS","PLANSETS", "15_DAY_HOLD?" , "EXISTING_STORIES", "PROPOSED_STORIES", "ASSESSOR_STORIES", "VOLUNTARY", "PAGES", "BLOCK", "LOT", "STREET_NUMBER", "STREET_NUMBER_SFX", "AVS_STREET_NAME", "AVS_STREET_SFX", "UNIT", "UNIT_SFX", "FIRST_NAME", "LAST_NAME", "CONTRACTORPHONE",
"COMPANY_NAME", "STREET_NUMBER", "STREET", "STREET_SUFFIX", "CITY", "STATE", "ZIP_CODE", "CONTACT_NAME", "CONTACT_PHONE", "DESCRIPTION" ]
# Clean-up the data, change column types to strings to be on safer side :)
df=df.replace({'-': '0'}, regex=True)
df=df.fillna(0)
for i in df.columns:
df[i] = df[i].astype(str)
# Convert dataframe to list of dictionaries (JSON) that can be consumed by any no-sql database
myl=df.T.to_dict().values()
# Connect to DynamoDB using boto
resource = boto3.resource('dynamodb', region_name='us-west-2')
# Connect to the DynamoDB table
table = resource.Table('permitdata')
# Load the JSON object created in the step 3 using put_item method
for permit in myl:
table.put_item(Item=permit)
def handler(event, context):
for record in event['Records']:
print(record)
bucket = record['s3']['bucket']['name']
print(bucket)
key = record['s3']['object']['key']
print(key)
download_path = '/tmp/{}{}'.format(uuid.uuid4(), key)
upload_path = '/tmp/resized-{}'.format(key)
s3_client.download_file(bucket, key, download_path)
upload_to_dynamodb(download_path)
def main():
handler(event, None)
if __name__ == "__main__":
main()

Resources