Boto3 script to query S3 bucket file dates is slow - python-3.x

I've written a simple script to find the latest file in S3 buckets. It works correctly but is extremely slow. Obviously it has a lot of files to check, but if I use something like S3 Browser the file information appears almost immediately. Have I done something wrong, or is this just a limitation of boto3?
#! /usr/bin/python3
import argparse
import boto3
from datetime import datetime
from datetime import timezone
def build_argparser():
    """Create the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser that accepts an optional
        ``-p``/``--profile`` argument.
    """
    cli = argparse.ArgumentParser(description='List S3 buckets by file date.')
    cli.add_argument('-p', '--profile', help='Profile to use')
    return cli
if __name__ == "__main__":
    parser = build_argparser()
    args = parser.parse_args()
    # Use boto3's default session unless a named profile was requested.
    if args.profile is None:
        s3 = boto3.resource('s3')
    else:
        session = boto3.session.Session(profile_name=args.profile)
        s3 = session.resource('s3')
    for bucket in s3.buckets.all():
        print(bucket.name)
        # NOTE(review): the run time is dominated by listing every object in
        # the bucket; presumably S3 offers no way to sort or filter a listing
        # by modification time, so a full scan is needed - confirm.
        # max() keeps the first of several equally recent objects, and
        # default=None covers buckets that contain no objects at all.
        newest = max(
            bucket.objects.all(),
            key=lambda summary: summary.last_modified,
            default=None,
        )
        if newest is None:
            print('\t(no objects)')
        else:
            print('\t' + str(newest.key) + ': ' + str(newest.last_modified))

Related

How to get multiple inputs (JSON files for me) in AWS Lambda from the same user's S3 bucket?

I have attached my hard-coded Python program, which merges two JSON files from S3 storage by naming each of them manually. Can someone please tell me how to read multiple input files (JSON files) from the S3 bucket automatically? I know we can do the same in Python using *.json in the directory of the program, but I don't understand how to do the same in AWS Lambda.
Python Code:
import glob
import json

# Name of the merged output file. It also matches "*.json", so it is skipped
# below; otherwise a second run would fold the previous merge into the result.
OUTPUT_NAME = "merge.json"

result = []
# sorted() makes the merge order deterministic across runs.
for path in sorted(glob.glob("*.json")):
    if path == OUTPUT_NAME:
        continue
    with open(path, "r") as infile:
        # Each input file is expected to hold a JSON array, because its
        # content is appended to the result list with +=.
        result += json.load(infile)
with open(OUTPUT_NAME, "w") as outfile:
    json.dump(result, outfile)
For doing the same in lambda I am able to do it for like 2 files, can someone please suggest how to do the same (like taking all JSON files from S3 automatically) in lambda. Thanks in advance.
import boto3
import json
s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'
def lambda_handler(event, context):
    """Fetch two hard-coded JSON objects from S3_BUCKET and print their merge.

    Each raw body is printed as it is read; afterwards both bodies are parsed
    and concatenated into one list, which is printed as well.

    Args:
        event: Lambda event payload (unused).
        context: Lambda context object (unused).
    """
    bodies = []
    for object_key in ("sample1.json", "sample2.json"):  # replace object keys
        raw = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key)["Body"].read()
        print(raw)
        bodies.append(raw)
    # Parse only after both objects were fetched and printed.
    result = []
    for raw in bodies:
        result += json.loads(raw)
    print(result)
Have followed the syntax from the documentation but I still get the timeout error.
import boto3
# Create a client
client = boto3.client('s3', region_name='us-east-1')
# Create a reusable Paginator
paginator = client.get_paginator('list_objects')
# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket='bucket-for-json-files')
for page in page_iterator:
    # A page has no 'Contents' entry when there are no objects to report, so
    # fall back to an empty list instead of raising KeyError.
    print(page.get('Contents', []))
Getting a timeout error:
import boto3
s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'
def iterate_bucket_items(S3_BUCKET):
    """Yield every object entry of a bucket, one listing page at a time.

    Args:
        S3_BUCKET: name of the bucket to list.

    Yields:
        The items of each page's ``'Contents'`` list, as produced by the
        ``list_objects_v2`` paginator.
    """
    pages = boto3.client('s3').get_paginator('list_objects_v2').paginate(
        Bucket=S3_BUCKET
    )
    for page in pages:
        # Pages without keys are skipped.
        if page['KeyCount'] <= 0:
            continue
        yield from page['Contents']
# The parameter of iterate_bucket_items is named S3_BUCKET, so calling it with
# the keyword ``bucket=`` raised TypeError; the quoted 'S3_BUCKET' was also a
# literal string instead of the bucket-name constant.
for i in iterate_bucket_items(S3_BUCKET):
    print(i)
I have solved the issue with the help of @JeremyThompson, and will attach my final code here:
import json
import boto3
import glob
def lambda_handler(event, context):
    """Print the key of every object in the ``bucket-for-json-files`` bucket.

    Args:
        event: Lambda event payload (unused).
        context: Lambda context object (unused).
    """
    bucket = 'bucket-for-json-files'
    # Create a client
    client = boto3.client('s3', region_name='us-east-1')
    # Create a reusable Paginator
    paginator = client.get_paginator('list_objects')
    for page in paginator.paginate(Bucket=bucket):
        # A page has no 'Contents' entry when there is nothing to list.
        for entry in page.get('Contents', []):
            print(entry['Key'])
The above code prints the key from each json file available in the user's bucket.

How to find where an S3 multipart upload is failing in Python?

I am implementing a cron job that will upload a large daily backup file to an S3 Bucket. It works most of the time, but every once in a while, I will check the bucket, and the file size is significantly smaller than the actual size.
It should be roughly 50GB, but the last time it happened, it showed 34GB. My main problem is that I am unsure of what error to try/catch.
I am still learning Python as I go, so bear with me.
from progress import ProgressPercentage # class file progress.py
from slack import * # function file for Slack notifications
import random
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
bucket = "my-s3-backup"
s3 = boto3.resource('s3')
# Grabbing the last file, and removing the full path from the string
pattern = "/path/to/backup/file/xb_*"
# Regular files only, oldest first by modification time.
files = sorted(filter(os.path.isfile, glob.glob(pattern)), key=os.path.getmtime)
if not files:
    # Fail with a readable message instead of an IndexError on files[-1].
    raise SystemExit("No backup file found matching " + pattern)
file_to_upload = files[-1]
file_name = os.path.basename(file_to_upload)
key_path = 'physical_db_backups/' + file_name
# Multipart upload function
def multi_part_upload():
    """Upload ``file_to_upload`` to ``bucket`` as ``key_path`` and notify Slack.

    A success notification is sent once the transfer returns; a failure
    notification carrying the error text is sent if the transfer raises.
    """
    # NOTE(review): the original used 1024 * 25, i.e. 25 KiB. TransferConfig
    # sizes are given in bytes, so 25 MiB was presumably intended - confirm.
    config = TransferConfig(multipart_threshold=1024 * 1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload))
    except (botocore.exceptions.ClientError,
            boto3.exceptions.S3UploadFailedError) as error:
        # upload_file reports a failed transfer as S3UploadFailedError, so
        # ClientError alone does not cover it. str() is required because an
        # exception object cannot be concatenated to a string.
        # Custom Slack notification to inform of failure
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))
    else:
        # Custom Slack notification to inform completion
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)


if __name__ == '__main__':
    multi_part_upload()
If the script is not "failing," but it's not uploading the complete file size, what exception am I trying to catch here? Should I log output somewhere?
I'm looking through the Botocore Exceptions documentation. I'm just unsure of what to try/catch with this.
For reference, here is the file size difference:
aws s3 ls --summarize --human-readable --recursive s3://my-s3-backup/physical_db_backups/
2022-05-07 14:31:28 50.7 GiB physical_db_backups/xb_202205070101.xb.zst
2022-05-08 12:48:07 50.8 GiB physical_db_backups/xb_202205080101.xb.zst
2022-05-09 01:30:04 34.2 GiB physical_db_backups/xb_202205090101.xb.zst <--- WRONG
Alright, since I was an idiot and didn't realize the file had not completed yet, I made a couple of changes.
I edited the cron to start later.
I have created logic to determine if the backup script is running.
I may incorporate additional checks to make sure the file exists, but for now this is a working POC that has been tested.
from progress import ProgressPercentage # class file progress.py
from slack import * # function file for Slack notifications
import random
from time import sleep
import psutil
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
import logging
bucket = "fsn-s3-backup"
s3 = boto3.resource('s3')
# Logging is configured first so that a missing backup file is recorded too.
logging.basicConfig(filename='/var/log/s3-backup.log', format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', filemode='a')
logger = logging.getLogger()
logger.setLevel(logging.INFO)
pattern = "/path/to/backup/file/xb_*"
# Regular files only, oldest first by modification time.
files = sorted(filter(os.path.isfile, glob.glob(pattern)), key=os.path.getmtime)
if not files:
    # Fail with a readable message instead of an IndexError on files[-1].
    logger.error("No backup file found matching %s", pattern)
    raise SystemExit("No backup file found matching " + pattern)
file_to_upload = files[-1]
file_name = os.path.basename(file_to_upload)
key_path = 'physical_db_backups/' + file_name
def multi_part_upload():
    """Upload ``file_to_upload`` to ``bucket`` as ``key_path``.

    The outcome is written to the log and reported through the Slack helper
    functions, both on success and on failure.
    """
    # NOTE(review): the original used 1024 * 25, i.e. 25 KiB. TransferConfig
    # sizes are given in bytes, so 25 MiB was presumably intended - confirm.
    config = TransferConfig(multipart_threshold=1024 * 1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload),
                                   ExtraArgs={'ContentType': 'application/zstd'})
    except (botocore.exceptions.ClientError,
            boto3.exceptions.S3UploadFailedError) as error:
        # upload_file reports a failed transfer as S3UploadFailedError, so
        # ClientError alone does not cover it. The error is formatted with
        # %s / str() because an exception cannot be concatenated to a string.
        logger.error("Physical Backup to S3 Failed: %s", error)
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))
    else:
        logger.info("Physical Backup to S3 Complete")
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
def checkIfProcessRunning(processName):
    """Tell whether a running process has ``processName`` on its command line.

    Args:
        processName: string that must equal one element of a process's
            command-line argument list.

    Returns:
        True as soon as one process matches, otherwise False.
    """
    for proc in psutil.process_iter():
        try:
            cmdline = proc.cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            # The process ended between listing and inspection, or cannot be
            # inspected by this user; skip it rather than abort the scan.
            continue
        if processName in cmdline:
            return True
    return False
if __name__ == '__main__':
    # Poll once a minute until the backup shell script is gone, then upload.
    while True:
        logger.info("Checking if backup shell script is running")
        if not checkIfProcessRunning('/path/to/physical_backup.sh'):
            break
        logger.info("Backup shell script still running. Sleeping for 60s")
        sleep(60)
    logger.info("Beginning multipart upload")
    multi_part_upload()

Python AWS S3 Download S3 Files save in ZIP

I have a bunch of files stored on AWS S3. I want to download those files into a single zip file.
Below is my code.
import boto3
import zipfile
from io import StringIO, BytesIO
s3 = boto3.client('s3')
s = BytesIO()
zf = zipfile.ZipFile(s, 'w')
file_name = '%s-files-%s.zip' % (student.get_full_name(), str(datetime.datetime.now()))
files_key_list = ['file1.png', 'file3.png']
# For every key, download the object and add it to the in-memory archive.
for f in files_key_list:
# NOTE(review): `data` is presumably None here - download_file appears to save
# the object to the local path given as its third argument rather than return
# it - which matches the "can't specify None for path argument" error below.
data = s3.download_file(settings.AWS_STORAGE_BUCKET_NAME, f, f)
zf.write(data)
zf.close()
resp = HttpResponse(s.getvalue(), content_type="application/x-zip-compressed")
resp['Content-Disposition'] = 'attachment; filename=%s' % file_name
return resp
Error
stat: can't specify None for path argument
try this
use boto3 get_object
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
import boto3
import zipfile
from io import StringIO, BytesIO
s3 = boto3.client('s3')
s = BytesIO()
zf = zipfile.ZipFile(s, 'w')
file_name = '%s-files-%s.zip' % (student.get_full_name(), str(datetime.datetime.now()))
files_key_list = ['file1.png', 'file3.png']
for f in files_key_list:
    # Each entry of files_key_list is the S3 key itself; it is also used as
    # the member name inside the archive. The previous code referred to an
    # undefined name ``fpath`` instead of the loop variable.
    data = s3.get_object(Bucket=settings.AWS_STORAGE_BUCKET_NAME, Key=f)
    zf.writestr(f, data.get('Body').read())
zf.close()
resp = HttpResponse(s.getvalue(), content_type="application/x-zip-compressed")
resp['Content-Disposition'] = 'attachment; filename=%s' % file_name
return resp
I had a similar requirement for which the code below satisfies:
ref: zipfile documentation
import sys
import boto3
import zipfile
from io import StringIO, BytesIO
import botocore
import datetime
if len(sys.argv) > 1:
    bucket_name = sys.argv[1]
else:
    print("Please specify a bucket name to list.")
    # Non-zero status so callers can detect the usage error.
    sys.exit(1)
s3 = boto3.client('s3')
s3res = boto3.resource('s3')
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
file_name = '%s-files-%s.zip' % (bucket_name, timestamp)
print(f"Saving into zip {file_name}")
bucket = s3res.Bucket(bucket_name)
# The context manager closes the archive on every exit path, including
# errors other than the ClientError handled below.
with zipfile.ZipFile(file_name, 'w') as zf:
    try:
        for s3_object in bucket.objects.all():
            print("adding ", s3_object)
            data = s3.get_object(Bucket=bucket_name, Key=s3_object.key)
            # The whole object body is read into memory before being stored.
            zf.writestr(s3_object.key, data.get('Body').read())
    except botocore.exceptions.ClientError as resperror:
        print("Error - does bucket exist?", str(resperror))
        print("Please remove possible empty zip: ", file_name)

Uploading file to an s3 bucket path longer than 63 characters

I am writing a lambda function to upload a file from one s3 bucket to another, when the former is updated. I am running into an invalid parameter exception when uploading a file to the s3 path, which is longer than 63 characters. Is there a way to get around this?
import boto3
import datetime
import sys
import os
from os import getenv
import json
import csv
# Bucket the report is read from; the REPORT_BUCKET environment variable
# overrides the built-in default.
REPORT_BUCKET = os.environ.get('REPORT_BUCKET', 'origin-bucket-name')
# NOTE: despite their names, these three values describe *yesterday*
# (the current local time minus one day).
now = datetime.datetime.now() + datetime.timedelta(days=-1)
today = format(now, "%m/%d/%y")
today_iso = now.date().isoformat()
def read_attachment(bucket, key):
    """Read an S3 object and return the full content of its body.

    Args:
        bucket: name of the bucket holding the object.
        key: key of the object to read.

    Returns:
        Whatever ``read()`` on the object's ``'Body'`` stream returns.
    """
    print(f'Bucket: {bucket}, Key: {key}')
    body = boto3.resource('s3').Object(bucket, key).get()['Body']
    return body.read()
def upload_file(data, new_file, bucket_name):
    """Write ``data`` to a temporary CSV file and upload it to S3.

    An existing object with the same key is deleted first, and the uploaded
    object is given the ``authenticated-read`` ACL.

    Args:
        data: iterable of rows, passed to ``csv.writer(...).writerows``.
        new_file: key of the destination object.
        bucket_name: name of the destination bucket.
    """
    temp = '/tmp/tmp-{}.csv'.format(today_iso)
    try:
        with open(temp, 'w', newline='') as outfile:
            csv.writer(outfile).writerows(data)
        bucket = boto3.resource('s3').Bucket(bucket_name)
        bucket.delete_objects(Delete={'Objects': [{'Key': new_file}]})
        bucket.upload_file(temp, new_file)
        bucket.Object(new_file).Acl().put(ACL='authenticated-read')
    finally:
        # Remove the temporary file even when writing or uploading failed.
        if os.path.exists(temp):
            os.remove(temp)
    print(bucket)
    print('Uploaded: %s/%s' % (bucket_name, new_file))
def lambda_handler(event, context):
    """Copy yesterday's CSV report from REPORT_BUCKET to the staging location.

    Args:
        event: Lambda event payload (unused).
        context: Lambda context object (unused).

    Returns:
        True once the upload call has returned.
    """
    data = read_attachment(REPORT_BUCKET, f'{today_iso}.csv')
    # Parse line by line with the csv module. Splitting the raw bytes on any
    # whitespace broke rows whose fields contain spaces, and splitting on ','
    # ignored quoting. Blank lines are dropped, as before.
    rows = [row for row in csv.reader(data.decode('utf-8').splitlines()) if row]
    # The third argument is the bucket name only; the folder path belongs to
    # the object key. Passing the path as part of the bucket name is what
    # exceeded the bucket-name length limit.
    upload_file(
        rows,
        f'sub-folder/org=inc/f=csv/v=1.0/staging/{today_iso}.csv',
        'accountname-useast1-dl-common-0022-in',
    )
    return True


if __name__ == '__main__':
    lambda_handler({}, None)
In s3 , the bucketname max size is 63 characters long. (https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html)
In your code you are calling:
upload_file(arr2, f'{today_iso}.csv', 'accountname-useast1-l-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/')
which means that you are passing
accountname-useast1-l-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/'
as the bucketname. This parameter is longer than 63 characters that's why it throws an error.
To resolve this, pass a shorter name as the bucket name and put the rest of the path into the key of your actual object.
For example:
bucketname: accountname-useast1-l-common-0022-in
object name: sub-folder/org=inc/f=csv/v=1.0/staging/
so your line of code that needs to be changed is:
upload_file(arr2, f'sub-folder/org=inc/f=csv/v=1.0/staging/{today_iso}.csv', 'accountname-useast1-dl-common-0022-in')

how to download files from s3 bucket based on files modified date?

I want to download files from a particular s3 bucket based on files Last modified date.
I have researched on how to connect boto3 and there is plenty of code and documentation available for downloading the file without any conditions. I made a pseudo code
def download_file_s3(bucket_name, modified_date):
    """Return the keys in a bucket that were last modified on a given date.

    Args:
        bucket_name: name of the bucket to scan.
        modified_date: date to match - assumed to be a ``datetime.date``;
            TODO confirm against the caller.

    Returns:
        list: keys whose ``last_modified`` date equals ``modified_date``.
    """
    # connect to resource s3
    # NOTE(review): 'demo' looks like a placeholder; avoid hard-coding real
    # credentials in source.
    s3 = boto3.resource('s3', aws_access_key_id='demo', aws_secret_access_key='demo')
    # connect to the desired bucket
    my_bucket = s3.Bucket(bucket_name)
    # Get files
    return [
        file.key
        for file in my_bucket.objects.all()
        if file.last_modified.date() == modified_date
    ]
I want to complete this function, basically, passing a modified date the function returns the files in the s3 bucket for that particular modified date.
I have a better solution: a function that does this automatically. Just pass in the bucket name and the download path.
from boto3.session import Session
from datetime import date, timedelta
import boto3
import re
def Download_pdf_specifc_date_subfolder(bucket_name, download_path):
    """Download yesterday's objects from the ``Airports/`` subfolder.

    Every object whose last-modified date is yesterday and whose key matches
    ``Airports/[a-zA-Z]+`` is saved to ``download_path + key``.

    Args:
        bucket_name: name of the bucket to scan.
        download_path: local path prefix; the object key is appended to it
            as-is, so it should end with a path separator.
    """
    import os  # local import: this snippet's import list does not include os

    # NOTE(review): placeholder credentials; avoid hard-coding real ones.
    ACCESS_KEY = 'XYZ'
    SECRET_KEY = 'ABC'
    # code to create a session
    session = Session(aws_access_key_id=ACCESS_KEY,
                      aws_secret_access_key=SECRET_KEY)
    bucket = session.resource('s3').Bucket(bucket_name)
    # code to get the yesterdays date
    yesterday = date.today() - timedelta(days=1)
    print(yesterday.strftime('%Y-%m-%d'))
    # Enter the specific subfolder name in the regex in place of Airports to
    # filter only that particular subfolder
    subfolder = re.compile(r"Airports/[a-zA-Z]+")
    # One pass over the listing: filter and download as we go, rather than
    # listing the whole bucket a second time.
    for fileObject in bucket.objects.all():
        file_name = str(fileObject.key)
        if fileObject.last_modified.date() != yesterday:
            continue
        if not subfolder.search(file_name):
            continue
        print(file_name)
        d_path = download_path + file_name
        print(d_path)
        # Keys contain a folder part, so the local folder is created first.
        target_dir = os.path.dirname(d_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        bucket.download_file(file_name, d_path)


Download_pdf_specifc_date_subfolder(bucket_name,download_path)
Ultimately the function will give the results in the specific Folder with the files to be downloaded.
Here is my test code and it will print the last_modified datetime of objects which have the datetime after what I set.
import boto3
from datetime import datetime
from datetime import timezone
s3 = boto3.resource('s3')
response = s3.Bucket('<bucket name>').objects.all()
cutoff = datetime(2019, 8, 1, 0, 0, 0, tzinfo=timezone.utc)
for item in response:
    # The listing entries already carry last_modified. Building an s3.Object
    # for each key presumably costs one extra request per object - confirm.
    if item.last_modified > cutoff:
        print(item.last_modified)
If you have a specific date, then
import boto3
from datetime import datetime, timezone
s3 = boto3.resource('s3')
response = s3.Bucket('<bucket name>').objects.all()
date = '20190827' # input('Insert Date as a form YYYYmmdd')
for item in response:
    # The listing entries already carry last_modified. Building an s3.Object
    # for each key presumably costs one extra request per object - confirm.
    if item.last_modified.strftime('%Y%m%d') == date:
        print(item.last_modified)
will give the results as follows.
2019-08-27 07:13:04+00:00
2019-08-27 07:13:36+00:00
2019-08-27 07:13:39+00:00
I've edited this answer to download all files after a certain timestamp and then write the current time to a file for use in the next iteration. You can easily adapt this to only download files of a specific date, month, year, yesterday, etc.
import os
import boto3
import datetime
import pandas as pd
### Load AWS Key, Secret and Region
# ....
###
# Open file to read last download time and update file with current time
latesttime_file = "latest request.txt"
with open(latesttime_file, 'r') as f:
    latest_download = pd.to_datetime(f.read().strip(), utc=True)
# NOTE(review): the new timestamp is stored before anything is downloaded, so
# a failed run would skip those files next time - confirm this is acceptable.
with open(latesttime_file, 'w') as f:
    # Timezone-aware UTC time; utcnow() returns a naive value and is
    # deprecated in recent Python versions.
    f.write(str(datetime.datetime.now(datetime.timezone.utc)))
# Initialize S3-client
s3_client = boto3.client('s3',
                         region_name=AWS_REGION,
                         aws_access_key_id=AWS_KEY_ID,
                         aws_secret_access_key=AWS_SECRET)
def download_dir(prefix, local, bucket, timestamp, client=s3_client):
    """
    Download every object under a prefix that is newer than ``timestamp``.

    Keys ending in '/' are treated as folders: their local directory is
    created, but nothing is downloaded for them.

    params:
    - prefix: pattern to match in s3
    - local: local path to folder in which to place files
    - bucket: s3 bucket with target contents
    - timestamp: only objects whose LastModified is greater than this value
      are downloaded
    - client: initialized s3 client object
    """
    keys = []
    dirs = []
    # The paginator follows continuation tokens until the listing is complete.
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # 'Contents' is missing when nothing matches the prefix.
        for entry in page.get('Contents', []):
            key = entry.get('Key')
            if key.endswith('/'):
                dirs.append(key)
            elif entry.get('LastModified') > timestamp:
                keys.append(key)
    for d in dirs:
        parent = os.path.dirname(os.path.join(local, d))
        if parent:
            os.makedirs(parent, exist_ok=True)
    for k in keys:
        dest_pathname = os.path.join(local, k)
        parent = os.path.dirname(dest_pathname)
        # An empty parent means the current directory; nothing to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
        client.download_file(bucket, k, dest_pathname)
download_dir(<prefix or ''>, <local folder to download to>, <bucketname>, latest_download)

Resources