I am implementing a cron job that will upload a large daily backup file to an S3 Bucket. It works most of the time, but every once in a while, I will check the bucket, and the file size is significantly smaller than the actual size.
It should be roughly 50GB, but the last time it happened, it showed 34GB. My main problem is that I am unsure of what error to try/catch.
I am still learning Python as I go, so bear with me.
from progress import ProgressPercentage # class file progress.py
from slack import * # function file for Slack notifications
import random
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
bucket = "my-s3-backup"
s3 = boto3.resource('s3')
# Grabbing the last file, and removing the full path from the string
pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name
# Multipart upload function
def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload))
        # Custom Slack notification to inform completion
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        # Custom Slack notification to inform of failure
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))


if __name__ == '__main__':
    multi_part_upload()
If the script is not "failing," but it's not uploading the complete file size, what exception am I trying to catch here? Should I log output somewhere?
I'm looking through the Botocore Exceptions documentation. I'm just unsure of what to try/catch with this.
For reference, here is the file size difference:
aws s3 ls --summarize --human-readable --recursive s3://my-s3-backup/physical_db_backups/
2022-05-07 14:31:28 50.7 GiB physical_db_backups/xb_202205070101.xb.zst
2022-05-08 12:48:07 50.8 GiB physical_db_backups/xb_202205080101.xb.zst
2022-05-09 01:30:04 34.2 GiB physical_db_backups/xb_202205090101.xb.zst <--- WRONG
Alright, since I was an idiot and didn't realize the file had not completed yet, I made a couple of changes.
I edited the cron to start later.
I have created logic to determine if the backup script is running.
I may incorporate additional checks to make sure the file exists, but for now this is a working POC that has been tested (a sketch of one such check follows the script below).
from progress import ProgressPercentage # class file progress.py
from slack import * # function file for Slack notifications
import random
from time import sleep
import psutil
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
import logging
bucket = "fsn-s3-backup"
s3 = boto3.resource('s3')
pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name
logging.basicConfig(filename='/var/log/s3-backup.log', format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', filemode='a')
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload),
                                   ExtraArgs={'ContentType': 'application/zstd'})
        logger.info("Physical Backup to S3 Complete")
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        logger.error("Physical Backup to S3 Failed: " + str(error))
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))


def checkIfProcessRunning(processName):
    for proc in psutil.process_iter():
        cmdline = proc.cmdline()
        if processName in cmdline:
            return True
    return False


if __name__ == '__main__':
    backuprunning = True
    while backuprunning:
        logger.info("Checking if backup shell script is running")
        if checkIfProcessRunning('/path/to/physical_backup.sh'):
            logger.info("Backup shell script still running. Sleeping for 60s")
            sleep(60)
        else:
            backuprunning = False
            logger.info("Beginning multipart upload")
            multi_part_upload()
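As a hedged sketch of that additional check (the helper name is mine; bucket, key_path, file_to_upload, and logger are the ones defined above), one could compare the size S3 reports for the uploaded object against the local file size and only treat the run as successful when they match:
def verify_upload_size():
    # Head the uploaded object and compare its size with the local file
    remote_size = s3.meta.client.head_object(Bucket=bucket, Key=key_path)['ContentLength']
    local_size = os.path.getsize(file_to_upload)
    if remote_size != local_size:
        logger.error("Size mismatch after upload: local %d bytes, S3 %d bytes", local_size, remote_size)
        return False
    logger.info("Upload size check passed: %d bytes", remote_size)
    return True
Calling this right after multi_part_upload() in the main block and alerting on a False result would catch a partial upload even when no exception is raised.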
Related
I am trying to read a PDF in an AWS Lambda. The PDF is stored in an S3 bucket. I need to extract the text from the PDF and translate it into any required language. I am able to run my code in my notebook, but when I run it on Lambda I get this error message in my CloudWatch logs: task timed out after 3.01 seconds.
import fitz
import base64
from io import BytesIO
from PIL import Image
import boto3
def lambda_handler(event, context):
    s3 = boto3.client('s3')
    client_textract = boto3.client('textract')
    translate_client = boto3.client('translate')
    try:
        print("Inside handler")
        s3_bucket = "my_bucket"
        pdf_file_name = 'sample.pdf'
        pdf_file = s3.get_object(Bucket=s3_bucket, Key=pdf_file_name)
        file_content = pdf_file['Body'].read()
        print("Before reading ")
        with fitz.open(stream=file_content, filetype="pdf") as doc:
Try extending the timeout, which is set to 3 seconds by default.
If that does not help, try increasing the allocated memory.
Also, you may consider pushing
s3 = boto3.client('s3')
client_textract = boto3.client('textract')
translate_client = boto3.client('translate')
out of your handler and put it right after the imports. The clients are then created once per execution environment instead of on every call, so the function runs more efficiently on frequent invocations.
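For illustration, a minimal sketch of that layout, trimmed to the S3 read from the question (the bucket and key names are the ones used above):
import fitz
import boto3

# Created once per execution environment and reused across warm invocations
s3 = boto3.client('s3')
client_textract = boto3.client('textract')
translate_client = boto3.client('translate')

def lambda_handler(event, context):
    pdf_file = s3.get_object(Bucket="my_bucket", Key="sample.pdf")
    file_content = pdf_file['Body'].read()
    with fitz.open(stream=file_content, filetype="pdf") as doc:
        # ... Textract / Translate processing continues as before ...
        pass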
I have an AWS Lambda written in Python. The Lambda downloads a file from S3 to the folder /tmp/records and then reads that file. Now I need to write a unit test for it, which means mocking the S3 call, and I am wondering how to do that.
Here is my Lambda:
import os
import boto3
s3 = boto3.resource("s3")
def lambda_handler(event, context):
    try:
        download_records()
        with open("/tmp/records/" + "go_message.json.j2") as f:
            record = f.read()
    except ValueError:
        return "record could not be found"


def download_s3_folder(bucket_name, s3_folder, local_dir=None):
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = (
            obj.key
            if local_dir is None
            else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        )
        if not os.path.exists(os.path.dirname(target)):
            os.makedirs(os.path.dirname(target))
        if obj.key[-1] == "/":
            continue
        bucket.download_file(obj.key, target)
    return True


def download_records(record=False):
    download_s3_folder("sss-records-dev", "subscription", "/tmp/records")
Here is the unit test:
import os
import sys
import unittest
from pathlib import Path
import mock # type: ignore
boto3_mock = mock.MagicMock()
sys.modules["boto3"] = boto3_mock
from testing_lambda import ( # noqa: E402 isort:skip
testing_lambda,
)
class TestingLambdaTests(unittest.TestCase):
    def _test_message(self):
        result = testing_lambda.lambda_handler(None, context="")

    def test_package_for_promo(self):
        self._test_message()


if __name__ == "__main__":
    unittest.main()
I am getting this error when I run the unit test:
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/records/go_message.json.j2'
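For illustration, a hedged sketch of one way to make this pass (my own suggestion; the fixture content is a placeholder): since boto3 is replaced by a MagicMock, download_s3_folder never writes anything under /tmp/records, so the test can stub download_records to create the expected file before the handler reads it.
import os
import sys
import unittest

import mock  # type: ignore

boto3_mock = mock.MagicMock()
sys.modules["boto3"] = boto3_mock

from testing_lambda import testing_lambda  # noqa: E402 isort:skip


def _fake_download_records(record=False):
    # Stand-in for the S3 download: create the file the handler expects to read
    os.makedirs("/tmp/records", exist_ok=True)
    with open("/tmp/records/go_message.json.j2", "w") as f:
        f.write("{}")


class TestingLambdaTests(unittest.TestCase):
    @mock.patch.object(testing_lambda, "download_records", _fake_download_records)
    def test_package_for_promo(self):
        testing_lambda.lambda_handler(None, context="")


if __name__ == "__main__":
    unittest.main()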
I have a requirement wherein I have to call a Python file stored in a GCP Storage bucket from a driver Python file.
Here is the code (currently run on my local machine, but it will eventually be scheduled using Airflow on GCP):
from google.cloud import storage
import os
jsonkey = 'googl-cloudstorage-key.json'
storage_client = storage.Client.from_service_account_json(jsonkey)
def download_file_from_bucket(blob_name, file_path, bucket_name):
    print(f" download_file_from_bucket : blob_name file_path : {file_path}")
    try:
        bucket = storage_client.get_bucket(bucket_name)
        print(f" bucket name : {bucket.name}")
        blob = bucket.blob(blob_name)
        print(f" blob.name : {blob.name}, length of blob : {blob.download_as_string} ")
        with open(file_path, "w") as f:
            blob.download_blob_to_file(blob, f)
    except Exception as e:  # the except clause was cut off in the snippet; a minimal handler is assumed here
        print(f" download failed : {e}")


download_file_from_bucket('file1.py', os.path.join(os.getcwd(), 'new_file.py'), 'kk_bucket_1')
The file gets downloaded into new_file.py; however, the downloaded file is blank.
Here is the content of the file uploaded on GCP Storage bucket :
name : file1.py
import sys
print("file1.py, customer is ", sys.argv[1])
What do I need to do to make this work?
Please note:
I need to call this file from the driver file, which I plan to do using the subprocess module.
Here is the code for that :
import subprocess, os
cust = ['cust1', 'cust2']
for c in cust:
    print(" c -> ", c)
    fileToCall = os.path.join(os.getcwd(), 'file1_1.py')
    print(" file to call ", fileToCall)
    subprocess.run(['python', fileToCall, c])
I was able to get this to work using the following code:
def download_file_from_bucket(blob_name, file_path, bucket_name):
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(file_path)
    print("after writing to file")
I am using boto3 to do a multipart upload with TransferConfig:
Everything seems okay, as the program runs without errors:
import threading,boto3,re,os,sys
from boto3.s3.transfer import TransferConfig
#create resource
s3 = boto3.resource('s3',
                    region_name=region,
                    aws_access_key_id=ACCESS_KEY,
                    aws_secret_access_key=SECRET_KEY,
                    aws_session_token=SESSION_TOKEN)
BUCKET_NAME="my_bucket"
# the upload function
def multi_part_upload_with_s3():
    # Multipart upload
    config = TransferConfig(multipart_threshold=1024*25, max_concurrency=10000,
                            multipart_chunksize=1024*25, use_threads=True)
    #file_path = os.path.dirname(__file__)+'/largefile.pdf'
    file_path = "C:/Users/Documents/MyFile.out"
    key_path = 'MyDir/MySubDir/'
    s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                               #ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/pdf'},
                               Config=config, Callback=ProgressPercentage(file_path))


#Not really important, just tells you what percentage of your file has uploaded.
class ProgressPercentage(object):
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far/self._size)*100
            sys.stdout.write("\r%s %s/%s (%.2f%%)" % (self._filename,
                             self._seen_so_far, self._size, percentage))
            sys.stdout.flush()


# Now call the function
if __name__ == '__main__':
    multi_part_upload_with_s3()
Output:
C:/Users/Documents/MyFile.out 1295607/1295607.0 (100.00%)
So it appears to run without errors. However, when I look in S3, 'MySubDir' is created, but 'MyFile.out' is not in there. I thought the high max concurrency might be the culprit and that it would just take a while to join the parts back up, but I've waited for over 4 hours and nothing has shown up. I've also used other files and other uploading approaches, and the files would show up in 'MySubDir'.
You never specify the destination object name, only the key prefix, so the file is uploaded under the key 'MyDir/MySubDir/', which the S3 console displays as an empty folder. Try:
file_path = "C:/Users/Documents/MyFile.out"
key_path = 'MyDir/MySubDir/MyFile.out'
s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                           #ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/pdf'},
                           Config=config, Callback=ProgressPercentage(file_path))
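Alternatively (my own suggestion, not part of the answer), deriving the key from the local file name makes this mistake harder to repeat:
import os

file_path = "C:/Users/Documents/MyFile.out"
key_path = 'MyDir/MySubDir/' + os.path.basename(file_path)   # 'MyDir/MySubDir/MyFile.out'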
I've written a simple script to find the latest file in S3 buckets. It works correctly but is extremely slow. Obviously it has a lot of files to check, but if I use something like S3 Browser the file information appears almost immediately. Have I done something wrong, or is this just a limitation of boto3?
#! /usr/bin/python3
import argparse
import boto3
from datetime import datetime
from datetime import timezone
def build_argparser():
    parser = argparse.ArgumentParser(description='List S3 buckets by file date.')
    parser.add_argument('-p', '--profile', help='Profile to use')
    return parser


if __name__ == "__main__":
    parser = build_argparser()
    args = parser.parse_args()

    if args.profile == None:
        s3 = boto3.resource('s3')
    else:
        profile = boto3.session.Session(profile_name=args.profile)
        s3 = profile.resource('s3')

    for bucket in s3.buckets.all():
        print(bucket.name)
        latest_key = ""
        latest_datetime = datetime
        for object in bucket.objects.all():
            #print('\t' + str(object.key) + ': ' + str(object.last_modified))
            if latest_datetime == datetime or latest_datetime < object.last_modified:
                latest_key = object.key
                latest_datetime = object.last_modified
        print('\t' + str(latest_key) + ': ' + str(latest_datetime))
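For comparison, a hedged sketch that does the same scan with the low-level paginator (assuming default credentials). It issues the same number of ListObjectsV2 calls, since S3 returns at most 1,000 keys per request, so scanning every object in every bucket stays inherently slow; this mainly trims the per-object resource overhead.
import boto3

def latest_object(client, bucket_name):
    # Walk every page of the listing and keep the newest entry seen so far
    latest = None
    for page in client.get_paginator("list_objects_v2").paginate(Bucket=bucket_name):
        for obj in page.get("Contents", []):
            if latest is None or obj["LastModified"] > latest["LastModified"]:
                latest = obj
    return (latest["Key"], latest["LastModified"]) if latest else ("", None)

if __name__ == "__main__":
    client = boto3.client("s3")
    for bucket in client.list_buckets()["Buckets"]:
        print(bucket["Name"])
        key, modified = latest_object(client, bucket["Name"])
        print('\t' + str(key) + ': ' + str(modified))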