Get progress callback in aws boto3 uploads - python-3.x

There's a great question and answer for the original boto uploads here:
How to upload a file to directory in S3 bucket using boto
which has a callback:
k = Key(bucket)
k.key = 'my test file'
k.set_contents_from_filename(testfile,
                             cb=percent_cb, num_cb=10)
While I see the boto3 package takes a callback:
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_fileobj
I don't see the equivalent of the num_cb argument. How can I get a progress meter for upload_fileobj using boto3?
s3.upload_fileobj(data, 'mybucket', 'mykey')

If you don't need to limit the number of callback invocations (and there is no way to do that with upload_fileobj), here are two options.
1. Show a percentage
import os
import boto3


class Test:
    def __init__(self):
        self.total = 0
        self.uploaded = 0
        self.s3 = boto3.client('s3')

    def upload_callback(self, size):
        if self.total == 0:
            return
        self.uploaded += size
        print("{} %".format(int(self.uploaded / self.total * 100)))

    def upload(self, bucket, key, file):
        self.total = os.stat(file).st_size
        with open(file, 'rb') as data:
            self.s3.upload_fileobj(
                data, bucket, key, Callback=self.upload_callback)
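A minimal usage sketch (the bucket, key, and local path are placeholders; only the class above comes from the answer):
uploader = Test()
uploader.upload('mybucket', 'mykey', '/path/to/local/file')  # prints cumulative percentages as chunks upload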
2. Use the progressbar library
import os
import boto3
import progressbar


class Test2:
    def __init__(self):
        self.s3 = boto3.client('s3')

    def upload_callback(self, size):
        self.pg.update(self.pg.currval + size)

    def upload(self, bucket, key, file):
        self.pg = progressbar.progressbar.ProgressBar(
            maxval=os.stat(file).st_size)
        self.pg.start()
        with open(file, 'rb') as data:
            self.s3.upload_fileobj(
                data, bucket, key, Callback=self.upload_callback)
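Another option, assuming the tqdm package is available (my addition, not part of the original answer): tqdm's update() takes a byte increment, so it can be passed directly as the Callback:
import os
import boto3
from tqdm import tqdm  # assumption: tqdm is installed

s3 = boto3.client('s3')
file = '/path/to/local/file'  # placeholder path
with tqdm(total=os.stat(file).st_size, unit='B', unit_scale=True) as bar:
    with open(file, 'rb') as data:
        s3.upload_fileobj(data, 'mybucket', 'mykey', Callback=bar.update)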

Related

My AWS Lambda in Python downloads a file from S3 and reads that file. Now I need to write a unit test that mocks the S3 call

I have an AWS Lambda written in Python. The Lambda downloads a file from S3 to the folder /tmp/records and then reads that file. Now I need to write a unit test for it, and I need to mock the S3 call. I am wondering how to do that.
Here is my Lambda:
import os
import boto3

s3 = boto3.resource("s3")


def lambda_handler(event, context):
    try:
        download_records()
        with open("/tmp/records/" + "go_message.json.j2") as f:
            record = f.read()
    except ValueError:
        return "record could not be found"


def download_s3_folder(bucket_name, s3_folder, local_dir=None):
    bucket = s3.Bucket(bucket_name)
    for obj in bucket.objects.filter(Prefix=s3_folder):
        target = (
            obj.key
            if local_dir is None
            else os.path.join(local_dir, os.path.relpath(obj.key, s3_folder))
        )
        if not os.path.exists(os.path.dirname(target)):
            os.makedirs(os.path.dirname(target))
        if obj.key[-1] == "/":
            continue
        bucket.download_file(obj.key, target)
    return True


def download_records(record=False):
    download_s3_folder("sss-records-dev", "subscription", "/tmp/records")
Here is the unit test:
import os
import sys
import unittest
from pathlib import Path

import mock  # type: ignore

boto3_mock = mock.MagicMock()
sys.modules["boto3"] = boto3_mock

from testing_lambda import (  # noqa: E402 isort:skip
    testing_lambda,
)


class TestingLambdaTests(unittest.TestCase):
    def _test_message(self):
        result = testing_lambda.lambda_handler(None, context="")

    def test_package_for_promo(self):
        self._test_message()


if __name__ == "__main__":
    unittest.main()
I am getting this error when I run unittest:
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/records/go_message.json.j2'
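One way to get past this error, sketched under my own assumptions (not from the original post): replace download_records in the imported module with a fake that writes the expected fixture file to /tmp/records, so no S3 access is needed. The fixture content is made up, and the snippet reuses the mock and testing_lambda imports from the test above.
import os


def _fake_download_records(record=False):
    # Write the file lambda_handler expects instead of downloading from S3.
    os.makedirs("/tmp/records", exist_ok=True)
    with open("/tmp/records/go_message.json.j2", "w") as f:
        f.write("{}")  # hypothetical fixture content


class TestingLambdaTests(unittest.TestCase):
    def test_package_for_promo(self):
        with mock.patch.object(testing_lambda, "download_records",
                               _fake_download_records):
            testing_lambda.lambda_handler(None, context="")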

How to get multiple inputs (JSON files for me) in AWS Lambda from the same user's S3 bucket?

I have attached my hardcoded Python program, which merges two JSON files from S3 storage manually. Can someone please tell me how to get multiple input files (JSON files) from the S3 bucket automatically? I know we can do this in plain Python by globbing *.json in the program's directory, but I don't understand how to do the same in AWS Lambda.
Python Code:
import glob
import json

result = []
for f in glob.glob("*.json"):
    with open(f, "r") as infile:
        result += json.load(infile)

with open("merge.json", "w") as outfile:
    json.dump(result, outfile)
In Lambda I am able to do it for two files; can someone please suggest how to do the same (taking all JSON files from S3 automatically) in Lambda? Thanks in advance.
import boto3
import json

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'


def lambda_handler(event, context):
    object_key = "sample1.json"  # replace object key
    file_content = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key)["Body"].read()
    print(file_content)

    object_key2 = "sample2.json"  # replace object key
    file_content2 = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key2)["Body"].read()
    print(file_content2)

    result = []
    result += json.loads(file_content)
    result += json.loads(file_content2)
    print(result)
I have followed the syntax from the documentation, but I still get the timeout error.
import boto3

# Create a client
client = boto3.client('s3', region_name='us-east-1')

# Create a reusable Paginator
paginator = client.get_paginator('list_objects')

# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

for page in page_iterator:
    print(page['Contents'])
Getting a timeout error:
import boto3

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'


def iterate_bucket_items(S3_BUCKET):
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=S3_BUCKET)
    for page in page_iterator:
        if page['KeyCount'] > 0:
            for item in page['Contents']:
                yield item


for i in iterate_bucket_items(S3_BUCKET):
    print(i)
I have solved the issue with the help of @JeremyThompson; I will attach my final code here:
import json
import boto3
import glob


def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket-for-json-files')

    # Create a client
    client = boto3.client('s3', region_name='us-east-1')
    # Create a reusable Paginator
    paginator = client.get_paginator('list_objects')
    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

    result = []
    for page in page_iterator:
        result += page['Contents']

    s3 = boto3.client('s3')
    bucket = 'bucket-for-json-files'
    merge = []
    lst = []
    for i in result:
        cmd = i['Key']
        print(cmd)
The above code prints the key of each JSON file available in the user's bucket.
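To take all JSON files from the bucket automatically, a sketch of my own (not from the post): it reuses the bucket name above and assumes, like the glob version, that every file holds a JSON list, with a hypothetical merge.json output key.
import json
import boto3

s3_client = boto3.client('s3')
S3_BUCKET = 'bucket-for-json-files'
MERGED_KEY = 'merge.json'  # hypothetical output key


def lambda_handler(event, context):
    paginator = s3_client.get_paginator('list_objects_v2')
    result = []
    for page in paginator.paginate(Bucket=S3_BUCKET):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('.json') or key == MERGED_KEY:
                continue  # skip non-JSON objects and a previously written merge
            body = s3_client.get_object(Bucket=S3_BUCKET, Key=key)['Body'].read()
            result += json.loads(body)  # assumes each file holds a JSON list
    s3_client.put_object(Bucket=S3_BUCKET, Key=MERGED_KEY,
                         Body=json.dumps(result).encode('utf-8'))
    return {'merged_files_key': MERGED_KEY}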

Converting docx to pdf using LibreOffice library on AWS lambda but unable to convert when trying with different font family other than times roman

I am converting DOCX to PDF using the LibreOffice library on AWS Lambda, but I am unable to convert when using a font family other than "Times Roman". Please recommend another library instead of installing new fonts for a particular font family.
Code:
import os
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli

libre_office_install_dir = '/tmp/instdir'


def load_libre_office():
    if os.path.exists(libre_office_install_dir) and os.path.isdir(libre_office_install_dir):
        print('We have a cached copy of LibreOffice, skipping extraction')
    else:
        print('No cached copy of LibreOffice, extracting tar stream from Brotli file.')
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            d = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(d.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching.')
        with tarfile.open(fileobj=buffer) as tar:
            tar.extractall('/tmp')
        print('Done caching LibreOffice!')
    return f'{libre_office_install_dir}/program/soffice.bin'


def download_from_s3(bucket, key, download_path):
    s3 = boto3.client("s3")
    s3.download_file(bucket, key, download_path)


def upload_to_s3(file_path, bucket, key):
    s3 = boto3.client("s3")
    s3.upload_file(file_path, bucket, key)


def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True


def lambda_handler(event, context):
    bucket = "xxxx"
    key = "xxxx/xxxx/xxxx/xxxx/SampleDoc.docx"
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    download_from_s3(bucket, key, download_path)
    soffice_path = load_libre_office()
    is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
    if is_converted:
        file_name, _ = os.path.splitext(base_name)
        upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
        return {"response": "file converted to PDF and available at same S3 location of input key"}
    else:
        return {"response": "cannot convert this document to PDF"}
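I don't know of a drop-in replacement library that converts DOCX to PDF on Lambda without LibreOffice, so as a hedged alternative (an assumption on my part, not a confirmed fix for this post): bundle the required .ttf files with the function and copy them somewhere fontconfig will look, e.g. $HOME/.fonts with HOME pointed at /tmp, before running soffice.
import glob
import os
import shutil


def install_bundled_fonts():
    # Assumption: the needed .ttf files ship with the deployment package in ./fonts.
    os.environ['HOME'] = '/tmp'   # LibreOffice/fontconfig look under $HOME
    font_dir = '/tmp/.fonts'
    os.makedirs(font_dir, exist_ok=True)
    for font in glob.glob(os.path.join(os.path.dirname(__file__), 'fonts', '*.ttf')):
        shutil.copy(font, font_dir)
lambda_handler would then call install_bundled_fonts() before convert_word_to_pdf().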

Uploaded file does not show in S3

I am using boto3 to do a multipart upload with TransferConfig.
Everything seems okay, as the program runs without errors:
import threading, boto3, re, os, sys
from boto3.s3.transfer import TransferConfig

# create resource
s3 = boto3.resource('s3',
                    region_name=region,
                    aws_access_key_id=ACCESS_KEY,
                    aws_secret_access_key=SECRET_KEY,
                    aws_session_token=SESSION_TOKEN)

BUCKET_NAME = "my_bucket"


# the upload function
def multi_part_upload_with_s3():
    # Multipart upload
    config = TransferConfig(multipart_threshold=1024*25, max_concurrency=10000,
                            multipart_chunksize=1024*25, use_threads=True)
    # file_path = os.path.dirname(__file__) + '/largefile.pdf'
    file_path = "C:/Users/Documents/MyFile.out"
    key_path = 'MyDir/MySubDir/'
    s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                               # ExtraArgs={'ACL': 'public-read',
                               #            'ContentType': 'text/pdf'},
                               Config=config, Callback=ProgressPercentage(file_path))


# Not really important, just tells you what percentage of your file has uploaded.
class ProgressPercentage(object):
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write("\r%s %s/%s (%.2f%%)" % (self._filename,
                             self._seen_so_far, self._size, percentage))
            sys.stdout.flush()


# Now call the function
if __name__ == '__main__':
    multi_part_upload_with_s3()
Output:
C:/Users/Documents/MyFile.out 1295607/1295607.0 (100.00%)
So it appears to run without errors. However, when I look in S3, 'MySubDir' is created, but 'MyFile.out' is not in there. I thought maybe having the max concurrency in S3 was the culprit thinking it would take a while to join it back up, but I've waited for over 4 hours and nothing has shown up. I've also used other files and other uploading approaches and files would show up in 'MySubDir'.
You never specify the destination file name, only the path. Try:
file_path = "C:/Users/Documents/MyFile.out"
key_path = 'MyDir/MySubDir/MyFile.out'
s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                           # ExtraArgs={'ACL': 'public-read',
                           #            'ContentType': 'text/pdf'},
                           Config=config, Callback=ProgressPercentage(file_path))
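A small variation on the same fix (my addition, not part of the answer): derive the key from the local file name, so the object name cannot be left off.
import os

file_path = "C:/Users/Documents/MyFile.out"
key_path = 'MyDir/MySubDir/' + os.path.basename(file_path)  # -> 'MyDir/MySubDir/MyFile.out'
s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                           Config=config, Callback=ProgressPercentage(file_path))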

Uploading file to an s3 bucket path longer than 63 characters

I am writing a lambda function to upload a file from one s3 bucket to another, when the former is updated. I am running into an invalid parameter exception when uploading a file to the s3 path, which is longer than 63 characters. Is there a way to get around this?
import boto3
import datetime
import sys
import os
from os import getenv
import json
import csv

REPORT_BUCKET = getenv('REPORT_BUCKET', 'origin-bucket-name')
now = datetime.datetime.now() - datetime.timedelta(days=1)
today = now.strftime("%m/%d/%y")
today_iso = now.strftime('%Y-%m-%d')


def read_attachment(bucket, key):
    print(f'Bucket: {bucket}, Key: {key}')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, key)
    return obj.get()['Body'].read()


def upload_file(data, new_file, bucket_name):
    temp = '/tmp/tmp-{}.csv'.format(today_iso)
    with open(temp, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(data)
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    bucket.delete_objects(
        Delete={
            'Objects': [
                {'Key': new_file},
            ]
        }
    )
    bucket.upload_file(temp, new_file)
    bucket.Object(new_file).Acl().put(ACL='authenticated-read')
    os.remove(temp)
    print(bucket)
    print('Uploaded: %s/%s' % (bucket_name, new_file))


def lambda_handler(event, context):
    data = read_attachment(REPORT_BUCKET, f'{today_iso}.csv')
    attachment = data.split()
    arr = []
    arr2 = []
    for item in range(len(attachment)):
        attachment[item] = attachment[item].decode('utf-8')
        arr.append(attachment[item].split(','))
        arr2.append(arr[item])
    upload_file(arr2, f'{today_iso}.csv',
                'accountname-useast1-dl-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/')
    return True


if __name__ == '__main__':
    lambda_handler({}, None)
In S3, the bucket name can be at most 63 characters long (https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html).
In your code you are calling:
upload_file(arr2, f'{today_iso}.csv', 'accountname-useast1-l-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/')
which means that you are passing
accountname-useast1-l-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/
as the bucket name. This parameter is longer than 63 characters, which is why it throws an error.
To resolve this, pass a shorter, valid bucket name and move the rest of the path into the object key (name your actual object whatever you like).
For example:
bucketname: accountname-useast1-l-common-0022-in
object name: sub-folder/org=inc/f=csv/v=1.0/staging/
so the line of code that needs to be changed is:
upload_file(arr2, f'sub-folder/org=inc/f=csv/v=1.0/staging/{today_iso}.csv', 'accountname-useast1-dl-common-0022-in')
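Equivalently (my addition, not from the answer), the combined path can be split into bucket and key before calling upload_file:
full_path = ('accountname-useast1-dl-common-0022-in/'
             f'sub-folder/org=inc/f=csv/v=1.0/staging/{today_iso}.csv')
bucket_name, _, key = full_path.partition('/')  # first segment is the bucket
upload_file(arr2, key, bucket_name)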
