Python AWS S3: download S3 files and save them in a ZIP - python-3.x

I have a bunch of files stored on AWS S3. I want to download those files into a single ZIP archive. Below is my code.
import boto3
import zipfile
from io import StringIO, BytesIO

s3 = boto3.client('s3')
s = BytesIO()
zf = zipfile.ZipFile(s, 'w')
file_name = '%s-files-%s.zip' % (student.get_full_name(), str(datetime.datetime.now()))
files_key_list = ['file1.png', 'file3.png']
for f in files_key_list:
    data = s3.download_file(settings.AWS_STORAGE_BUCKET_NAME, f, f)
    zf.write(data)
zf.close()
resp = HttpResponse(s.getvalue(), content_type="application/x-zip-compressed")
resp['Content-Disposition'] = 'attachment; filename=%s' % file_name
return resp
Error
stat: can't specify None for path argument

Try this: use boto3 get_object instead of download_file. download_file saves the object to a local path and returns None, so zf.write(data) receives None and fails with the stat error; get_object returns the object's body in memory, which you can write straight into the ZIP with writestr.
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.get_object
import boto3
import zipfile
from io import BytesIO

# Inside the Django view: `student`, `settings`, `HttpResponse` and `datetime`
# come from the surrounding project, as in the question.
s3 = boto3.client('s3')
s = BytesIO()
zf = zipfile.ZipFile(s, 'w')
file_name = '%s-files-%s.zip' % (student.get_full_name(), str(datetime.datetime.now()))
files_key_list = ['file1.png', 'file3.png']
for f in files_key_list:
    # get_object returns the object body in memory instead of writing it to disk
    data = s3.get_object(Bucket=settings.AWS_STORAGE_BUCKET_NAME, Key=f)
    zf.writestr(f, data.get('Body').read())
zf.close()
resp = HttpResponse(s.getvalue(), content_type="application/x-zip-compressed")
resp['Content-Disposition'] = 'attachment; filename=%s' % file_name
return resp

I had a similar requirement, which the code below satisfies. It downloads every object in a bucket and writes them into a single local ZIP file.
ref: zipfile documentation
import sys
import boto3
import zipfile
import botocore
import datetime

if len(sys.argv) > 1:
    bucket_name = sys.argv[1]
else:
    print("Please specify a bucket name to list.")
    sys.exit()

s3 = boto3.client('s3')
s3res = boto3.resource('s3')

timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
file_name = '%s-files-%s.zip' % (bucket_name, timestamp)
print(f"Saving into zip {file_name}")

zf = zipfile.ZipFile(file_name, 'w')
bucket = s3res.Bucket(bucket_name)
try:
    for s3_object in bucket.objects.all():
        print("adding ", s3_object)
        data = s3.get_object(Bucket=bucket_name, Key=s3_object.key)
        zf.writestr(s3_object.key, data.get('Body').read())
except botocore.exceptions.ClientError as resperror:
    print("Error - does bucket exist?", str(resperror))
    print("Please remove possible empty zip:", file_name)
zf.close()

Related

How to get multiple inputs (JSON files for me) in AWS Lambda from the same user's S3 bucket?

I have attached my hardcoded Python program, which merges two JSON files from S3 storage by naming each key manually. Can someone please tell me how to get multiple input files (JSON files) from the S3 bucket automatically? I know I can do this locally in Python by globbing *.json in the program's directory, but I don't understand how to do the same in AWS Lambda.
Python Code:
import glob
import json

result = []
for f in glob.glob("*.json"):
    with open(f, "r") as infile:
        result += json.load(infile)

with open("merge.json", "w") as outfile:
    json.dump(result, outfile)
In Lambda I am only able to do this for two files; can someone please suggest how to take all JSON files from the S3 bucket automatically? Thanks in advance.
import json

import boto3

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def lambda_handler(event, context):
    object_key = "sample1.json"  # replace object key
    file_content = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key)["Body"].read()
    print(file_content)

    object_key2 = "sample2.json"  # replace object key
    file_content2 = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key2)["Body"].read()
    print(file_content2)

    result = []
    result += json.loads(file_content)
    result += json.loads(file_content2)
    print(result)
I have followed the syntax from the documentation but I still get a timeout error.
import boto3

# Create a client
client = boto3.client('s3', region_name='us-east-1')

# Create a reusable Paginator
paginator = client.get_paginator('list_objects')

# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

for page in page_iterator:
    print(page['Contents'])
I also get a timeout error with this:
import boto3

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def iterate_bucket_items(bucket):
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket)
    for page in page_iterator:
        if page['KeyCount'] > 0:
            for item in page['Contents']:
                yield item

for i in iterate_bucket_items(S3_BUCKET):
    print(i)
I have solved the issue with the help of @JeremyThompson; my final code is attached here:
import json

import boto3

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket-for-json-files')

    # Create a client
    client = boto3.client('s3', region_name='us-east-1')

    # Create a reusable Paginator
    paginator = client.get_paginator('list_objects')

    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

    result = []
    for page in page_iterator:
        result += page['Contents']

    s3 = boto3.client('s3')
    bucket = 'bucket-for-json-files'
    merge = []
    lst = []
    for i in result:
        cmd = i['Key']
        print(cmd)
The above code prints the key of each JSON file available in the user's bucket.
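To go one step further and actually merge every JSON file in the bucket (the original goal), the key listing above can be combined with get_object. The sketch below is a minimal illustration rather than the poster's final code; the bucket name bucket-for-json-files is taken from the question, the output key merge.json is hypothetical, and it assumes each object ending in .json contains a JSON list and that the merged result fits in Lambda memory.

import json

import boto3

S3_BUCKET = 'bucket-for-json-files'  # bucket name from the question
OUTPUT_KEY = 'merge.json'            # hypothetical output key

s3_client = boto3.client('s3')

def lambda_handler(event, context):
    paginator = s3_client.get_paginator('list_objects_v2')
    result = []
    # Walk every page of the listing so buckets with more than 1000 keys work too
    for page in paginator.paginate(Bucket=S3_BUCKET):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('.json') or key == OUTPUT_KEY:
                continue  # skip non-JSON objects and a previously written merge file
            body = s3_client.get_object(Bucket=S3_BUCKET, Key=key)['Body'].read()
            result += json.loads(body)  # assumes each file holds a JSON list
    # Write the merged list back to the same bucket
    s3_client.put_object(Bucket=S3_BUCKET, Key=OUTPUT_KEY,
                         Body=json.dumps(result).encode('utf-8'))
    return {'merged_files_key': OUTPUT_KEY, 'items': len(result)}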

Converting docx to pdf using the LibreOffice library on AWS Lambda, but unable to convert with any font family other than Times Roman

I am converting docx to pdf using the LibreOffice library on AWS Lambda, but I am unable to convert documents that use a font family other than "Times Roman". Please recommend another library, rather than installing new fonts for a particular font family.
Code:
import os
import subprocess
import tarfile
from io import BytesIO

import boto3
import brotli

libre_office_install_dir = '/tmp/instdir'

def load_libre_office():
    if os.path.exists(libre_office_install_dir) and os.path.isdir(libre_office_install_dir):
        print('We have a cached copy of LibreOffice, skipping extraction')
    else:
        print('No cached copy of LibreOffice, extracting tar stream from Brotli file.')
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            d = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(d.decompress(chunk))
                if len(chunk) < 1024:
                    break
        buffer.seek(0)
        print('Extracting tar stream to /tmp for caching.')
        with tarfile.open(fileobj=buffer) as tar:
            tar.extractall('/tmp')
        print('Done caching LibreOffice!')
    return f'{libre_office_install_dir}/program/soffice.bin'

def download_from_s3(bucket, key, download_path):
    s3 = boto3.client("s3")
    s3.download_file(bucket, key, download_path)

def upload_to_s3(file_path, bucket, key):
    s3 = boto3.client("s3")
    s3.upload_file(file_path, bucket, key)

def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = (f"{soffice_path} --headless --norestore --invisible --nodefault "
                f"--nofirststartwizard --nolockcheck --nologo "
                f"--convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}")
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        # Retry once; the first headless start occasionally fails
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True

def lambda_handler(event, context):
    bucket = "xxxx"
    key = "xxxx/xxxx/xxxx/xxxx/SampleDoc.docx"
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"

    download_from_s3(bucket, key, download_path)
    soffice_path = load_libre_office()
    is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)

    if is_converted:
        file_name, _ = os.path.splitext(base_name)
        upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
        return {"response": "file converted to PDF and available at same S3 location of input key"}
    else:
        return {"response": "cannot convert this document to PDF"}

How to Create a tar file containing all the files in a directory

I have been trying to figure out how to generate a tar file from a directory of files. I have this code:
tar = tarfile.open('/tmp/' + newDate + '.tar', 'w')
for fname in get_matching_s3_keys(bucket=agtBucket, prefix=key, suffix='.log'):
    print(fname)
    file_obj = s3object.Object(agtBucket, fname)
    file_content = file_obj.get()['Body'].read()
    tar.add(file_content)
tar.close()
But I get this error when I try to add file_content to tar
"errorMessage": "a bytes-like object is required, not 'str'"
I hope someone can please help me correct what I have wrong.
Here is the answer:
import boto3
import tarfile
import os.path

s3Client = boto3.client('s3')
s3object = boto3.resource('s3')

def lambda_handler(event, context):
    agtBucket = "angularbuildbucket"
    key = ""
    tar = tarfile.open('/tmp/example.tar', 'w')
    source_dir = "/tmp/"

    for fname in get_matching_s3_keys(bucket=agtBucket, prefix=key, suffix='.log'):
        print(fname)
        file_obj = s3object.Object(agtBucket, fname)
        # file_content = file_obj.get()['Body'].read()
        # tar.add(file_content)
        s3object.Bucket(agtBucket).download_file(fname, '/tmp/' + fname)

    tar.add(source_dir, arcname=os.path.basename(source_dir))
    tar.close()
    s3object.meta.client.upload_file(source_dir + "example.tar", agtBucket, 'example.tar')

def get_matching_s3_keys(bucket, prefix='', suffix=''):
    """
    Generate the keys in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :param suffix: Only fetch keys that end with this suffix (optional).
    """
    kwargs = {'Bucket': bucket, 'Prefix': prefix}
    while True:
        resp = s3Client.list_objects_v2(**kwargs)
        for obj in resp['Contents']:
            key = obj['Key']
            if key.endswith(suffix):
                yield key
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break
You are getting this error because tar.add() expects a filename, i.e. the path of a file on disk to add to the archive, but file_obj.get()['Body'].read() returns the object's content as bytes rather than a path. Either download each object to /tmp first and add the files from there (as above), or wrap the bytes in a TarInfo and use tar.addfile(), as in the sketch below.
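Here is a minimal sketch of that in-memory variant, adding each S3 object to the tar without writing it to /tmp first. It assumes the get_matching_s3_keys() generator defined in the answer above; the bucket, prefix, suffix and output path are placeholders, and this is an illustration rather than the original answer's approach.

import io
import tarfile

import boto3

s3Client = boto3.client('s3')

def tar_s3_objects(bucket, prefix='', suffix='.log', tar_path='/tmp/example.tar'):
    """Stream matching S3 objects straight into a local tar archive."""
    with tarfile.open(tar_path, 'w') as tar:
        for key in get_matching_s3_keys(bucket=bucket, prefix=prefix, suffix=suffix):
            body = s3Client.get_object(Bucket=bucket, Key=key)['Body'].read()
            info = tarfile.TarInfo(name=key)   # archive member name = S3 key
            info.size = len(body)              # size must be set before addfile()
            tar.addfile(info, io.BytesIO(body))
    return tar_path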

Uploading file to an s3 bucket path longer than 63 characters

I am writing a Lambda function to upload a file from one S3 bucket to another when the former is updated. I am running into an invalid parameter exception when uploading a file to an S3 path longer than 63 characters. Is there a way to get around this?
import csv
import datetime
import os
from os import getenv

import boto3

REPORT_BUCKET = getenv('REPORT_BUCKET', 'origin-bucket-name')

now = datetime.datetime.now() - datetime.timedelta(days=1)
today = now.strftime("%m/%d/%y")
today_iso = now.strftime('%Y-%m-%d')

def read_attachment(bucket, key):
    print(f'Bucket: {bucket}, Key: {key}')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, key)
    return obj.get()['Body'].read()

def upload_file(data, new_file, bucket_name):
    temp = '/tmp/tmp-{}.csv'.format(today_iso)
    with open(temp, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerows(data)

    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)
    bucket.delete_objects(
        Delete={
            'Objects': [
                {'Key': new_file},
            ]
        }
    )
    bucket.upload_file(temp, new_file)
    bucket.Object(new_file).Acl().put(ACL='authenticated-read')
    os.remove(temp)
    print(bucket)
    print('Uploaded: %s/%s' % (bucket_name, new_file))

def lambda_handler(event, context):
    data = read_attachment(REPORT_BUCKET, f'{today_iso}.csv')
    attachment = data.split()
    arr = []
    arr2 = []
    for item in range(len(attachment)):
        attachment[item] = attachment[item].decode('utf-8')
        arr.append(attachment[item].split(','))
        arr2.append(arr[item])
    upload_file(arr2, f'{today_iso}.csv',
                'accountname-useast1-dl-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/')
    return True

if __name__ == '__main__':
    lambda_handler({}, None)
In S3, a bucket name can be at most 63 characters long (https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html).
In your code you are calling:
upload_file(arr2, f'{today_iso}.csv', 'accountname-useast1-dl-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/')
which means that you are passing
accountname-useast1-dl-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/
as the bucket name. That string is longer than 63 characters, which is why it throws an error.
To resolve this, pass the actual (shorter) bucket name as the bucket argument and move the rest of the path into the object key.
For example:
bucket name: accountname-useast1-dl-common-0022-in
object key: sub-folder/org=inc/f=csv/v=1.0/staging/<file name>
So the line of code that needs to change is:
upload_file(arr2, f'sub-folder/org=inc/f=csv/v=1.0/staging/{today_iso}.csv', 'accountname-useast1-dl-common-0022-in')
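If the bucket and the key prefix arrive as a single combined string elsewhere in the pipeline, a small helper can split them before calling upload_file. This is only an illustrative sketch (split_bucket_and_prefix is not part of the original code); it assumes the first path segment is always the bucket name and reuses arr2, today_iso and upload_file from the question.

def split_bucket_and_prefix(path):
    """Split 'bucket-name/some/prefix/' into ('bucket-name', 'some/prefix/')."""
    bucket, _, prefix = path.partition('/')
    return bucket, prefix

# Hypothetical usage with the combined path from the question:
bucket, prefix = split_bucket_and_prefix(
    'accountname-useast1-dl-common-0022-in/sub-folder/org=inc/f=csv/v=1.0/staging/')
upload_file(arr2, prefix + f'{today_iso}.csv', bucket)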

Boto3 script to query S3 bucket file dates is slow

I've written a simple script to find the latest file in each of my S3 buckets. It works correctly but is extremely slow. Obviously it has a lot of files to check, but if I use something like S3 Browser the file information appears almost immediately. Have I done something wrong, or is this just a limitation of boto3?
#! /usr/bin/python3
import argparse
import boto3
from datetime import datetime
from datetime import timezone

def build_argparser():
    parser = argparse.ArgumentParser(description='List S3 buckets by file date.')
    parser.add_argument('-p', '--profile', help='Profile to use')
    return parser

if __name__ == "__main__":
    parser = build_argparser()
    args = parser.parse_args()

    if args.profile is None:
        s3 = boto3.resource('s3')
    else:
        profile = boto3.session.Session(profile_name=args.profile)
        s3 = profile.resource('s3')

    for bucket in s3.buckets.all():
        print(bucket.name)
        latest_key = ""
        latest_datetime = datetime
        for object in bucket.objects.all():
            # print('\t' + str(object.key) + ': ' + str(object.last_modified))
            if latest_datetime == datetime or latest_datetime < object.last_modified:
                latest_key = object.key
                latest_datetime = object.last_modified
        print('\t' + str(latest_key) + ': ' + str(latest_datetime))
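For what it's worth, the listing itself is the unavoidable cost here: S3 has no "newest object" query, so every key's metadata has to be fetched, 1,000 keys per request, while tools like S3 Browser presumably feel instant because they list one folder page at a time. Below is a minimal sketch of the same logic using the low-level list_objects_v2 paginator, which skips the resource-layer object wrappers; treat it as an illustration under those assumptions rather than a guaranteed speed-up.

import boto3

def latest_object(bucket_name, s3_client=None):
    """Return (key, last_modified) of the newest object, or (None, None) if the bucket is empty."""
    client = s3_client or boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    latest = None
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            # 'LastModified' is already part of the list response, so no extra API calls per object
            if latest is None or obj['LastModified'] > latest['LastModified']:
                latest = obj
    if latest is None:
        return None, None
    return latest['Key'], latest['LastModified']

if __name__ == "__main__":
    client = boto3.client('s3')
    for bucket in client.list_buckets()['Buckets']:
        key, modified = latest_object(bucket['Name'], client)
        print(bucket['Name'])
        print('\t' + str(key) + ': ' + str(modified))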
