Reading a PDF in AWS Lambda using PyMuPDF - python-3.x

I am trying to read a PDF in AWS Lambda. The PDF is stored in an S3 bucket. I need to extract the text from the PDF and translate it into the required language. I am able to run my code in my notebook, but when I run it on Lambda I get this error message in my CloudWatch logs: task timed out after 3.01 seconds.
import fitz
import base64
from io import BytesIO
from PIL import Image
import boto3

def lambda_handler(event, context):
    s3 = boto3.client('s3')
    client_textract = boto3.client('textract')
    translate_client = boto3.client('translate')
    try:
        print("Inside handler")
        s3_bucket = "my_bucket"
        pdf_file_name = 'sample.pdf'
        pdf_file = s3.get_object(Bucket=s3_bucket, Key=pdf_file_name)
        file_content = pdf_file['Body'].read()
        print("Before reading ")
        with fitz.open(stream=file_content, filetype="pdf") as doc:

Try extending the timeout, which is set to 3 seconds by default.
If that does not help, try increasing the allocated memory.
Also, consider moving
s3 = boto3.client('s3')
client_textract = boto3.client('textract')
translate_client = boto3.client('translate')
out of your handler and placing it right after the imports. The clients are then created once and reused across warm invocations, so the function runs more efficiently when invoked frequently.
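For illustration, a minimal sketch of that layout (my own, assuming the bucket/key names from the question and that extracting the page text is the goal):

import boto3
import fitz  # PyMuPDF

# Clients created at module level are initialised once and reused across warm invocations
s3 = boto3.client('s3')
client_textract = boto3.client('textract')
translate_client = boto3.client('translate')

def lambda_handler(event, context):
    # Bucket and key taken from the question; adjust as needed
    pdf_file = s3.get_object(Bucket="my_bucket", Key="sample.pdf")
    file_content = pdf_file['Body'].read()
    with fitz.open(stream=file_content, filetype="pdf") as doc:
        text = "".join(page.get_text() for page in doc)
    return {"characters": len(text)}

Keep in mind that the client placement only helps once the timeout and memory are raised in the function configuration; the 3-second default is usually too short for PDF processing.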

Related

How to get multiple inputs (JSON files for me) in AWS Lambda from the same user's S3 bucket?

I have attached my hardcoded Python program, which merges two JSON files stored in S3 and has to be run manually. Can someone please tell me how to get multiple input files (JSON files) from the S3 bucket automatically? I know this can be done locally in Python by globbing *.json in the program's directory, but I don't understand how to do the same in AWS Lambda.
Python Code:
import glob
import json

result = []
for f in glob.glob("*.json"):
    with open(f, "r") as infile:
        result += json.load(infile)

with open("merge.json", "w") as outfile:
    json.dump(result, outfile)
In Lambda I am able to do this for two files; can someone please suggest how to do the same (i.e. take all JSON files from S3 automatically) in Lambda? Thanks in advance.
import boto3
import json

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def lambda_handler(event, context):
    object_key = "sample1.json"  # replace object key
    file_content = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key)["Body"].read()
    print(file_content)

    object_key2 = "sample2.json"  # replace object key
    file_content2 = s3_client.get_object(Bucket=S3_BUCKET, Key=object_key2)["Body"].read()
    print(file_content2)

    result = []
    result += json.loads(file_content)
    result += json.loads(file_content2)
    print(result)
I have followed the syntax from the documentation, but I still get the timeout error.
import boto3

# Create a client
client = boto3.client('s3', region_name='us-east-1')
# Create a reusable Paginator
paginator = client.get_paginator('list_objects')
# Create a PageIterator from the Paginator
page_iterator = paginator.paginate(Bucket='bucket-for-json-files')
for page in page_iterator:
    print(page['Contents'])
Getting a timeout error:
import boto3

s3_client = boto3.client("s3")
S3_BUCKET = 'bucket-for-json-files'

def iterate_bucket_items(bucket):
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    page_iterator = paginator.paginate(Bucket=bucket)
    for page in page_iterator:
        if page['KeyCount'] > 0:
            for item in page['Contents']:
                yield item

for i in iterate_bucket_items(S3_BUCKET):
    print(i)
I have solved the issue with the help of @JeremyThompson; my final code is attached here:
import json
import boto3
import glob

def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('bucket-for-json-files')

    # Create a client
    client = boto3.client('s3', region_name='us-east-1')
    # Create a reusable Paginator
    paginator = client.get_paginator('list_objects')
    # Create a PageIterator from the Paginator
    page_iterator = paginator.paginate(Bucket='bucket-for-json-files')

    result = []
    for page in page_iterator:
        result += page['Contents']

    s3 = boto3.client('s3')
    bucket = 'bucket-for-json-files'
    merge = []
    lst = []
    for i in result:
        cmd = i['Key']
        print(cmd)
The above code prints the key of each JSON file available in the user's bucket.
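To go from listing the keys to actually merging the files, one possible continuation is the sketch below (my own, not part of the original answer; it assumes every .json object in the bucket contains a JSON list):

import json
import boto3

client = boto3.client('s3', region_name='us-east-1')
BUCKET = 'bucket-for-json-files'

def lambda_handler(event, context):
    paginator = client.get_paginator('list_objects_v2')
    merged = []
    for page in paginator.paginate(Bucket=BUCKET):
        for obj in page.get('Contents', []):
            key = obj['Key']
            # skip non-JSON objects and the output file itself
            if key == 'merge.json' or not key.endswith('.json'):
                continue
            body = client.get_object(Bucket=BUCKET, Key=key)['Body'].read()
            merged += json.loads(body)  # assumes each file holds a JSON list
    # write the merged result back to the same bucket
    client.put_object(Bucket=BUCKET, Key='merge.json', Body=json.dumps(merged))
    return {'items_merged': len(merged)}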

How to find where an S3 multipart upload is failing in Python?

I am implementing a cron job that will upload a large daily backup file to an S3 Bucket. It works most of the time, but every once in a while, I will check the bucket, and the file size is significantly smaller than the actual size.
It should be roughly 50GB, but the last time it happened, it showed 34GB. My main problem is that I am unsure of what error to try/catch.
I am still learning Python as I go, so bear with me.
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig

bucket = "my-s3-backup"
s3 = boto3.resource('s3')

# Grabbing the last file, and removing the full path from the string
pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

# Multipart upload function
def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload))
        # Custom Slack notification to inform completion
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        # Custom Slack notification to inform of failure
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))

if __name__ == '__main__':
    multi_part_upload()
If the script is not "failing" but is not uploading the complete file, what exception should I be trying to catch here? Should I log output somewhere?
I'm looking through the Botocore Exceptions documentation. I'm just unsure of what to try/catch with this.
For reference, here is the file size difference:
aws s3 ls --summarize --human-readable --recursive s3://my-s3-backup/physical_db_backups/
2022-05-07 14:31:28 50.7 GiB physical_db_backups/xb_202205070101.xb.zst
2022-05-08 12:48:07 50.8 GiB physical_db_backups/xb_202205080101.xb.zst
2022-05-09 01:30:04 34.2 GiB physical_db_backups/xb_202205090101.xb.zst <--- WRONG
Alright, since I was an idiot and didn't realize the backup file had not finished being written yet, I made a couple of changes:
I edited the cron to start later.
I added logic to determine whether the backup shell script is still running.
I may incorporate additional checks to make sure the file exists, but for now this is a working POC that has been tested.
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
from time import sleep
import psutil
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
import logging

bucket = "fsn-s3-backup"
s3 = boto3.resource('s3')

pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

logging.basicConfig(filename='/var/log/s3-backup.log',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    filemode='a')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload),
                                   ExtraArgs={'ContentType': 'application/zstd'})
        logger.info("Physical Backup to S3 Complete")
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        logger.error("Physical Backup to S3 Failed: " + str(error))
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))

def checkIfProcessRunning(processName):
    # Returns True if any running process has processName in its command line
    for proc in psutil.process_iter():
        cmdline = proc.cmdline()
        if processName in cmdline:
            return True
    return False

if __name__ == '__main__':
    backuprunning = True
    while backuprunning:
        logger.info("Checking if backup shell script is running")
        if checkIfProcessRunning('/path/to/physical_backup.sh'):
            logger.info("Backup shell script still running. Sleeping for 60s")
            sleep(60)
        else:
            backuprunning = False
            logger.info("Beginning multipart upload")
            multi_part_upload()
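As an additional safeguard (my own suggestion, not part of the answer above), you could compare the local file size with what S3 reports after the upload and alert on a mismatch:

import os
import boto3

def verify_upload(local_path, bucket_name, key):
    # Compare the local file size with the uploaded object's ContentLength
    s3_client = boto3.client('s3')
    local_size = os.path.getsize(local_path)
    remote_size = s3_client.head_object(Bucket=bucket_name, Key=key)['ContentLength']
    return local_size == remote_size

# Example usage after multi_part_upload():
# if not verify_upload(file_to_upload, bucket, key_path):
#     sendslackerror("Size mismatch after upload:\n" + file_name)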

Difficulty in executing boto3 S3 copy function using aws lambda

Here is the scenario. I have an S3 bucket (e.g. daily-data-input) where daily files are written to a specific folder (e.g. S3://daily-data-input/data/test/). Whenever a file is written under the "test" folder, a copy should also be written to the "test_copy" folder in the same bucket. If "test_copy" does not exist, it should be created.
I have used an S3 event notification attached to a Lambda function (Python 3.7) which checks whether the "test_copy" key exists and creates it if not. I am able to create the "test_copy" folder successfully, but I can't get the S3 copy via boto3 to work.
Here is the code for your reference:
import boto3
import os
import botocore

s3 = boto3.resource('s3')
s3_cli = boto3.client('s3')

def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    bucket_key = event['Records'][0]['s3']['object']['key']
    file = os.path.basename(bucket_key)
    source_key_path = os.path.dirname(bucket_key)
    target_keypath = source_key_path + '_' + 'copy' + '/'
    target_bucket_key = target_keypath + file
    copy_source = {'Bucket': bucket_name, 'Key': bucket_key}
    try:
        s3.Object(bucket_name, target_keypath).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            # Create the key
            print("Creating target _copy folder")
            s3_cli.put_object(Bucket=bucket_name, Key=target_keypath)
            # copy the file
            #s3.copy_object(Bucket=bucket_name, Key=target_bucket_key, CopySource=copy_source)
        else:
            print("Something went wrong!!")
    else:
        print("Key exists!!")
        # s3.copy_object(Bucket=bucket_name, Key=target_bucket_key, CopySource=copy_source)
I tried s3.copy_object, s3_cli.meta.client.copy, and bucket.copy(), and none of them are working. Please let me know if I am doing something wrong.
Here is one simple way to copy an object in S3 within a bucket:
import boto3
s3 = boto3.resource('s3')
bucket = 'mybucket'
src_key = 'data/test/cat.png'
dest_key = 'data/test_copy/cat.png'
s3.Object(bucket, dest_key).copy_from(CopySource=f'{bucket}/{src_key}')
Here is another, lower-level way to do the same thing:
import boto3
s3 = boto3.client('s3')
bucket = 'mybucket'
src_key = 'data/test/cat.png'
dest_key = 'data/test_copy/cat.png'
s3.copy_object(Bucket=bucket, CopySource={'Bucket': bucket, 'Key': src_key}, Key=dest_key)
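Tying that back to the event-driven setup in the question, a handler along these lines (a sketch based on my reading of the question, using the _copy suffix it describes) would copy each newly written object:

import os
import boto3

s3 = boto3.resource('s3')

def lambda_handler(event, context):
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    src_key = event['Records'][0]['s3']['object']['key']
    # data/test/file.csv -> data/test_copy/file.csv
    dest_key = os.path.dirname(src_key) + '_copy/' + os.path.basename(src_key)
    s3.Object(bucket_name, dest_key).copy_from(
        CopySource={'Bucket': bucket_name, 'Key': src_key})

Note that there is no need to create the "test_copy" folder first; S3 folders are just key prefixes, so copying to the new key is enough.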

How can I use a Lambda function to call a Glue function (ETL) when a text file is loaded to an S3 bucket

I am trying to set up a Lambda function that triggers a Glue job when a .txt file is uploaded to an S3 bucket. I am using Python 3.7.
So far I have this:
from __future__ import print_function
import json
import boto3
import urllib.parse

print('Loading function')
s3 = boto3.client('s3')

def lambda_handler(event, context):  # handler
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.quote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))
    try:
        pass  # what to put here
    except Exception as e:
        print(e)
        print('Error')
        raise e
But I don't understand how I can call the Glue job from here.
I managed to do it like this:
from __future__ import print_function
import json
import boto3

client = boto3.client('glue')

def lambda_handler(event, context):
    response = client.start_job_run(JobName='GLUE_CODE_NAME')
I will post the S3 event configuration later.
You can configure an S3 Event Notification that triggers this Lambda function when a PUT Object action occurs on an S3 prefix.
https://docs.aws.amazon.com/AmazonS3/latest/user-guide/enable-event-notifications.html
This Lambda function can then call the StartJobRun action of the Glue API.
https://docs.aws.amazon.com/glue/latest/webapi/API_StartJobRun.html
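Putting the two together, here is a sketch of a handler that starts the Glue job and passes the uploaded object's location to it (the job name and the --source_* argument names are placeholders I've assumed; the Glue job must be written to read them):

import urllib.parse
import boto3

glue = boto3.client('glue')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'])
    response = glue.start_job_run(
        JobName='GLUE_CODE_NAME',  # placeholder job name
        Arguments={
            '--source_bucket': bucket,
            '--source_key': key,
        })
    return {'JobRunId': response['JobRunId']}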

Read a video file object from S3 and use it for further processing through OpenCV

import boto3
import cv2
import numpy as np
s3 = boto3.resource('s3')
vid = (s3.Object('bucketname', 'video.blob').get()['Body'].read())
cap = cv2.VideoCapture(vid)
This is my code. I have a video file in an S3 bucket. I want to do some processing on it with OpenCV and I don't want to download it, so I'm trying to store that video file in vid. The problem is that type(vid) is bytes, which results in this error on line 6: TypeError: an integer is required (got type bytes). I tried converting it into an integer or a string but was unable to.
When attempting to convert the bytes to an integer, I referred to this and ran into length issues. This is just a sample video file; the actual file I want to process will be huge when converted to a bytes object.
When attempting to get the object as a string and then convert it to an integer, I referred to this. Even that doesn't seem to work for me.
If anyone can help me solve this issue, I will be grateful. Please comment if anything is unclear about my issue and I'll try to provide more details.
If streaming the video from a URL is acceptable, I think that is the easiest solution. You just need to generate a URL to read the video from.
import boto3
import cv2

s3_client = boto3.client('s3')
bucket = 'bucketname'
key = 'video.blob'
url = s3_client.generate_presigned_url('get_object',
                                       Params={'Bucket': bucket, 'Key': key},
                                       ExpiresIn=600)  # this url will be available for 600 seconds
cap = cv2.VideoCapture(url)
ret, frame = cap.read()
You should see that you are able to read and process frames from that url.
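For example, a small processing loop over that stream might look like this (a sketch; the grayscale conversion is just a stand-in for whatever processing you need):

import cv2

# assumes 'url' is the presigned URL generated in the snippet above
cap = cv2.VideoCapture(url)
frame_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break  # end of stream or read failure
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # example processing step
    frame_count += 1
cap.release()
print(f"Processed {frame_count} frames")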
Below are some useful code snippets for performing various operations on an S3 bucket.
import boto3
s3 = boto3.resource('s3', region_name='us-east-2')
Listing buckets in S3
for bucket in s3.buckets.all():
    print(bucket.name)
Creating a bucket in S3
my_bucket = s3.create_bucket(Bucket='Bucket Name', CreateBucketConfiguration={
    'LocationConstraint': 'us-east-2'
})
Listing objects inside a bucket
my_bucket = s3.Bucket('Bucket Name')
for file in my_bucket.objects.all():
    print(file.key)
Uploading a file from the current directory
import os
print(os.getcwd())
fileName = "B01.jpg"
bucketName = "Bucket Name"
s3.meta.client.upload_file(fileName, bucketName, 'test2.txt')
Reading an image/video from a bucket
import matplotlib.pyplot as plt
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('Bucket Name')  # bucket name
object = bucket.Object('maisie_williams.jpg')  # image name
object.download_file('B01.jpg')  # download the image under this name
img = plt.imread('B01.jpg')  # read the downloaded image
imgplot = plt.imshow(img)  # plot the image
plt.show()
Reading from one bucket and then dumping it to another
import boto3
s3 = boto3.resource('s3', region_name='us-east-2')
bucket = s3.Bucket('Bucket Name')  # source bucket name
object = bucket.Object('maisie_williams.jpg')  # image name
object.download_file('B01.jpg')
fileName = "B01.jpg"
bucketName = "Bucket Name"  # destination bucket name
s3.meta.client.upload_file(fileName, bucketName, 'testz.jpg')
If you have access keys then you can probably do the following:
import boto3
import pandas as pd

keys = pd.read_csv('accessKeys.csv')
# creating a session for S3 buckets
session = boto3.session.Session(aws_access_key_id=keys['Access key ID'][0],
                                aws_secret_access_key=keys['Secret access key'][0])
s3 = session.resource('s3')
buck = s3.Bucket('Bucket Name')
