Slow upload to Azure Blob Storage with Python

The API receives the file, then tries to create a unique blob name.
Then I upload to the blob in chunks of 4 MB. Each chunk takes about 8 seconds. Is this normal? My upload speed is 110 Mbps. I tried uploading a 50 MB file and it took almost 2 minutes. I don't know whether the azure-storage-blob version is related to this; I'm using azure-storage-blob==12.14.1.
```python
import os
import time
import uuid

from azure.storage.blob import BlobBlock, BlobClient, BlobServiceClient
from flask import request


#catalog_api.route("/catalog", methods=['POST'])
def catalog():
    file = request.files['file']
    url_bucket, file_name, file_type = upload_to_blob(file)


# The methods below are excerpted from a service class; `self` carries
# connection_string, container_name, max_blob_name_tries and chunk_size.
def upload_to_blob(self, file):
    file_name = file.filename
    file_type = file.content_type
    blob_client = self.generate_blob_client(file_name)
    blob_url = self.upload_chunks(blob_client, file)
    return blob_url, file_name, file_type


def generate_blob_client(self, file_name: str):
    blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)
    container_client = blob_service_client.get_container_client(self.container_name)
    for _ in range(self.max_blob_name_tries):
        blob_name = self.generate_blob_name(file_name)
        blob_client = container_client.get_blob_client(blob_name)
        if not blob_client.exists():
            return blob_client
    raise Exception("Couldn't create the blob")


def upload_chunks(self, blob_client: BlobClient, file):
    block_list = []
    chunk_size = self.chunk_size
    while True:
        read_data = file.read(chunk_size)
        if not read_data:
            print("uploaded")
            break
        print("uploading")
        blk_id = str(uuid.uuid4())
        blob_client.stage_block(block_id=blk_id, data=read_data)
        block_list.append(BlobBlock(block_id=blk_id))
    blob_client.commit_block_list(block_list)
    return blob_client.url
```

I tried this in my environment and got the results below:
Uploading a 50 MB file to a Blob Storage account with a chunk size of 4*1024*1024, from my local environment, took 45 seconds.
Code:
```python
import time
import uuid

from azure.storage.blob import BlobBlock, BlobServiceClient

connection_string = "<storage account connection string>"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client('test')
blob_client = container_client.get_blob_client("file.pdf")

start = time.time()

# upload data
block_list = []
chunk_size = 4 * 1024 * 1024
with open("C:\\file.pdf", 'rb') as f:
    while True:
        read_data = f.read(chunk_size)
        if not read_data:
            break  # done
        blk_id = str(uuid.uuid4())
        blob_client.stage_block(block_id=blk_id, data=read_data)
        block_list.append(BlobBlock(block_id=blk_id))

blob_client.commit_block_list(block_list)

end = time.time()
print("Time taken to upload blob:", end - start, "secs")
```
In the code above, I record start and end timestamps and use end - start to measure how long the file upload to Blob Storage takes.
Console: (screenshot not included)
Make sure your internet speed is good; I also tried with other connection speeds, and it took at most 78 seconds.
Portal: (screenshot not included)
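As a side note, if the sequential stage_block loop turns out to be the bottleneck, the SDK can parallelize the blocks for you. A minimal sketch using the high-level upload_blob with max_concurrency (the block size and concurrency values are illustrative, not tuned):

```python
from azure.storage.blob import BlobServiceClient

connection_string = "<storage account connection string>"

# max_block_size controls the chunk size; max_single_put_size forces a chunked
# (block) upload for anything larger than that threshold.
blob_service_client = BlobServiceClient.from_connection_string(
    connection_string,
    max_block_size=4 * 1024 * 1024,
    max_single_put_size=8 * 1024 * 1024,
)
blob_client = blob_service_client.get_blob_client(container="test", blob="file.pdf")

with open("C:\\file.pdf", "rb") as f:
    # Blocks are staged and committed by the SDK, several at a time.
    blob_client.upload_blob(f, overwrite=True, max_concurrency=8)
```

With a stream input and max_concurrency greater than 1, several blocks are uploaded in parallel, which usually matters more than the exact chunk size.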

Related

GCP Storage - unable to read contents of file from Google storage bucket

I have a requirement where I have to call a Python file stored in a GCP Storage bucket from a driver Python file.
Here is the code (being run on my local machine, but it will eventually be scheduled with Airflow on GCP):
```python
import os

from google.cloud import storage

jsonkey = 'googl-cloudstorage-key.json'
storage_client = storage.Client.from_service_account_json(jsonkey)

def download_file_from_bucket(blob_name, file_path, bucket_name):
    print(f" download_file_from_bucket : blob_name file_path : {file_path}")
    try:
        bucket = storage_client.get_bucket(bucket_name)
        print(f" bucket name : {bucket.name}")
        blob = bucket.blob(blob_name)
        print(f" blob.name : {blob.name}, length of blob : {blob.download_as_string} ")
        with open(file_path, "w") as f:
            blob.download_blob_to_file(blob, f)
    except Exception as exc:
        print(exc)  # report any download error

download_file_from_bucket('file1.py', os.path.join(os.getcwd(), 'new_file.py'), 'kk_bucket_1')
```
The file gets downloaded to new_file.py, but the downloaded file is blank.
Here is the content of the file uploaded to the GCP Storage bucket (name: file1.py):
```python
import sys
print("file1.py, customer is ", sys.argv[1])
```
What do I need to do to make this work?
Please note: I need to call this file from the driver file, which I plan to do using the subprocess module. Here is the code for that:
```python
import os
import subprocess

cust = ['cust1', 'cust2']
for c in cust:
    print(" c -> ", c)
    fileToCall = os.path.join(os.getcwd(), 'file1_1.py')
    print(" file to call ", fileToCall)
    subprocess.run(['python', fileToCall, c])
```
I was able to get this to work using the following code :
```python
def download_file_from_bucket(blob_name, file_path, bucket_name):
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.download_to_filename(file_path)
    print("after writing to file")
```

How to find where an S3 multipart upload is failing in Python?

I am implementing a cron job that uploads a large daily backup file to an S3 bucket. It works most of the time, but every once in a while I check the bucket and the file size is significantly smaller than the actual size.
It should be roughly 50 GB, but the last time it happened it showed 34 GB. My main problem is that I am unsure of which error to try/except.
I am still learning Python as I go, so bear with me.
```python
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig

bucket = "my-s3-backup"
s3 = boto3.resource('s3')

# Grabbing the last file, and removing the full path from the string
pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

# Multipart upload function
def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload))
        # Custom Slack notification to inform completion
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        # Custom Slack notification to inform of failure
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + error)

if __name__ == '__main__':
    multi_part_upload()
```
If the script is not "failing," but it's not uploading the complete file size, what exception am I trying to catch here? Should I log output somewhere?
I'm looking through the Botocore exceptions documentation, but I'm unsure what to try/except in this case.
For reference, here is the file size difference:
```
aws s3 ls --summarize --human-readable --recursive s3://my-s3-backup/physical_db_backups/
2022-05-07 14:31:28   50.7 GiB physical_db_backups/xb_202205070101.xb.zst
2022-05-08 12:48:07   50.8 GiB physical_db_backups/xb_202205080101.xb.zst
2022-05-09 01:30:04   34.2 GiB physical_db_backups/xb_202205090101.xb.zst  <--- WRONG
```
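A size comparison after the transfer will catch a truncated object even when no exception is raised; below is a rough sketch using head_object, reusing the bucket, key, and helper names from the script above:

```python
import os

local_size = os.path.getsize(file_to_upload)
remote_size = s3.meta.client.head_object(Bucket=bucket, Key=key_path)['ContentLength']

if local_size != remote_size:
    # The object landed but is smaller than the source; treat it like a hard failure.
    sendslackerror("Physical Backup to S3 size mismatch:\n"
                   f"{file_name}\nlocal={local_size} remote={remote_size}")
```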
Alright, since I was an idiot and didn't realize the backup file had not finished being written yet, I made a couple of changes:
- I edited the cron job to start later.
- I added logic to determine whether the backup shell script is still running.
I may incorporate additional checks to make sure the file exists, but for now this is a working POC that has been tested.
```python
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
from time import sleep
import psutil
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
import logging

bucket = "fsn-s3-backup"
s3 = boto3.resource('s3')

pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

logging.basicConfig(filename='/var/log/s3-backup.log',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    filemode='a')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload),
                                   ExtraArgs={'ContentType': 'application/zstd'})
        logger.info("Physical Backup to S3 Complete")
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        logger.error("Physical Backup to S3 Failed: " + str(error))
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))

def checkIfProcessRunning(processName):
    for proc in psutil.process_iter():
        cmdline = proc.cmdline()
        if processName in cmdline:
            return True
    return False

if __name__ == '__main__':
    backuprunning = True
    while backuprunning:
        logger.info("Checking if backup shell script is running")
        if checkIfProcessRunning('/path/to/physical_backup.sh'):
            logger.info("Backup shell script still running. Sleeping for 60s")
            sleep(60)
        else:
            backuprunning = False
            logger.info("Beginning multipart upload")
            multi_part_upload()
```
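One caveat with the process check: proc.cmdline() can raise for processes that exit mid-scan or that belong to other users. A slightly more defensive variant of the helper (a sketch, not part of the original script):

```python
import psutil

def check_if_process_running(process_name):
    for proc in psutil.process_iter():
        try:
            cmdline = proc.cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue  # the process vanished or cannot be inspected; skip it
        if process_name in cmdline:
            return True
    return False
```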

How to create multiple receivers in Azure Event Hub to avoid duplicates?

I have one producer that sends events to an Event Hub, and I want to create two receivers to consume those events. How do I implement that?
The code for the receiver:
```python
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.eventhub import EventHubSharedKeyCredential, EventData, EventHubConsumerClient
from azure.core.exceptions import ResourceExistsError
from azure.eventhub.extensions.checkpointstoreblob import BlobCheckpointStore

# Eventhub access credentials
connection_str = ****
consumer_group = '$Default'
eventhub_name = ****

# Blobstorage Storage credentials
storage_connection_str = ****
container_name = ****
storageAccount = ****

# For checkpointing in Blob storage
checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)

# Initiate BlobServiceClient to access the Blob storage
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_str)
container_client = blob_service_client.get_container_client('container_name')  # Dump final data to the Blob storage in append mode.

try:
    container_client.create_container()  # Create new Container in the service
    properties = container_client.get_container_properties()
except ResourceExistsError:
    print("Container already exists.")

# Instantiate a new BlobClient
#blob_client = container_client.get_blob_client("data.csv")

def get_messages():
    client = EventHubConsumerClient.from_connection_string(connection_str, consumer_group, eventhub_name=eventhub_name)

    def on_event_batch(partition_context, events):
        #log.info("Partition {}, Received count: {}".format(partition_context.partition_id, len(events)))
        print("Received event from partition {}".format(
            partition_context.partition_id))  # Since no partition is defined, partition = 0 by default.
        if len(events) == 0:
            client.close()  # closing the client if there is no event triggered.
        else:
            for event in events:
                list_ = event.body_as_json()
            # Update checkpoint
            partition_context.update_checkpoint()

    try:
        with client:
            client.receive_batch(
                on_event_batch=on_event_batch,
                PARTITION="0",)
                #starting_position="-1", )  # "-1" is from the beginning of the partition.
    except KeyboardInterrupt:
        print('Stopped receiving.')

get_messages()
```
I created two copies of this code, named consumer1.py and consumer2.py, but both consumers receive the same events every time.
So, for example, if I send 100 events, I want these two consumers to run in parallel, divide those 100 events between themselves, and avoid duplicates. How do I achieve this?
So finally I found the solution: create multiple consumers under the same consumer group so they consume events in parallel and share the load among each other.
```python
import time

from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.eventhub import EventHubSharedKeyCredential, EventData, EventHubConsumerClient
from azure.core.exceptions import ResourceExistsError
from azure.eventhub.extensions.checkpointstoreblob import BlobCheckpointStore

# Eventhub access credentials
connection_str = ****
consumer_group = '$Default'
eventhub_name = ****

# Blobstorage Storage credentials
storage_connection_str = ****
container_name = ****
storageAccount = ****

# For checkpointing in Blob storage
checkpoint_store = BlobCheckpointStore.from_connection_string(storage_connection_str, container_name)

# Initiate BlobServiceClient to access the Blob storage
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_str)
container_client = blob_service_client.get_container_client('nsc-container')

# Dump final data to the Blob storage in append mode.
try:
    container_client.create_container()  # Create new Container in the service
    properties = container_client.get_container_properties()
except ResourceExistsError:
    print("Container already exists.")

# Instantiate a new BlobClient
#blob_client = container_client.get_blob_client("data.csv")

def get_messages():
    client = EventHubConsumerClient.from_connection_string(
        connection_str, consumer_group, eventhub_name=eventhub_name,
        checkpoint_store=checkpoint_store,)

    def on_event_batch(partition_context, events):
        #log.info("Partition {}, Received count: {}".format(partition_context.partition_id, len(events)))
        print("Received event from partition {}".format(
            partition_context.partition_id))  # Since no partition is defined, partition = 0 by default.
        line_count = 0
        start_time = time.time()
        cnt = 0
        if len(events) == 0:
            client.close()  # closing the client if there is no event triggered.
        else:
            for event in events:
                list_ = event.body_as_json()
                cnt += 1
            # Update checkpoint
            partition_context.update_checkpoint()
            print("Number of events received: ", cnt)
            line_count = line_count + cnt
            end_time = time.time()
            run_time = end_time - start_time
            print("\nTotal Received {} records in {} seconds.".format(line_count, run_time))

    try:
        with client:
            client.receive_batch(
                on_event_batch=on_event_batch,)  # With a specified partition_id, load-balancing will be disabled
    except KeyboardInterrupt:
        print('Stopped receiving.')

get_messages()
```
Now create as many copies of the code as you need and save them as consumer_1.py and so on. Also, make sure to keep the number of partitions equal to the number of consumers for the best efficiency.
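If you would rather not maintain several copies of the same file, a single launcher script can start multiple consumer processes instead; a minimal sketch, assuming the consumer above is saved as consumer.py (the file name and process count are illustrative):

```python
import subprocess
import sys

NUM_CONSUMERS = 2  # ideally no more than the number of Event Hub partitions

# Start each consumer as its own process; load balancing between them is handled
# by the shared consumer group together with the blob checkpoint store.
procs = [subprocess.Popen([sys.executable, "consumer.py"]) for _ in range(NUM_CONSUMERS)]
for p in procs:
    p.wait()
```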

Read data from an Azure blob container into the Computer Vision service

I'm using the Azure Computer Vision module to process images. So far I have only used local images or images freely available on the web, but now I need to use the images I have stored in a storage account container.
I don't see how to do this in the documentation. For example, this code works with local images:
```python
import os
import sys
import requests
# If you are using a Jupyter notebook, uncomment the following line.
# %matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO

# Add your Computer Vision subscription key and endpoint to your environment variables.
if 'COMPUTER_VISION_SUBSCRIPTION_KEY' in os.environ:
    subscription_key = os.environ['COMPUTER_VISION_SUBSCRIPTION_KEY']
else:
    print("\nSet the COMPUTER_VISION_SUBSCRIPTION_KEY environment variable.\n**Restart your shell or IDE for changes to take effect.**")
    sys.exit()

if 'COMPUTER_VISION_ENDPOINT' in os.environ:
    endpoint = os.environ['COMPUTER_VISION_ENDPOINT']

analyze_url = endpoint + "vision/v3.0/analyze"

# Set image_path to the local path of an image that you want to analyze.
# Sample images are here, if needed:
# https://github.com/Azure-Samples/cognitive-services-sample-data-files/tree/master/ComputerVision/Images
image_path = "C:/Documents/ImageToAnalyze.jpg"

# Read the image into a byte array
image_data = open(image_path, "rb").read()
headers = {'Ocp-Apim-Subscription-Key': subscription_key,
           'Content-Type': 'application/octet-stream'}
params = {'visualFeatures': 'Categories,Description,Color'}
response = requests.post(
    analyze_url, headers=headers, params=params, data=image_data)
response.raise_for_status()

# The 'analysis' object contains various fields that describe the image. The most
# relevant caption for the image is obtained from the 'description' property.
analysis = response.json()
print(analysis)
image_caption = analysis["description"]["captions"][0]["text"].capitalize()

# Display the image and overlay it with the caption.
image = Image.open(BytesIO(image_data))
plt.imshow(image)
plt.axis("off")
_ = plt.title(image_caption, size="x-large", y=-0.1)
plt.show()
```
This other one works with images from the web:
```python
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

computervision_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

remote_image_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/landmark.jpg"

'''
Describe an Image - remote
This example describes the contents of an image with the confidence score.
'''
print("===== Describe an image - remote =====")
# Call API
description_results = computervision_client.describe_image(remote_image_url)

# Get the captions (descriptions) from the response, with confidence level
print("Description of remote image: ")
if len(description_results.captions) == 0:
    print("No description detected.")
else:
    for caption in description_results.captions:
        print("'{}' with confidence {:.2f}%".format(caption.text, caption.confidence * 100))
```
And this one reads data from a storage container:
```python
from azure.storage.blob import BlobClient

blob = BlobClient.from_connection_string(conn_str="my_connection_string", container_name="my_container", blob_name="my_blob")

with open("./BlockDestination.txt", "wb") as my_blob:
    blob_data = blob.download_blob()
    blob_data.readinto(my_blob)
```
But I don't see how to make the connection between the storage container and the CV service
Two simple options:
- Not recommended: set your blob container to "public" and simply use the full blob URLs as you would use any other public URL.
- Recommended: construct SAS tokens for your files in blob storage. Append them to the full blob URL to create a "temporary private download link" which can be used to download the file as if it were public. You can also build the link outside of the CV service if you face any issues there.
A full blob URL with a SAS token should look something like this:
```
https://storagesamples.blob.core.windows.net/sample-container/blob1.txt?se=2019-08-03&sp=rw&sv=2018-11-09&sr=b&skoid=<skoid>&sktid=<sktid>&skt=2019-08-02T22%3A32%3A01Z&ske=2019-08-03T00%3A00%3A00Z&sks=b&skv=2018-11-09&sig=<signature>
```
https://github.com/Azure/azure-sdk-for-python/blob/master/sdk/storage/azure-storage-blob/samples/blob_samples_authentication.py#L110
```python
# Instantiate a BlobServiceClient using a connection string
from azure.storage.blob import BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(self.connection_string)

# [START create_sas_token]
# Create a SAS token to use to authenticate a new client
from datetime import datetime, timedelta
from azure.storage.blob import ResourceTypes, AccountSasPermissions, generate_account_sas

sas_token = generate_account_sas(
    blob_service_client.account_name,
    account_key=blob_service_client.credential.account_key,
    resource_types=ResourceTypes(object=True),
    permission=AccountSasPermissions(read=True),
    expiry=datetime.utcnow() + timedelta(hours=1)
)
# [END create_sas_token]
```
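To tie the two services together, you can generate a SAS for the specific blob, append it to the blob URL, and pass that URL to the Computer Vision client like any other remote image. A minimal sketch, where the connection string, endpoint, key, container, and blob names are placeholders:

```python
from datetime import datetime, timedelta

from azure.storage.blob import BlobServiceClient, BlobSasPermissions, generate_blob_sas
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

blob_service_client = BlobServiceClient.from_connection_string("<connection string>")
container_name, blob_name = "my_container", "ImageToAnalyze.jpg"

# Create a short-lived, read-only SAS for just this blob.
sas_token = generate_blob_sas(
    account_name=blob_service_client.account_name,
    container_name=container_name,
    blob_name=blob_name,
    account_key=blob_service_client.credential.account_key,
    permission=BlobSasPermissions(read=True),
    expiry=datetime.utcnow() + timedelta(hours=1),
)
blob_url_with_sas = (
    f"https://{blob_service_client.account_name}.blob.core.windows.net/"
    f"{container_name}/{blob_name}?{sas_token}"
)

# The CV service fetches the image itself, as it would with any public URL.
computervision_client = ComputerVisionClient("<endpoint>", CognitiveServicesCredentials("<subscription key>"))
description = computervision_client.describe_image(blob_url_with_sas)
for caption in description.captions:
    print(caption.text, caption.confidence)
```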
If you check the sample:
```python
from azure.storage.blob import BlobClient

blob = BlobClient.from_connection_string(conn_str="my_connection_string", container_name="my_container", blob_name="my_blob")

with open("./BlockDestination.txt", "wb") as my_blob:
    blob_data = blob.download_blob()
    blob_data.readinto(my_blob)
```
all you need to do is get a byte array from the downloaded blob. Rather than reading the image from a local path into a byte array:

```python
image_data = open(image_path, "rb").read()
```

you should read the bytes straight from the downloaded blob:

```python
image_data = blob.download_blob().readall()
```
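Putting that together with the REST call from the question, a minimal sketch (the connection string, container, blob name, endpoint, and key are placeholders):

```python
import requests
from azure.storage.blob import BlobClient

subscription_key = "<computer vision subscription key>"
endpoint = "<computer vision endpoint>"  # e.g. https://<resource-name>.cognitiveservices.azure.com/
analyze_url = endpoint + "vision/v3.0/analyze"

blob = BlobClient.from_connection_string(conn_str="my_connection_string",
                                         container_name="my_container",
                                         blob_name="ImageToAnalyze.jpg")

# Download the blob into memory instead of writing it to disk.
image_data = blob.download_blob().readall()

headers = {'Ocp-Apim-Subscription-Key': subscription_key,
           'Content-Type': 'application/octet-stream'}
params = {'visualFeatures': 'Categories,Description,Color'}
response = requests.post(analyze_url, headers=headers, params=params, data=image_data)
response.raise_for_status()
print(response.json())
```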

Uploaded file does not show in S3

I am using boto3 multipart upload with TransferConfig.
Everything seems okay, as the program runs without errors:
```python
import threading, boto3, re, os, sys
from boto3.s3.transfer import TransferConfig

# Create resource
s3 = boto3.resource('s3',
                    region_name=region,
                    aws_access_key_id=ACCESS_KEY,
                    aws_secret_access_key=SECRET_KEY,
                    aws_session_token=SESSION_TOKEN)

BUCKET_NAME = "my_bucket"

# The upload function
def multi_part_upload_with_s3():
    # Multipart upload
    config = TransferConfig(multipart_threshold=1024*25, max_concurrency=10000,
                            multipart_chunksize=1024*25, use_threads=True)
    #file_path = os.path.dirname(__file__) + '/largefile.pdf'
    file_path = "C:/Users/Documents/MyFile.out"
    key_path = 'MyDir/MySubDir/'
    s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                               #ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/pdf'},
                               Config=config, Callback=ProgressPercentage(file_path))

# Not really important, just tells you what percentage of your file has uploaded.
class ProgressPercentage(object):
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write("\r%s %s/%s (%.2f%%)" % (self._filename,
                             self._seen_so_far, self._size, percentage))
            sys.stdout.flush()

# Now call the function
if __name__ == '__main__':
    multi_part_upload_with_s3()
```
Output:
```
C:/Users/Documents/MyFile.out 1295607/1295607.0 (100.00%)
```
So it appears to run without errors. However, when I look in S3, 'MySubDir' is created, but 'MyFile.out' is not in there. I thought maybe the high max concurrency was the culprit, thinking it would take a while to join the parts back up, but I've waited for over 4 hours and nothing has shown up. I've also used other files and other upload approaches, and those files did show up in 'MySubDir'.
You never specify the destination file name, only the path. Try:
```python
file_path = "C:/Users/Documents/MyFile.out"
key_path = 'MyDir/MySubDir/MyFile.out'
s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                           #ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/pdf'},
                           Config=config, Callback=ProgressPercentage(file_path))
```
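To avoid hard-coding the file name twice, the object key can also be derived from the local path; a small sketch reusing the names from the snippet above:

```python
import os

file_path = "C:/Users/Documents/MyFile.out"
# Append the file's base name to the prefix so the key always names a file, not just a "directory".
key_path = 'MyDir/MySubDir/' + os.path.basename(file_path)  # -> 'MyDir/MySubDir/MyFile.out'

s3.meta.client.upload_file(file_path, BUCKET_NAME, key_path,
                           Config=config, Callback=ProgressPercentage(file_path))
```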
