I am trying to find out how Azure Blob Storage behaves with AD authentication when a single large file takes more than 90 minutes to upload. Unfortunately my internet connection is quite fast and my disk can't hold a TB-scale file, so I am trying to simulate a slow upload.
I tried the following code:
import os
import time
from io import BufferedReader, FileIO

class ProgressFile(BufferedReader):
    # For binary opening only
    def __init__(self, filename, read_callback):
        f = FileIO(file=filename, mode='r')
        self._read_callback = read_callback
        super().__init__(raw=f)
        # I prefer Pathlib but this should still support 2.x
        self.length = os.stat(filename).st_size

    def read(self, size=None):
        calc_sz = size
        if not calc_sz:
            calc_sz = self.length - self.tell()
        self._read_callback(position=self.tell(), read_size=calc_sz, total=self.length)
        return super(ProgressFile, self).read(size)

def my_callback(position, read_size, total):
    # Stall for 92 minutes while the first 4 MB block is being read
    if position > 0 and position <= 4194304:
        time.sleep(5520)
    print("position: {position}, read_size: {read_size}, total: {total}".format(position=position,
                                                                                read_size=read_size,
                                                                                total=total))

myfile = ProgressFile(filename='./testfile', read_callback=my_callback)
from azure.identity import ClientSecretCredential
from azure.storage.blob import ContainerClient

token_credential = ClientSecretCredential(
    # tenant id, client id and client secret omitted here
)

container_client = ContainerClient(oauth_url, "containername", token_credential)

def upload(filename):
    blob_client = container_client.get_blob_client("myfile")
    blob_client.upload_blob(myfile, blob_type="BlockBlob")
    print("finish uploading")

upload(int(time.time()))
However, I don't see a token expiration error, even after 90 minutes.
In what circumstances does token expiration appear?
As you are using azure.identity.ClientSecretCredential, it renews the token when it gets close to expiration.
(I work in the Microsoft Azure SDK team.)
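If you want to actually observe the renewal, one option is to wrap the credential so every get_token call is logged. This is only a minimal sketch (the tenant/client values and account URL are placeholders, and the wrapper itself is not part of the SDK); the storage client calls get_token whenever it needs a fresh token, so during a multi-hour upload you should see it called more than once.
import time
from azure.identity import ClientSecretCredential
from azure.storage.blob import ContainerClient

class LoggingCredential:
    """Delegates to a real credential and prints every get_token call."""
    def __init__(self, inner):
        self._inner = inner

    def get_token(self, *scopes, **kwargs):
        token = self._inner.get_token(*scopes, **kwargs)
        remaining_min = (token.expires_on - time.time()) / 60
        print("get_token called; token expires in %.1f minutes" % remaining_min)
        return token

# placeholder values
credential = LoggingCredential(
    ClientSecretCredential("<tenant id>", "<client id>", "<client secret>"))
container_client = ContainerClient(
    "https://<account>.blob.core.windows.net", "containername", credential)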
Related
Here is the code
import time
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient

token_credential = ClientSecretCredential(
    "",  # tenant id
    "",  # active directory application id
    "",  # active directory application secret
)

blob_service_client = BlobServiceClient(account_url=oauth_url, credential=token_credential)

def listcontainer():
    con = blob_service_client.list_containers()
    for x in con:
        print(x)

start = int(time.time())
while True:
    end = int(time.time())
    if end - start > 4800:
        break
    else:
        print("run time in minutes: ", (end - start) / 60)
    try:
        listcontainer()
    except Exception as e:
        print("exception reached")
        print(e)
        break
    time.sleep(60)
I set up the BlobServiceClient once, and I expect an exception to be raised after 90 minutes.
However, I don't see that happening.
In this doc
https://learn.microsoft.com/en-us/azure/active-directory/develop/active-directory-configurable-token-lifetimes
The default lifetime of an access token is variable. When issued, an access token's default lifetime is assigned a random value ranging between 60-90 minutes (75 minutes on average). The default lifetime also varies depending on the client application requesting the token or if conditional access is enabled in the tenant. For more information, see Access token lifetime.
What does expiration pertain to in this case?
The token does expire; however, the SDK takes care of renewing it automatically when that happens. As a user, generally speaking, you need not worry about it.
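If you want to see what the expiration itself looks like, a quick sketch (assuming the storage resource scope; the credential values are placeholders) is to request a token directly and print its expires_on. It is typically 60-90 minutes in the future, and calling get_token again near that time simply returns a fresh token, which is what the storage SDK does for you.
import datetime
from azure.identity import ClientSecretCredential

# placeholder values
credential = ClientSecretCredential("<tenant id>", "<client id>", "<client secret>")
token = credential.get_token("https://storage.azure.com/.default")
print("token expires at:", datetime.datetime.fromtimestamp(token.expires_on))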
I am trying to download a list of CSV files from an Azure Blob Storage using a shared SAS token, but I am getting all sorts of errors.
I tried looking this up and tried multiple code samples from contributors on Stack Overflow and the Azure documentation. Here is the final state of the code I constructed from those sources. It tries to download the list of CSV files in a pooled manner (the blob storage contains 200 CSV files):
NB: I left commented code snippets to show the different approaches I tried testing. Sorry if they are confusing!
from itertools import tee
from multiprocessing import Process
from multiprocessing.pool import ThreadPool
import os
from azure.storage.blob import BlobServiceClient, BlobClient
from azure.storage.blob import ContentSettings, ContainerClient
#from azure.storage.blob import BlockBlobService

STORAGEACCOUNTURL = "https://myaccount.blob.core.windows.net"
STORAGEACCOUNTKEY = "sv=2020-08-04&si=blobpolicyXYZ&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONTAINERNAME = "mycontainer"
##BLOBNAME = "??"
sas_url = 'https://myaccount.blob.core.windows.net/mycontainer/mydir?sv=2020-08-04&si=blobpolicyXYZ&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
LOCAL_BLOB_PATH = "./downloads"

class AzureBlobFileDownloader:
    def __init__(self):
        print("Intializing AzureBlobFileDownloader")
        # Initialize the connection to Azure storage account
        self.blob_service_client_instance = ContainerClient.from_container_url #BlobClient.from_blob_url(sas_url) #BlobServiceClient(account_url=STORAGEACCOUNTURL, credential=STORAGEACCOUNTKEY)
        #self.blob_client_instance = self.blob_service_client_instance.get_blob_client(CONTAINERNAME, BLOBNAME)
        #self.blob_service_client = BlobServiceClient.from_connection_string(MY_CONNECTION_STRING)
        #self.my_container = self.blob_service_client.get_container_client(MY_BLOB_CONTAINER)
        #self.blob_service_client = BlockBlobService("storage_account",sas_token="?sv=2018-03-28&ss=bfqt&srt=sco&sp=rwdlacup&se=2019-04-24T10:01:58Z&st=2019-04-23T02:01:58Z&spr=https&sig=xxxxxxxxx")
        #self.my_container = self.blob_service_client.get_blob_to_path("container_name","blob_name","local_file_path")

    def save_blob(self, file_name, file_content):
        # Get full path to the file
        download_file_path = os.path.join(LOCAL_BLOB_PATH, file_name)
        # for nested blobs, create local path as well!
        os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
        with open(download_file_path, "wb") as file:
            file.write(file_content)

    def download_all_blobs_in_container(self):
        # get a list of blobs
        my_blobs = self.blob_service_client_instance.get_block_list() #list_blobs() #self.blob_client_instance.list_blobs() download_blob() #
        print(my_blobs)
        #iterate through the iterable object for testing purposes, maybe wrong approach!
        result, result_backup = tee(my_blobs)
        print("**first iterate**")
        for i, r in enumerate(result):
            print(r)
        #start downloading my_blobs
        result = self.run(my_blobs)
        print(result)

    def run(self, blobs):
        # Download 3 files at a time!
        with ThreadPool(processes=int(3)) as pool:
            return pool.map(self.save_blob_locally, blobs)

    def save_blob_locally(self, blob):
        file_name = blob.name
        print(file_name)
        bytes = self.blob_service_client_instance.get_blob_client(CONTAINERNAME, blob).download_blob().readall()
        # Get full path to the file
        download_file_path = os.path.join(LOCAL_BLOB_PATH, file_name)
        # for nested blobs, create local path as well!
        os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
        with open(download_file_path, "wb") as file:
            file.write(bytes)
        return file_name

# Initialize class and download files
azure_blob_file_downloader = AzureBlobFileDownloader()
azure_blob_file_downloader.download_all_blobs_in_container()
Could someone help me achieve this task in Python:
get a list of all the files in the blob storage whose names are prefixed with part-
download them to a local folder
Thanks
Could someone help me achieve this task in Python:
get a list of all the files in the blob storage whose names are prefixed with part-
To list all the blobs whose names start with "part-", you can use blob_service.list_blobs(<Container Name>, prefix="<Your Prefix>"). Below is the code to get that list of blobs.
print("\nList blobs in the container")
generator = blob_service.list_blobs(CONTAINER_NAME, prefix="part-")
for blob in generator:
print("\t Blob name: " + blob.name)
download them to a folder locally
To download a blob you can use blob_client = blob_service.get_blob_to_path(<Container Name>, <Blob Name>, <File Path>). Below is the code to download the blobs as per your requirement.
blob_client = blob_service.get_blob_to_path(CONTAINER_NAME,blob.name,fname)
Below is the complete code that worked for us and achieves your requirement.
import os
from azure.storage.blob import BlockBlobService

ACCOUNT_NAME = "<Your_ACCOUNT_NAME>"
ACCOUNT_KEY = "<YOUR_ACCOUNT_KEY>"
CONTAINER_NAME = "<YOUR_CONTAINER_NAME>"
LOCAL_BLOB_PATH = "C:\\<YOUR_PATH>\\downloadedfiles"

blob_service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)

# Lists all blobs which have a prefix of part-
print("\nList blobs in the container")
generator = blob_service.list_blobs(CONTAINER_NAME, prefix="part-")
for blob in generator:
    print("\t Blob name: " + blob.name)

# Downloading the blobs to a folder
for blob in generator:
    # Adds blob name to the path
    fname = os.path.join(LOCAL_BLOB_PATH, blob.name)
    print(f'Downloading {blob.name} to {fname}')
    # Downloading blob into file
    blob_client = blob_service.get_blob_to_path(CONTAINER_NAME, blob.name, fname)
RESULT:
(screenshot: files in my Storage Account)
(screenshot: files in my local folder)
Updated Answer
blob_service = BlockBlobService(account_name=ACCOUNT_NAME, account_key=None, sas_token=SAS_TOKEN)

# Lists all blobs which have a prefix of part-
print("\nList blobs in the container")
generator = blob_service.list_blobs(CONTAINER_NAME, prefix="directory1"+"/"+"part-")
for blob in generator:
    print("\t Blob name: " + blob.name)

# Downloading the blobs to a folder
for blob in generator:
    # Adds blob name to the path
    fname = os.path.join(LOCAL_BLOB_PATH, blob.name)
    print(f'Downloading {blob.name} to {fname}')
    # Downloading blob into file
    blob_client = blob_service.get_blob_to_path(CONTAINER_NAME, blob.name, fname)
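If you prefer to stay on the newer azure-storage-blob v12 package that you are already importing, here is a minimal sketch (assuming the SAS URL points at the container itself, not a sub-directory, and grants list and read permissions; the URL and local path are placeholders) that lists the part- blobs and downloads them:
import os
from azure.storage.blob import ContainerClient

SAS_URL = "https://myaccount.blob.core.windows.net/mycontainer?sv=...&sig=..."  # container-level SAS URL
LOCAL_BLOB_PATH = "./downloads"

container_client = ContainerClient.from_container_url(SAS_URL)

# list_blobs(name_starts_with=...) filters by prefix on the server side
for blob in container_client.list_blobs(name_starts_with="part-"):
    download_file_path = os.path.join(LOCAL_BLOB_PATH, blob.name)
    # for nested blobs, create the local path as well
    os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
    with open(download_file_path, "wb") as f:
        f.write(container_client.download_blob(blob.name).readall())
    print(f"Downloaded {blob.name} to {download_file_path}")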
In a Google Cloud Function (Python 3.7), I need to fetch the compliance state of all VMs in a given location in a project.
From the available Google documentation I could see the REST API format:
https://cloud.google.com/compute/docs/os-configuration-management/view-compliance#view_compliance_state
On searching the client library documentation, I found this:
class google.cloud.osconfig_v1alpha.types.ListInstanceOSPoliciesCompliancesRequest(mapping=None, *, ignore_unknown_fields=False, **kwargs)
    Bases: proto.message.Message
    A request message for listing OS policies compliance data for all Compute Engine VMs in the given location.

    parent (str)
        Required. The parent resource name.
        Format: projects/{project}/locations/{location}
        For {project}, either Compute Engine project-number or project-id can be provided.

    page_size (int)
        The maximum number of results to return.

    page_token (str)
        A pagination token returned from a previous call to ListInstanceOSPoliciesCompliances that indicates where this listing should continue from.

    filter (str)
        If provided, this field specifies the criteria that must be met by a InstanceOSPoliciesCompliance API resource to be included in the response.

And the response class as:

class google.cloud.osconfig_v1alpha.types.ListInstanceOSPoliciesCompliancesResponse(mapping=None, *, ignore_unknown_fields=False, **kwargs)
    Bases: proto.message.Message
    A response message for listing OS policies compliance data for all Compute Engine VMs in the given location.

    instance_os_policies_compliances (Sequence[google.cloud.osconfig_v1alpha.types.InstanceOSPoliciesCompliance])
        List of instance OS policies compliance objects.

    next_page_token (str)
        The pagination token to retrieve the next page of instance OS policies compliance objects.

    property raw_page
But I am not sure how to use this information in the Python code.
I have written this, but I am not sure if it is correct:
from google.cloud.osconfig_v1alpha.services.os_config_zonal_service import client
from google.cloud.osconfig_v1alpha.types import ListInstanceOSPoliciesCompliancesRequest
import logging
logger = logging.getLogger(__name__)
import os

def handler():
    try:
        project_id = os.environ["PROJECT_ID"]
        location = os.environ["ZONE"]
        # list compliance state
        request = ListInstanceOSPoliciesCompliancesRequest(
            parent=f"projects/{project}/locations/{location}")
        response = client.instance_os_policies_compliance(request)
        return response
    except Exception as e:
        logger.error("Unable to get compliance - %s " % str(e))
I could not find any usage example for the client library methods anywhere.
Could someone please help me here?
EDIT:
This is what I am using now:
from googleapiclient.discovery import build

def list_policy_compliance():
    projectId = "my_project"
    zone = "my_zone"
    try:
        service = build('osconfig', 'v1alpha', cache_discovery=False)
        compliance_response = service.projects().locations(
            ).instanceOsPoliciesCompliances().list(
                parent='projects/%s/locations/%s' % (
                    projectId, zone)).execute()
        return compliance_response
    except Exception as e:
        raise Exception()
Something like this should work:
from google.cloud import os_config_v1alpha as osc

def handler():
    client = osc.OsConfigZonalService()
    project_id = "my_project"
    location = "my_gcp_zone"
    parent = f"projects/{project_id}/locations/{location}"
    response = client.list_instance_os_policies_compliances(
        parent=parent
    )
    # response is an iterable yielding
    # InstanceOSPoliciesCompliance objects
    for result in response:
        # do something with result
        ...
You can also construct the request like this:
response = client.list_instance_os_policies_compliances(
    request={
        "parent": parent
    }
)
Answering my own question here, this is what I used:
from googleapiclient.discovery import build

def list_policy_compliance():
    projectId = "my_project"
    zone = "my_zone"
    try:
        service = build('osconfig', 'v1alpha', cache_discovery=False)
        compliance_response = service.projects().locations(
            ).instanceOsPoliciesCompliances().list(
                parent='projects/%s/locations/%s' % (
                    projectId, zone)).execute()
        return compliance_response
    except Exception as e:
        raise Exception()
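If the project has more instances than fit in a single page, the list response carries a page token. A small extension of the same discovery-based call could loop until the token runs out; note that the 'instanceOsPoliciesCompliances' and 'nextPageToken' field names below are assumed from the v1alpha REST mapping, so treat this as a sketch rather than a verified implementation:
from googleapiclient.discovery import build

def list_policy_compliance_all(project_id, zone):
    service = build('osconfig', 'v1alpha', cache_discovery=False)
    parent = 'projects/%s/locations/%s' % (project_id, zone)
    compliances = []
    page_token = None
    while True:
        kwargs = {'parent': parent}
        if page_token:
            kwargs['pageToken'] = page_token
        response = service.projects().locations(
            ).instanceOsPoliciesCompliances().list(**kwargs).execute()
        # assumed JSON field names for the v1alpha list response
        compliances.extend(response.get('instanceOsPoliciesCompliances', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return compliances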
I have a single container with around 200k images in my blob storage. I want to write a script in Python that copies out batches of 20k of these images to new containers called something like imageset1, imageset2, ..., imageset20 (the last container will have fewer than 20k images in it, which is fine).
I have the following so far:
from azure.storage.blob import BlockBlobService
from io import BytesIO
from shutil import copyfileobj

with BytesIO() as input_blob:
    with BytesIO() as output_blob:
        block_blob_service = BlockBlobService(account_name='my_account_name', account_key='my_account_key')
        # Download as a stream
        block_blob_service.get_blob_to_stream('mycontainer', 'myinputfilename', input_blob)
        # Here is where I want to chunk up the container contents into batches of 20k
        # Then I want to write the above to a set of new containers using, I think, something like this...
        block_blob_service.create_blob_from_stream('mycontainer', 'myoutputfilename', output_blob)
It's the chunking up of the container's contents and writing the results out to new containers that I don't know how to do. Can anyone help?
Here is my sample code to realize your needs; it works on my container.
from azure.storage.blob.baseblobservice import BaseBlobService

account_name = '<your account name>'
account_key = '<your account key>'
container_name = '<the source container name>'

blob_service = BaseBlobService(
    account_name=account_name,
    account_key=account_key
)

blobs = blob_service.list_blobs(container_name)

# The target container index starts with 1
container_index = 1
# The blob number in each new container, such as 3 in my testing
num_per_container = 3
count = 0
# The prefix of new container name
prefix_of_new_container = 'imageset'
flag_of_new_container = False

for blob in blobs:
    if flag_of_new_container == False:
        flag_of_new_container = blob_service.create_container("%s%d" % (prefix_of_new_container, container_index))
    print(blob.name, "%s%d" % (prefix_of_new_container, container_index))
    blob_service.copy_blob("%s%d" % (prefix_of_new_container, container_index), blob.name, "https://%s.blob.core.windows.net/%s/%s" % (account_name, container_name, blob.name))
    count += 1
    if count == num_per_container:
        container_index += 1
        count = 0
        flag_of_new_container = False
Note: I only used BaseBlobService because it's enough for your needs, even for AppendBlob or PageBlob. You can also use BlockBlobService instead.
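If you are on the current azure-storage-blob v12 package rather than the legacy SDK above, the same idea can be sketched roughly as below (the account URL, key, and the 20,000 batch size are placeholders, and this assumes source and target containers live in the same storage account; start_copy_from_url is a server-side copy, so no image data flows through your machine):
from azure.storage.blob import BlobServiceClient

ACCOUNT_URL = "https://my_account_name.blob.core.windows.net"
ACCOUNT_KEY = "my_account_key"
SOURCE_CONTAINER = "mycontainer"
BATCH_SIZE = 20000

service = BlobServiceClient(account_url=ACCOUNT_URL, credential=ACCOUNT_KEY)
source = service.get_container_client(SOURCE_CONTAINER)

batch_index = 1
count = 0
target = None

for blob in source.list_blobs():
    if target is None:
        # Create the next imagesetN container only when there is a blob to put in it
        target = service.create_container(f"imageset{batch_index}")
    source_url = f"{ACCOUNT_URL}/{SOURCE_CONTAINER}/{blob.name}"
    # Start a server-side copy of this blob into the current target container
    target.get_blob_client(blob.name).start_copy_from_url(source_url)
    count += 1
    if count == BATCH_SIZE:
        batch_index += 1
        count = 0
        target = None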
I am using the Google python script to upload videos.
#!/usr/bin/python
import http.client #httplib
import httplib2
import os
import random
import sys
import time
from apiclient.discovery import build
from apiclient.errors import HttpError
from apiclient.http import MediaFileUpload
from oauth2client.client import flow_from_clientsecrets
from oauth2client.file import Storage
from oauth2client.tools import argparser, run_flow
# Explicitly tell the underlying HTTP transport library not to retry, since
# we are handling retry logic ourselves.
httplib2.RETRIES = 1
# Maximum number of times to retry before giving up.
MAX_RETRIES = 10
# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (httplib2.HttpLib2Error, IOError, http.client.NotConnected,
http.client.IncompleteRead, http.client.ImproperConnectionState,
http.client.CannotSendRequest, http.client.CannotSendHeader,
http.client.ResponseNotReady, http.client.BadStatusLine)
# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504]
# The CLIENT_SECRETS_FILE variable specifies the name of a file that contains
# the OAuth 2.0 information for this application, including its client_id and
# client_secret. You can acquire an OAuth 2.0 client ID and client secret from
# the Google Developers Console at
# https://console.developers.google.com/.
# Please ensure that you have enabled the YouTube Data API for your project.
# For more information about using OAuth2 to access the YouTube Data API, see:
# https://developers.google.com/youtube/v3/guides/authentication
# For more information about the client_secrets.json file format, see:
# https://developers.google.com/api-client-library/python/guide/aaa_client_secrets
CLIENT_SECRETS_FILE = "client_secrets.json"
# This OAuth 2.0 access scope allows an application to upload files to the
# authenticated user's YouTube channel, but doesn't allow other types of access.
YOUTUBE_UPLOAD_SCOPE = "https://www.googleapis.com/auth/youtube.upload"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
# This variable defines a message to display if the CLIENT_SECRETS_FILE is
# missing.
MISSING_CLIENT_SECRETS_MESSAGE = """
WARNING: Please configure OAuth 2.0
To make this sample run you will need to populate the client_secrets.json file
found at:
%s
with information from the Developers Console
https://console.developers.google.com/
For more information about the client_secrets.json file format, please visit:
https://developers.google.com/api-client-library/python/guide/aaa_client_secrets
""" % os.path.abspath(os.path.join(os.path.dirname(__file__),
CLIENT_SECRETS_FILE))
VALID_PRIVACY_STATUSES = ("public", "private", "unlisted")
def get_authenticated_service(args):
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE,
                                   scope=YOUTUBE_UPLOAD_SCOPE,
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)

    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()

    if credentials is None or credentials.invalid:
        credentials = run_flow(flow, storage, args)

    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))
def initialize_upload(youtube, options):
    tags = None
    if options.keywords:
        tags = options.keywords.split(",")

    body = dict(
        snippet=dict(
            title=options.title,
            description=options.description,
            tags=tags,
            categoryId=options.category
        ),
        status=dict(
            privacyStatus=options.privacyStatus
        )
    )

    # Call the API's videos.insert method to create and upload the video.
    insert_request = youtube.videos().insert(
        part=",".join(body.keys()),
        body=body,
        # The chunksize parameter specifies the size of each chunk of data, in
        # bytes, that will be uploaded at a time. Set a higher value for
        # reliable connections as fewer chunks lead to faster uploads. Set a lower
        # value for better recovery on less reliable connections.
        #
        # Setting "chunksize" equal to -1 in the code below means that the entire
        # file will be uploaded in a single HTTP request. (If the upload fails,
        # it will still be retried where it left off.) This is usually a best
        # practice, but if you're using Python older than 2.6 or if you're
        # running on App Engine, you should set the chunksize to something like
        # 1024 * 1024 (1 megabyte).
        media_body=MediaFileUpload(options.file, chunksize=-1, resumable=True)
    )

    resumable_upload(insert_request)
# This method implements an exponential backoff strategy to resume a
# failed upload.
def resumable_upload(insert_request):
    response = None
    error = None
    retry = 0
    while response is None:
        try:
            print("Uploading file...")
            status, response = insert_request.next_chunk()
            if 'id' in response:
                print("Video id '%s' was successfully uploaded." % response['id'])
            else:
                exit("The upload failed with an unexpected response: %s" % response)
        except HttpError as e:
            if e.resp.status in RETRIABLE_STATUS_CODES:
                error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status,
                                                                     e.content)
            else:
                raise
        except RETRIABLE_EXCEPTIONS as e:
            error = "A retriable error occurred: %s" % e

        if error is not None:
            print(error)
            retry += 1
            if retry > MAX_RETRIES:
                exit("No longer attempting to retry.")

            max_sleep = 2 ** retry
            sleep_seconds = random.random() * max_sleep
            print("Sleeping %f seconds and then retrying..." % sleep_seconds)
            time.sleep(sleep_seconds)
if __name__ == '__main__':
    argparser.add_argument("--file", required=True, help="Video file to upload")
    argparser.add_argument("--title", help="Video title", default="Test Title")
    argparser.add_argument("--description", help="Video description",
                           default="Test Description")
    argparser.add_argument("--category", default="22",
                           help="Numeric video category. " +
                                "See https://developers.google.com/youtube/v3/docs/videoCategories/list")
    argparser.add_argument("--keywords", help="Video keywords, comma separated",
                           default="")
    argparser.add_argument("--privacyStatus", choices=VALID_PRIVACY_STATUSES,
                           default=VALID_PRIVACY_STATUSES[0], help="Video privacy status.")
    args = argparser.parse_args()

    if not os.path.exists(args.file):
        exit("Please specify a valid file using the --file= parameter.")

    youtube = get_authenticated_service(args)
    try:
        initialize_upload(youtube, args)
    except HttpError as e:
        print("An HTTP error %d occurred:\n%s" % (e.resp.status, e.content))
The problem is the --description parameter. It only accepts a single line of text, and I need to include several lines with line breaks ('\n'). Is it possible to do this another way?
It would be wonderful if this parameter (or another parameter) accepted the path of a text file containing the description, the way the --file parameter does.
Is there something I can do to solve this?
Or is there somewhere I can contact the Google developers to ask whether the initialize_upload(youtube, args) function could be reimplemented to work the way I describe?
Yes, it is possible!
We have to add a --description-file option ourselves.
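A minimal sketch of that change (the --description-file flag and the file-reading logic are my additions, not part of Google's sample) is to register the argument in the __main__ block and, when it is given, read the file's contents into args.description before initialize_upload builds the request body. Newlines in the file are preserved, so the description keeps its line breaks:
# In the __main__ block, next to the other argparser.add_argument calls:
argparser.add_argument("--description-file", dest="description_file", default=None,
                       help="Path to a text file whose contents (including newlines) "
                            "will be used as the video description")

# After args = argparser.parse_args(), before calling initialize_upload(youtube, args):
if args.description_file:
    with open(args.description_file, "r") as f:
        args.description = f.read()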
Google, please publish a complete manual for your API!