How to upload folder from local to GCP bucket using python - python-3.x

I am following this link and getting some error:
How to upload folder on Google Cloud Storage using Python API
I have saved model in container environment and from there I want to copy to GCP bucket.
Here is my code:
storage_client = storage.Client(project='*****')
def upload_local_directory_to_gcs(local_path, bucket, gcs_path):
bucket = storage_client.bucket(bucket)
assert os.path.isdir(local_path)
for local_file in glob.glob(local_path + '/**'):
print(local_file)
print("this is bucket",bucket)
blob = bucket.blob(gcs_path)
print("here")
blob.upload_from_filename(local_file)
print("done")
path="/pythonPackage/trainer/model_mlm_demo" #this is local absolute path where my folder is. Folder name is **model_mlm_demo**
buc="py*****" #this is my GCP bucket address
gcs="model_mlm_demo2/" #this is the new folder that I want to store files in GCP
upload_local_directory_to_gcs(local_path=path, bucket=buc, gcs_path=gcs)
/pythonPackage/trainer/model_mlm_demo has 3 files in it config, model.bin and arguments.bin`
ERROR
The codes doesn't throw any error, but there is no files uploaded in GCP bucket. It just creates empty folder.

What I can see the error is, you don't need to pass the gs:// as the bucket parameter. Actually, here is an example you may need to check out,
https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
def upload_blob(bucket_name, source_file_name, destination_blob_name):
"""Uploads a file to the bucket."""
# The ID of your GCS bucket
# bucket_name = "your-bucket-name"
# The path to your file to upload
# source_file_name = "local/path/to/file"
# The ID of your GCS object
# destination_blob_name = "storage-object-name"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)
print(
"File {} uploaded to {}.".format(
source_file_name, destination_blob_name
)
)

I have reproduced your issue and the below code snippet works fine. I have updated the code based on folders and names you have mentioned in the question. Let me know if you have any issues.
import os
import glob
from google.cloud import storage
storage_client = storage.Client(project='')
def upload_local_directory_to_gcs(local_path, bucket, gcs_path):
bucket = storage_client.bucket(bucket)
assert os.path.isdir(local_path)
for local_file in glob.glob(local_path + '/**'):
print(local_file)
print("this is bucket", bucket)
filename=local_file.split('/')[-1]
blob = bucket.blob(gcs_path+filename)
print("here")
blob.upload_from_filename(local_file)
print("done")
# this is local absolute path where my folder is. Folder name is **model_mlm_demo**
path = "/pythonPackage/trainer/model_mlm_demo"
buc = "py*****" # this is my GCP bucket address
gcs = "model_mlm_demo2/" # this is the new folder that I want to store files in GCP
upload_local_directory_to_gcs(local_path=path, bucket=buc, gcs_path=gcs)

I just came across the gcsfs library which seems to be also about better interfaces
You could copy an entire directory into a gcs location like this:
def upload_to_gcs(src_dir: str, gcs_dst: str):
fs = gcsfs.GCSFileSystem()
fs.put(src_dir, gcs_dst, recursive=True)

I figured out a way using subprocess to upload model artefacts in GCP bucket.
import subprocess
subprocess.call('gsutil cp -r source_folder_in_local gs://*****/folder_name', shell=True, stdout=subprocess.PIPE)
If gsutil is not installed. You can install using this link:
https://cloud.google.com/storage/docs/gsutil_install

Related

Python code to download list of csv files from Azure Blob Storage using SAS token

I am trying to to download a list of csv files from an Azure Blob Storage using a shared SAS token, but I am getting all sorts of errors.
I tried looking this up and tried multiple code samples from contributors on Slackoverflow and Azure documentation. here is the final state of the code sample I constructed from those sources! It tries to download the list of csv files in a pooled manner (blob storage contains 200 csv files):
NB: I left commented code snippets to show different approaches I tried testing. sorry if they are confusing!
from itertools import tee
from multiprocessing import Process
from multiprocessing.pool import ThreadPool
import os
from azure.storage.blob import BlobServiceClient, BlobClient
from azure.storage.blob import ContentSettings, ContainerClient
#from azure.storage.blob import BlockBlobService
STORAGEACCOUNTURL = "https://myaccount.blob.core.windows.net"
STORAGEACCOUNTKEY = "sv=2020-08-04&si=blobpolicyXYZ&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONTAINERNAME = "mycontainer"
##BLOBNAME = "??"
sas_url = 'https://myaccount.blob.core.windows.net/mycontainer/mydir?sv=2020-08-04&si=blobpolicyXYZ&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
LOCAL_BLOB_PATH = "./downloads"
class AzureBlobFileDownloader:
def __init__(self):
print("Intializing AzureBlobFileDownloader")
# Initialize the connection to Azure storage account
self.blob_service_client_instance = ContainerClient.from_container_url #BlobClient.from_blob_url(sas_url) #BlobServiceClient(account_url=STORAGEACCOUNTURL, credential=STORAGEACCOUNTKEY)
#self.blob_client_instance = self.blob_service_client_instance.get_blob_client(CONTAINERNAME, BLOBNAME)
#self.blob_service_client = BlobServiceClient.from_connection_string(MY_CONNECTION_STRING)
#self.my_container = self.blob_service_client.get_container_client(MY_BLOB_CONTAINER)
#self.blob_service_client = BlockBlobService("storage_account",sas_token="?sv=2018-03-28&ss=bfqt&srt=sco&sp=rwdlacup&se=2019-04-24T10:01:58Z&st=2019-04-23T02:01:58Z&spr=https&sig=xxxxxxxxx")
#self.my_container = self.blob_service_client.get_blob_to_path("container_name","blob_name","local_file_path")
def save_blob(self,file_name,file_content):
# Get full path to the file
download_file_path = os.path.join(LOCAL_BLOB_PATH, file_name)
# for nested blobs, create local path as well!
os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
with open(download_file_path, "wb") as file:
file.write(file_content)
def download_all_blobs_in_container(self):
# get a list of blobs
my_blobs = self.blob_service_client_instance.get_block_list() #list_blobs() #self.blob_client_instance.list_blobs() download_blob() #
print(my_blobs)
#iterate through the iterable object for testing purposes, maybe wrong approach!
result, result_backup = tee(my_blobs)
print("**first iterate**")
for i, r in enumerate(result):
print(r)
#start downloading my_blobs
result = self.run(my_blobs)
print(result)
def run(self,blobs):
# Download 3 files at a time!
with ThreadPool(processes=int(3)) as pool:
return pool.map(self.save_blob_locally, blobs)
def save_blob_locally(self,blob):
file_name = blob.name
print(file_name)
bytes = self.blob_service_client_instance.get_blob_client(CONTAINERNAME,blob).download_blob().readall()
# Get full path to the file
download_file_path = os.path.join(LOCAL_BLOB_PATH, file_name)
# for nested blobs, create local path as well!
os.makedirs(os.path.dirname(download_file_path), exist_ok=True)
with open(download_file_path, "wb") as file:
file.write(bytes)
return file_name
# Initialize class and download files
azure_blob_file_downloader = AzureBlobFileDownloader()
azure_blob_file_downloader.download_all_blobs_in_container()
could someone help me get to achieve this task in python:
get a list of all files in the blob storage, those files names are prefixed with part-
download them to a folder locally
thanks
could someone help me get to achieve this task in python:
get a list of all files in the blob storage, those files names are prefixed with part-
To List all the blobs whose prefix is "part-" you can use blob_service.list_blobs(<Container Name>, prefix="<Your Prefix>"). Below is the code to get the list of blobs for the same.
print("\nList blobs in the container")
generator = blob_service.list_blobs(CONTAINER_NAME, prefix="part-")
for blob in generator:
print("\t Blob name: " + blob.name)
download them to a folder locally
To download the blob you can use blob_client = blob_service.get_blob_to_path(<Container Name>,<Blob Name>,<File Path>). Below is the code to download the blob as per your requirement.
blob_client = blob_service.get_blob_to_path(CONTAINER_NAME,blob.name,fname)
Below is the complete code that worked for us which achieves your requirement.
import os
from azure.storage.blob import BlockBlobService
ACCOUNT_NAME = "<Your_ACCOUNT_NAME>"
ACCOUNT_KEY = "<YOUR_ACCOUNT_KEY>"
CONTAINER_NAME = "<YOUR_CONTAINER_NAME>"
LOCAL_BLOB_PATH = "C:\\<YOUR_PATH>\\downloadedfiles"
blob_service = BlockBlobService(ACCOUNT_NAME, ACCOUNT_KEY)
# Lists All Blobs which has a prefic of part-
print("\nList blobs in the container")
generator = blob_service.list_blobs(CONTAINER_NAME, prefix="part-")
for blob in generator:
print("\t Blob name: " + blob.name)
# Downloading the blob to a folder
for blob in generator:
# Adds blob name to the path
fname = os.path.join(LOCAL_BLOB_PATH, blob.name)
print(f'Downloading {blob.name} to {fname}')
# Downloading blob into file
blob_client = blob_service.get_blob_to_path(CONTAINER_NAME,blob.name,fname)
RESULT :
Files in my Storage Account
Files in my Local Folder
Updated Answer
blob_service = BlockBlobService(account_name=ACCOUNT_NAME,account_key=None,sas_token=SAS_TOKEN)
# Lists All Blobs which has a prefic of part-
print("\nList blobs in the container")
generator = blob_service.list_blobs(CONTAINER_NAME, prefix="directory1"+"/"+"part-")
for blob in generator:
print("\t Blob name: " + blob.name)
# Downloading the blob to a folder
for blob in generator:
# Adds blob name to the path
fname = os.path.join(LOCAL_BLOB_PATH, blob.name)
print(f'Downloading {blob.name} to {fname}')
# Downloading blob into file
blob_client = blob_service.get_blob_to_path(CONTAINER_NAME,blob.name,fname)

List files in azure file storage and get the most recent file

I have files in azure file storage, so I listed the files by the date upload and then I want to select the most recent file uploaded.
So to do this, I created a function that should have returned me, the list of the files. However when I see the output, it return only one file and the other are missing.
Here is my code:
file_service = FileService(account_name='', account_key='')
generator = list(file_service.list_directories_and_files(''))
def list_files_in(generator,file_service):
list_files=[]
for file_or_dir in generator:
file_in = file_service.get_file_properties(share_name='', directory_name="", file_name=file_or_dir.name)
file_date= file_in.properties.last_modified.date()
list_tuple = (file_date,file_or_dir.name)
list_files.append(list_tuple)
return list_files
To get the latest files in azure blob you need to write the logic to get that, below are the code which will give you the same :
For Files
result = file_service.list_directories_and_files(share_name, directory_name)
for file_or_dir in result:
if isinstance(file_or_dir, File):
file = file_service.get_file_properties(share_name, directory_name, file_name, timeout=None, snapshot=None)
print(file_or_dir.name, file.properties.last_modified)
For Blob
from azure.storage.blob import ContainerClient
container = ContainerClient.from_connection_string(conn_str={your_connection_string}, container_name = {your_container_name})
for blob in container.list_blobs():
print(f'{blob.name} : {blob.last_modified}')
You can get the key's and account details from the azure portal for the account access. In this blob.last_modified will give you latest blob item.

Move file from /tmp folder to Google Cloud Storage bucket

I originally posted this question when I was having trouble getting my python cloud function to create and write to a new file. Since then I've managed to create a csv in the /tmp directory but am struggling to find a way to move that file into my bucket's folder where the original csv was uploaded.
Is it possible to do this? I've looked through the Google Cloud Storage docs and tried using the blob.download_to_filename() and bucket.copy_blob() methods but am currently getting the error: FileNotFoundError: [Errno 2] No such file or directory: 'my-project.appspot.com/my-folder/my-converted-file.csv'
Appreciate any help or advice!
to move that file into my bucket
Here is an example. Bear in mind:
Don't copy and paste without thinking.
The code snippet is only to show the idea - it won't work as is. Modifications are required to fit into your context and requirements.
The _crc32sum function was not developed by me.
I did not test the code. It is just from my head with copying some elements from different public sources.
Here is the code:
import base64
import crc32c
import os
from google.cloud import exceptions
from google.cloud import storage
# =====> ==============================
# a function to calculate crc32c hash
def _crc32sum(filename: str, blocksize: int = 65536) -> int:
"""Calculate the crc32c hash for a file with the provided name
:param filename: the name of the file
:param blocksize: the size of the block for the file reading
:return: the calculated crc32c hash for the given file
"""
checksum = 0
with open(filename, "rb") as f_ref:
for block in iter(lambda: f_ref.read(blocksize), b""):
checksum = crc32c.crc32(block, checksum)
return checksum & 0xffffffff
# =====> ==============================
# use the default project in the client initialisation
CS = storage.Client()
lcl_file_name = "/tmp/my-local-file.csv"
tgt_bucket_name = "my-bucket-name"
tgt_object_name = "prefix/another-prefix/my-target-file.csv"
# =====> ==============================
# =====> ==============================
# =====> the process strats here
# https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/client.html#Client.lookup_bucket
gcs_tgt_bucket_ref = CS.lookup_bucket(tgt_bucket_name)
# check if the target bucket does exist
if gcs_tgt_bucket_ref is None:
# handle incorrect bucket name or its absence
# most likely we are to finish the execution here rather than 'pass'
pass
# calculate the hash for the local file
lcl_crc32c = _crc32sum(lcl_file_name)
base64_crc32c = base64.b64encode(lcl_crc32c.to_bytes(
length=4, byteorder='big')).decode('utf-8')
# check if the file/object in the bucket already exists
# https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/bucket.html#Bucket.blob
gcs_file_ref = gcs_tgt_bucket_ref.blob(tgt_object_name)
# https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/blob.html#Blob.exists
if gcs_file_ref.exists():
gcs_file_ref.reload()
# compare crc32c hashes - between the local file and the gcs file/object
if base64_crc32c != gcs_file_ref.crc32c:
# the blob file/object in the GCS has a different hash
# the blob file/object should be deleted and a new one to be uploaded
# https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/blob.html#Blob.delete
gcs_file_ref.delete()
else:
# the file/object is already in the bucket
# most likely we are to finish the execution here rather than 'pass'
pass
# upload file to the target bucket
# reinit the reference in case the target file/object was deleted
gcs_file_ref = gcs_tgt_bucket_ref.blob(tgt_file_name)
gcs_file_ref.crc32c = base64_crc32c
with open(lcl_file_name, 'rb') as file_obj:
try:
gcs_file_ref.metadata = {
"custom-metadata-key": "custom-metadata-value"
}
# https://googleapis.dev/python/storage/latest/_modules/google/cloud/storage/blob.html#Blob.upload_from_file
gcs_file_ref.upload_from_file(
file_obj=file_obj, content_type="text/csv", checksum="crc32c")
except exceptions.GoogleCloudError as gc_err:
# handle the exception here
# don't forget to delete the local file if it is not required anymore
# most likely we are to finish the execution here rather than 'pass'
pass
# clean behind
if lcl_file_name and os.path.exists(lcl_file_name):
os.remove(lcl_file_name)
# =====> the process ends here
# =====> ==============================
Let me know if there are significant mistakes, and I modify the example.

Upload Gzip file using Boto3

i am trying to upload files to S3 before that i am trying to Gzip files, if you see the code below, the files uploaded to the S3 have no change in the size, so i am trying to figure out if i have missed something.
import gzip
import shutil
from io import BytesIO
def upload_gzipped(bucket, key, fp, compressed_fp=None, content_type='text/plain'):
"""Compress and upload the contents from fp to S3.
If compressed_fp is None, the compression is performed in memory.
"""
if not compressed_fp:
compressed_fp = BytesIO()
with gzip.GzipFile(fileobj=compressed_fp, mode='wb') as gz:
shutil.copyfileobj(fp, gz)
compressed_fp.seek(0)
bucket.upload_fileobj(
compressed_fp,
key,
{'ContentType': content_type, 'ContentEncoding': 'gzip'})
Courtesy Link for the source
And this is how i am using this fucntion, so basically reading files as stream from SFTP and then trying to Gzip them and then write them to S3.
with pysftp.Connection(host_name, username=user, password=password, cnopts=cnopts, port=int(port)) as sftp:
list_of_files = sftp.listdir('{}{}'.format(base_path, file_path))
is_file_found = False
for file_name in list_of_files:
if entity_name in str(file_name.lower()):
is_file_found = True
flo = BytesIO()
# Step 1: Read File Using SFTP as input Stream
sftp.getfo('{}{}/{}'.format(base_path, file_path, file_name), flo)
s3_destination_key = '{}/{}'.format(s3_path, file_name)
# Step 2: Write files to desitination S3
logger.info('Moving file to S3 {} '.format(s3_destination_key))
# Creating a bucket resource to use bucket object for file upload
input_bucket_object = S3.Bucket(environment_config['S3_INBOX_BUCKET'])
flo.seek(0)
upload_gzipped(input_bucket_object, s3_destination_key, flo)
It seems like the upload_gzipped function uses shutil.copyfileobj incorrectly.
Looking at https://docs.python.org/3/library/shutil.html#shutil.copyfileobj shows that you put the source first, and destination second.
Also, you're just writing your object to a gzipped object without ever actually compressing it.
You need to compress fp into a Gzip object, then upload that specific object to S3.
I'd recommend not using that gist from github as it seems wrong.

Google cloud function with wand stopped working

I have set up 3 Google Cloud Storge buckets and 3 functions (one for each bucket) that will trigger when a PDF file is uploaded to a bucket. Functions convert PDF to png image and do further processing.
When I am trying to create a 4th bucket and similar function, strangely it is not working. Even if I copy one of the existing 3 functions, it is still not working and I am getting this error:
Traceback (most recent call last): File "/env/local/lib/python3.7/site-packages/google/cloud/functions_v1beta2/worker.py", line 333, in run_background_function _function_handler.invoke_user_function(event_object) File "/env/local/lib/python3.7/site-packages/google/cloud/functions_v1beta2/worker.py", line 199, in invoke_user_function return call_user_function(request_or_event) File "/env/local/lib/python3.7/site-packages/google/cloud/functions_v1beta2/worker.py", line 196, in call_user_function event_context.Context(**request_or_event.context)) File "/user_code/main.py", line 27, in pdf_to_img with Image(filename=tmp_pdf, resolution=300) as image: File "/env/local/lib/python3.7/site-packages/wand/image.py", line 2874, in __init__ self.read(filename=filename, resolution=resolution) File "/env/local/lib/python3.7/site-packages/wand/image.py", line 2952, in read self.raise_exception() File "/env/local/lib/python3.7/site-packages/wand/resource.py", line 222, in raise_exception raise e wand.exceptions.PolicyError: not authorized/tmp/tmphm3hiezy' # error/constitute.c/ReadImage/412`
It is baffling me why same functions are working on existing buckets but not on new one.
UPDATE:
Even this is not working (getting "cache resources exhausted" error):
In requirements.txt:
google-cloud-storage
wand
In main.py:
import tempfile
from google.cloud import storage
from wand.image import Image
storage_client = storage.Client()
def pdf_to_img(data, context):
file_data = data
pdf = file_data['name']
if pdf.startswith('v-'):
return
bucket_name = file_data['bucket']
blob = storage_client.bucket(bucket_name).get_blob(pdf)
_, tmp_pdf = tempfile.mkstemp()
_, tmp_png = tempfile.mkstemp()
tmp_png = tmp_png+".png"
blob.download_to_filename(tmp_pdf)
with Image(filename=tmp_pdf) as image:
image.save(filename=tmp_png)
print("Image created")
new_file_name = "v-"+pdf.split('.')[0]+".png"
blob.bucket.blob(new_file_name).upload_from_filename(tmp_png)
Above code is supposed to just create a copy of image file which is uploaded to bucket.
Because the vulnerability has been fixed in Ghostscript but not updated in ImageMagick, the workaround for converting PDFs to images in Google Cloud Functions is to use this ghostscript wrapper and directly request the PDF conversion to png from Ghostscript (bypassing ImageMagick).
requirements.txt
google-cloud-storage
ghostscript==0.6
main.py
import locale
import tempfile
import ghostscript
from google.cloud import storage
storage_client = storage.Client()
def pdf_to_img(data, context):
file_data = data
pdf = file_data['name']
if pdf.startswith('v-'):
return
bucket_name = file_data['bucket']
blob = storage_client.bucket(bucket_name).get_blob(pdf)
_, tmp_pdf = tempfile.mkstemp()
_, tmp_png = tempfile.mkstemp()
tmp_png = tmp_png+".png"
blob.download_to_filename(tmp_pdf)
# create a temp folder based on temp_local_filename
# use ghostscript to export the pdf into pages as pngs in the temp dir
args = [
"pdf2png", # actual value doesn't matter
"-dSAFER",
"-sDEVICE=pngalpha",
"-o", tmp_png,
"-r300", tmp_pdf
]
# the above arguments have to be bytes, encode them
encoding = locale.getpreferredencoding()
args = [a.encode(encoding) for a in args]
#run the request through ghostscript
ghostscript.Ghostscript(*args)
print("Image created")
new_file_name = "v-"+pdf.split('.')[0]+".png"
blob.bucket.blob(new_file_name).upload_from_filename(tmp_png)
Anyway, this gets you around the issue and keeps all the processing in GCF for you. Hope it helps. Your code works for single page PDFs though. My use-case was for multipage pdf conversion, ghostscript code & solution in this question.
This actually seems to be a show stopper for ImageMagick related functionalities using PDF format. Similar code deployed by us on Google App engine via custom docker is failing with the same error on missing authorizations.
I am not sure how to edit the policy.xml file on GAE or GCF but a line there has to be changed to:
<policy domain="coder" rights="read|write" pattern="PDF" />
#Dustin: Do you have a bug link where we can see the progress ?
Update:
I fixed it on my Google app engine container by adding a line in docker image. This directly changes the policy.xml file content after imagemagick gets installed.
RUN sed -i 's/rights="none"/rights="read|write"/g' /etc/ImageMagick-6/policy.xml
This is an upstream bug in Ubuntu, we are working on a workaround for App Engine and Cloud Functions.
While we wait for the issue to be resolved in Ubuntu, I followed #DustinIngram's suggestion and created a virtual machine in Compute Engine with an ImageMagick installation. The downside is that I now have a second API that my API in App Engine has to call, just to generate the images. Having said that, it's working fine for me. This is my setup:
Main API:
When a pdf file is uploaded to Cloud Storage, I call the following:
response = requests.post('http://xx.xxx.xxx.xxx:5000/makeimages', data=data)
Where data is a JSON string with the format {"file_name": file_name}
On the API that is running on the VM, the POST request gets processed as follows:
#app.route('/makeimages', methods=['POST'])
def pdf_to_jpg():
file_name = request.form['file_name']
blob = storage_client.bucket(bucket_name).get_blob(file_name)
_, temp_local_filename = tempfile.mkstemp()
temp_local_filename_jpeg = temp_local_filename + '.jpg'
# Download file from bucket.
blob.download_to_filename(temp_local_filename)
print('Image ' + file_name + ' was downloaded to ' + temp_local_filename)
with Image(filename=temp_local_filename, resolution=300) as img:
pg_num = 0
image_files = {}
image_files['pages'] = []
for img_page in img.sequence:
img_page_2 = Image(image=img_page)
img_page_2.format = 'jpeg'
img_page_2.compression_quality = 70
img_page_2.save(filename=temp_local_filename_jpeg)
new_file_name = file_name.replace('.pdf', 'p') + str(pg_num) + '.jpg'
new_blob = blob.bucket.blob(new_file_name)
new_blob.upload_from_filename(temp_local_filename_jpeg)
print('Page ' + str(pg_num) + ' was saved as ' + new_file_name)
image_files['pages'].append({'page': pg_num, 'file_name': new_file_name})
pg_num += 1
try:
os.remove(temp_local_filename)
except (ValueError, PermissionError):
print('Could not delete the temp file!')
return jsonify(image_files)
This will download the pdf from Cloud Storage, create an image for each page, and save them back to cloud storage. The API will then return a JSON file with the list of image files created.
So, not the most elegant solution, but at least I don't need to convert the files manually.

Resources