Creating a video thumbnail with a Google Cloud Function when the object is under a sub-directory path - python-3.x

I am uploading videos through a REST API into a Google Cloud Storage bucket and added a Cloud Function to generate a thumbnail with Python. The code works when the video sits in the bucket root, but my videos are uploaded into a sub/sub/ directory, so the function fails.
import os
from google.cloud import storage
from subprocess import check_output
from videoprops import get_video_properties

client = storage.Client()


def hello_gcs(data, context):
    print(context)
    print(data)
    if data['contentType'].startswith('video/'):
        bucket = client.get_bucket(data['bucket'])
        name = data['name']
        file_name = '/tmp/' + name
        print(file_name)
        thumbnail_file_name = '/tmp/' + name.split('.')[0] + '.jpg'
        print(thumbnail_file_name)
        try:
            os.remove(file_name)
        except OSError:
            pass
        try:
            os.remove(thumbnail_file_name)
        except OSError:
            pass
        print("File has been removed")
        blob = bucket.get_blob(name)
        blob.download_to_filename(file_name)
        print("Video Downloaded")
        props = get_video_properties(file_name)
        if os.path.exists(file_name):
            print("NEW MP4 EXISTS")
            check_output('ffmpeg -itsoffset -4 -i ' + file_name + ' -vcodec mjpeg -vframes 1 -an -f rawvideo -s ' + str(props['width']) + 'x' + str(props['height']) + ' ' + thumbnail_file_name, shell=True)
            thumbnail_blob = bucket.blob('thumbnail.jpg')
            thumbnail_blob.upload_from_filename(thumbnail_file_name)
        else:
            print("MP4 not created")
        print("uploaded")
    else:
        print("Not a Video")
So I can only write directly under /tmp; I am not able to create a nested local path like /tmp/Upload/Video/232/video.mp4.
Thanks
Dharmesh

Here is my code that works for videos in sub-directories: it generates the thumbnail and uploads it into the same directory as the video.
import os
from google.cloud import storage
from subprocess import check_output
from videoprops import get_video_properties

client = storage.Client()


def hello_gcs(data, context):
    print(context)
    print(data)
    if data['contentType'].startswith('video/'):
        bucket = client.get_bucket(data['bucket'])
        name = data['name']
        os.makedirs('/tmp/' + os.path.dirname(name), exist_ok=True)
        file_name = '/tmp/' + name
        print(file_name)
        thumbnail_file_name = '/tmp/' + name.split('.')[0] + '.jpg'
        print(thumbnail_file_name)
        try:
            os.remove(file_name)
        except OSError:
            pass
        try:
            os.remove(thumbnail_file_name)
        except OSError:
            pass
        print("File has been removed")
        blob = bucket.get_blob(name)
        blob.download_to_filename(file_name)
        print("Video Downloaded")
        props = get_video_properties(file_name)
        if os.path.exists(file_name):
            print("NEW MP4 EXISTS")
            check_output('ffmpeg -itsoffset -4 -i ' + file_name + ' -vcodec mjpeg -vframes 1 -an -f rawvideo -s ' + str(props['width']) + 'x' + str(props['height']) + ' ' + thumbnail_file_name, shell=True)
            thumbnail_blob = bucket.blob(os.path.dirname(name) + '/thumbnail.jpg')
            thumbnail_blob.upload_from_filename(thumbnail_file_name)
        else:
            print("MP4 not created")
        print("uploaded")
    else:
        print("Not a Video")
requirements.txt:
google-cloud-storage
get-video-properties
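The key change is the os.makedirs(...) call: download_to_filename() writes to the local path and cannot create missing parent directories, so the object's folder structure has to be mirrored under /tmp first. As a side note, here is a minimal, hypothetical helper (not part of the code above) that builds the same paths with os.path.join and os.path.splitext; splitext only strips the final extension, so a dot elsewhere in the object name does not truncate the thumbnail path:

import os

def local_paths_for(name):
    # Mirror the object's directories under /tmp so the download has somewhere to land.
    local_video = os.path.join('/tmp', name)
    os.makedirs(os.path.dirname(local_video), exist_ok=True)
    # Replace only the final extension when deriving the thumbnail name.
    base, _ = os.path.splitext(local_video)
    return local_video, base + '.jpg'

# e.g. name = 'Upload/Video/232/video.mp4'
# -> ('/tmp/Upload/Video/232/video.mp4', '/tmp/Upload/Video/232/video.jpg')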

Related

How to find where an S3 multipart upload is failing in Python?

I am implementing a cron job that uploads a large daily backup file to an S3 bucket. It works most of the time, but every once in a while I check the bucket and the file size is significantly smaller than the actual size.
It should be roughly 50 GB, but the last time it happened it showed 34 GB. My main problem is that I am unsure of what error to try/except.
I am still learning Python as I go, so bear with me.
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig

bucket = "my-s3-backup"
s3 = boto3.resource('s3')

# Grabbing the last file, and removing the full path from the string
pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name


# Multipart upload function
def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload))
        # Custom Slack notification to inform completion
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        # Custom Slack notification to inform of failure
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))


if __name__ == '__main__':
    multi_part_upload()
If the script is not "failing," but it's not uploading the complete file size, what exception am I trying to catch here? Should I log output somewhere?
I'm looking through the Botocore Exceptions documentation. I'm just unsure of what to try/catch with this.
For reference, here is the file size difference:
aws s3 ls --summarize --human-readable --recursive s3://my-s3-backup/physical_db_backups/
2022-05-07 14:31:28 50.7 GiB physical_db_backups/xb_202205070101.xb.zst
2022-05-08 12:48:07 50.8 GiB physical_db_backups/xb_202205080101.xb.zst
2022-05-09 01:30:04 34.2 GiB physical_db_backups/xb_202205090101.xb.zst <--- WRONG
Alright, since I was an idiot and didn't realize the backup file had not finished being written yet, I made a couple of changes:
1. I edited the cron job to start later.
2. I added logic to determine whether the backup script is still running.
I may incorporate additional checks to make sure the file exists, but for now this is a working, tested POC.
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
from time import sleep
import psutil
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
import logging

bucket = "fsn-s3-backup"
s3 = boto3.resource('s3')

pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

logging.basicConfig(filename='/var/log/s3-backup.log',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    filemode='a')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload),
                                   ExtraArgs={'ContentType': 'application/zstd'})
        logger.info("Physical Backup to S3 Complete")
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        logger.error("Physical Backup to S3 Failed: " + str(error))
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))


def checkIfProcessRunning(processName):
    for proc in psutil.process_iter():
        try:
            cmdline = proc.cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            continue  # process ended or is not accessible; skip it
        if processName in cmdline:
            return True
    return False


if __name__ == '__main__':
    backuprunning = True
    while backuprunning:
        logger.info("Checking if backup shell script is running")
        if checkIfProcessRunning('/path/to/physical_backup.sh'):
            logger.info("Backup shell script still running. Sleeping for 60s")
            sleep(60)
        else:
            backuprunning = False
            logger.info("Beginning multipart upload")
            multi_part_upload()
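For the "additional checks" mentioned above, one hedged option is to compare the local file size with what S3 reports after the upload; head_object returns the object's ContentLength. A rough sketch, reusing the s3, logger, bucket, key_path and file_to_upload names from the script above (the verify_upload helper itself is hypothetical):

def verify_upload(local_path, bucket_name, key):
    # Compare the local size with the size S3 reports; a mismatch means the object is incomplete.
    local_size = os.path.getsize(local_path)
    remote_size = s3.meta.client.head_object(Bucket=bucket_name, Key=key)['ContentLength']
    if local_size != remote_size:
        logger.error("Size mismatch after upload: local=%s remote=%s", local_size, remote_size)
        return False
    logger.info("Upload verified: %s bytes", remote_size)
    return True

For example, calling verify_upload(file_to_upload, bucket, key_path) right after upload_file() succeeds would catch the truncated-object case shown in the listing above.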

Converting docx to pdf using LibreOffice on AWS Lambda fails with font families other than Times Roman

I am converting docx to pdf using LibreOffice on AWS Lambda, but the conversion fails whenever the document uses a font family other than "Times Roman". Please recommend another library, rather than installing new fonts for each particular font family.
Code:
import os
from io import BytesIO
import tarfile
import boto3
import subprocess
import brotli

libre_office_install_dir = '/tmp/instdir'


def load_libre_office():
    if os.path.exists(libre_office_install_dir) and os.path.isdir(libre_office_install_dir):
        print('We have a cached copy of LibreOffice, skipping extraction')
    else:
        print('No cached copy of LibreOffice, extracting tar stream from Brotli file.')
        buffer = BytesIO()
        with open('/opt/lo.tar.br', 'rb') as brotli_file:
            d = brotli.Decompressor()
            while True:
                chunk = brotli_file.read(1024)
                buffer.write(d.decompress(chunk))
                if len(chunk) < 1024:
                    break
            buffer.seek(0)
        print('Extracting tar stream to /tmp for caching.')
        with tarfile.open(fileobj=buffer) as tar:
            tar.extractall('/tmp')
        print('Done caching LibreOffice!')
    return f'{libre_office_install_dir}/program/soffice.bin'


def download_from_s3(bucket, key, download_path):
    s3 = boto3.client("s3")
    s3.download_file(bucket, key, download_path)


def upload_to_s3(file_path, bucket, key):
    s3 = boto3.client("s3")
    s3.upload_file(file_path, bucket, key)


def convert_word_to_pdf(soffice_path, word_file_path, output_dir):
    conv_cmd = f"{soffice_path} --headless --norestore --invisible --nodefault --nofirststartwizard --nolockcheck --nologo --convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}"
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if response.returncode != 0:
            return False
    return True


def lambda_handler(event, context):
    bucket = "xxxx"
    key = "xxxx/xxxx/xxxx/xxxx/SampleDoc.docx"
    key_prefix, base_name = os.path.split(key)
    download_path = f"/tmp/{base_name}"
    output_dir = "/tmp"
    download_from_s3(bucket, key, download_path)
    soffice_path = load_libre_office()
    is_converted = convert_word_to_pdf(soffice_path, download_path, output_dir)
    if is_converted:
        file_name, _ = os.path.splitext(base_name)
        upload_to_s3(f"{output_dir}/{file_name}.pdf", bucket, f"{key_prefix}/{file_name}.pdf")
        return {"response": "file converted to PDF and available at same S3 location of input key"}
    else:
        return {"response": "cannot convert this document to PDF"}
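One observation about the code above, offered as a sketch rather than a confirmed fix: convert_word_to_pdf() captures stderr and then discards it, so whatever LibreOffice reports about the failing conversion never reaches the logs. A small hypothetical variant (same command, same subprocess import as above) that prints soffice's own output can at least make the underlying error visible in CloudWatch:

def convert_word_to_pdf_verbose(soffice_path, word_file_path, output_dir):
    # Same conversion command as above, but log LibreOffice's stdout/stderr on failure
    # so the reason for a failed conversion shows up in the function's logs.
    conv_cmd = (f"{soffice_path} --headless --norestore --invisible --nodefault "
                f"--nofirststartwizard --nolockcheck --nologo "
                f"--convert-to pdf:writer_pdf_Export --outdir {output_dir} {word_file_path}")
    response = subprocess.run(conv_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if response.returncode != 0:
        print("soffice stdout:", response.stdout.decode(errors="replace"))
        print("soffice stderr:", response.stderr.decode(errors="replace"))
        return False
    return True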

Best way to check whether a PDF file is corrupt using Python

I am trying to check whether PDF files are corrupted in a Windows environment, and I came up with the following Python code.
I just want to check: is this the best way to detect corrupted PDF files, or is there an easier way?
Note: C:\Temp\python\sample-map (1).pdf is the corrupted PDF file.
Here is the sample code:
import os
import subprocess
import re
from subprocess import Popen, PIPE


def checkFile(fullfile):
    proc = subprocess.Popen(["file", "-b", fullfile], shell=True, stdout=PIPE, stderr=PIPE, bufsize=0)
    # -b, --brief : do not prepend filenames to output lines
    out, err = proc.communicate()
    exitcode = proc.returncode
    return exitcode, out, err


def searchFiles(dirpath):
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)
    if os.access(dirpath, os.R_OK):
        print("Path %s validation OK \n" % dirpath)
        listfiles = os.listdir(dirpath)
        for files in listfiles:
            fullfile = os.path.join(dirpath, files)
            if os.access(fullfile, os.R_OK):
                code, out, error = checkFile(fullfile)
                if str(code) != "0" or str(error, "utf-8") != "" or re.search(r"^(?!PDF(\s)).*", str(out, 'utf-8')):
                    print("ERROR " + fullfile + "\n################")
                else:
                    print("OK " + fullfile + "\n################")
            else:
                print("%s : File not readable" % fullfile)
    else:
        print("Path is not valid")


if __name__ == "__main__":
    searchFiles('C:\Temp\python')
sample output :
$ "C:/Program Files (x86)/Python37-32/python.exe" c:/Users/myuser/python/check_pdf_file.py
running path : c:\Users\myuser\python
Path C:\Temp\python validation OK
OK C:\Temp\python\Induction Guide.pdf
################
ERROR C:\Temp\python\sample-map (1).pdf
################
OK C:\Temp\python\sample-map.pdf
################
I think you can use the PyPDF2 module.
pip install pypdf2
The code is as follows.
from PyPDF2 import PdfFileReader
import os


def checkFile(fullfile):
    with open(fullfile, 'rb') as f:
        try:
            pdf = PdfFileReader(f)
            info = pdf.getDocumentInfo()
            if info:
                return True
            else:
                return False
        except:
            return False


def searchFiles(dirpath):
    pwdpath = os.path.dirname(os.path.realpath(__file__))
    print("running path : %s" % pwdpath)
    if os.access(dirpath, os.R_OK):
        print("Path %s validation OK \n" % dirpath)
        listfiles = os.listdir(dirpath)
        for f in listfiles:
            fullfile = os.path.join(dirpath, f)
            if checkFile(fullfile):
                print("OK " + fullfile + "\n################")
            else:
                print("ERROR " + fullfile + "\n################")
    else:
        print("Path is not valid")


if __name__ == "__main__":
    searchFiles('C:\Temp\python')
I tried to match your coding style.
I think this code can also be used on macOS or Linux.
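One caveat, offered as a hedged suggestion: getDocumentInfo() only reads the metadata dictionary, so a file whose metadata is intact but whose page tree is damaged can still pass. Walking the pages is a somewhat stricter check with the same legacy PyPDF2 API used above (the checkFilePages name is just illustrative):

from PyPDF2 import PdfFileReader

def checkFilePages(fullfile):
    # Stricter variant: besides opening the file, touch every page object.
    try:
        with open(fullfile, 'rb') as f:
            pdf = PdfFileReader(f)
            for page_number in range(pdf.getNumPages()):
                pdf.getPage(page_number)  # raises if the page tree is broken
        return True
    except Exception:
        return False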

Reading multiple files with multiprocessing

I have a simple function that scans files for a special string, but since these files are on slow remote file storage, I need to scan them in parallel.
I guess I need to use multiprocessing, but I am not sure how to do that correctly.
Here is my function:
from fnmatch import fnmatch
import os
from shutil import copy
from pprint import pprint


def getFailedFile(directory_name, folder_to_write):
    for file in os.listdir(directory_name):
        if fnmatch(file, '*Response.txt'):
            filename = directory_name + file
            try:
                with open(filename, 'r', encoding='utf-8') as myfile:
                    data = myfile.read()
                    if data.find('Exception') != -1:
                        try:
                            requestFile = directory_name + file.replace('Response', 'Request')
                            copy(requestFile, os.getcwd() + folder_to_write)
                        except FileNotFoundError:
                            print('no such file - ', requestFile)
            except UnicodeDecodeError:
                print('error unicode decode -', filename)


directory_name = 'some folder'
folder_to_write = 'some folder_to_write'
getFailedFile(directory_name=directory_name, folder_to_write=folder_to_write)
Please help. Currently it takes about 4 hours because of the number of files in that folder.
I finally figured out how to do it:
from fnmatch import fnmatch
import os
from shutil import copy
from multiprocessing import Pool
import time
import logging

# Without basic configuration the root logger stays at WARNING and the
# logging.info() calls below would be silently dropped.
logging.basicConfig(level=logging.INFO)


def process_file(file):
    directory_name = 'directory with files'
    if fnmatch(file, '*Response.txt'):
        filename = directory_name + file
        try:
            with open(filename, 'r', encoding='utf-8') as myfile:
                data = myfile.read()
                if data.find('xception') != -1:
                    try:
                        requestFile = directory_name + file.replace('Response', 'Request')
                        responseFile = directory_name + file
                        try:
                            copy(requestFile, 'directory to write')
                            copy(responseFile, 'directory to write')
                        except Exception as e:
                            logging.info(str(e) + '\n')
                            print(str(e))
                    except FileNotFoundError:
                        print('no such file - ', requestFile)
                        logging.info('no such file - ' + str(requestFile) + '\n')
        except UnicodeDecodeError:
            print('error unicode decode -', filename)
            logging.info('error unicode decode -' + str(filename) + '\n')


if __name__ == '__main__':
    try:
        directory_name = 'directory with files'
        number_of_processes = 50
        logging.info('\n' + 'Number of processes - ' + str(number_of_processes))
        logging.info('Directory to scan ' + directory_name)
        pool = Pool(number_of_processes)
        start_time = time.time()
        pool.map(process_file, os.listdir(directory_name))
        pool.close()
        elapsed_time = time.time() - start_time
        logging.info('Elapsed time - ' + str(elapsed_time / 60) + '\n')
    except Exception as e:
        logging.info(str(e) + '\n')
I know the code is not pretty, but it now runs in about 27 minutes instead of the previous 4 hours.
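As a possible cleanup, sketched under the assumption that the paths are the same as above: the directories can be passed in with functools.partial instead of being hard-coded inside process_file, and a context-managed Pool closes itself even when an error occurs. Since the work is dominated by slow remote I/O rather than CPU, a thread pool (multiprocessing.pool.ThreadPool) would likely work just as well. The body of process_file is elided here; only the wiring differs from the answer above.

from functools import partial
from multiprocessing import Pool  # or: from multiprocessing.pool import ThreadPool
import os


def process_file(file, directory_name, folder_to_write):
    # same body as the process_file above, but the paths arrive as arguments
    ...


if __name__ == '__main__':
    directory_name = 'directory with files'
    folder_to_write = 'directory to write'
    worker = partial(process_file, directory_name=directory_name, folder_to_write=folder_to_write)
    with Pool(50) as pool:  # the pool is closed automatically on exit
        pool.map(worker, os.listdir(directory_name))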

Consume Flask video streaming with ffmpeg

I've created a Python Flask video file streaming resource that works well when the file is fetched in a web browser, but it does not work when using ffmpeg.
The Flask resource is:
import os
from flask import Response
from flask_restful import Resource, Api


class Mp4(Resource):
    def get(self, grtv, data, canal, arquivo):
        path = os.path.abspath('../mp4/' + str(canal) + '/' + str(data) + '/' + str(arquivo))
        print(path)

        def generate():
            with open(path, "rb") as f:
                chunk = f.read(1024)
                while chunk:
                    yield chunk
                    chunk = f.read(1024)

        try:
            t = os.stat(path)
            size = str(t.st_size)
            return Response(generate(), mimetype='video/mp4',
                            headers={"Content-Type": "video/mp4",
                                     "Content-Disposition": "inline",
                                     "Content-Transfer-Encoding": "binary",
                                     "Content-Length": size})
        except Exception as e:
            result = {'result': str(e)}
            return result
The ffmpeg command is:
ffmpeg -loglevel debug -i <url> out_teste4.mp4
where url is the address of the streamed video.
The most important ffmpeg output is in the two attached screenshots (ffmpeg output screenshots 1 and 2, not reproduced here).
I've already tried increasing the 'analyzeduration' and 'probesize' options.
ffmpeg version: 3.4.2-1~16.04.york0.2.
python version: 3.5.2.
Can anyone help me fetch the video files with ffmpeg? Suggestions on changing either the API or the ffmpeg command are welcome.
Thank you all!
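One hedged observation about the resource above, offered as a sketch rather than a confirmed fix: the generator response ignores HTTP Range requests, and ffmpeg often needs to seek inside an MP4 (for example when the moov atom sits at the end of the file), so a stream that can only be read front to back may fail to probe. A minimal option to try, assuming the same path construction as above, is Flask's send_file with conditional=True, which lets Werkzeug answer Range requests:

import os
from flask import send_file
from flask_restful import Resource


class Mp4(Resource):
    def get(self, grtv, data, canal, arquivo):
        path = os.path.abspath('../mp4/' + str(canal) + '/' + str(data) + '/' + str(arquivo))
        # conditional=True makes Flask/Werkzeug honour Range headers, so clients such as
        # ffmpeg can seek within the file instead of reading it strictly sequentially.
        return send_file(path, mimetype='video/mp4', conditional=True)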
