Trying to download ~200 files using ThreadPoolExecutor -> some files are skipped - multithreading

I need to download ~200 files. If I run the code below for the first time, it downloads ~100 files. Then I need to run it a few more times to download the rest (i.e. the 2nd run gets another 20-30 new files, the 3rd run another 20-30, and so on). Why does this happen, and how can I fix it? Maybe this is important: the server may take up to 10 seconds to generate some files.
import os
import concurrent.futures
import multiprocessing  # needed for multiprocessing.cpu_count() below
import urllib.request
import shutil

def get_cities(osm_id, file_name, place_type):
    file_folder = os.path.join(os.getcwd(), place_type)
    file_name = file_name + '_' + place_type
    file_path = os.path.join(file_folder, file_name)
    if not os.path.exists(file_folder):
        os.makedirs(file_folder)
    if not os.path.exists(file_path):
        area_id = str(3600000000 + osm_id)
        url = 'http://overpass-api.de/api/interpreter?data=(node["place"="city"](area:' + area_id + '););out;'
        with urllib.request.urlopen(url) as response, open(file_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)

def cities_controller(place_type='cities'):
    # Countries, Languages and CountriesTranslations are Django models from the project
    countries = Countries.objects.filter(status=1).exclude(osm_id=None)
    en_group_inst = Languages.objects.filter(iso_code='en').first().group
    with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        for country in countries:
            osm_id = country.osm_id
            file_name = CountriesTranslations.objects.get(
                country=country, lang_group=en_group_inst).common_name.lower().replace(' ', '_')
            executor.submit(get_cities, osm_id, file_name, place_type)

cities_controller()
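For context on how ThreadPoolExecutor behaves here: executor.submit() returns a Future, and any exception raised inside get_cities is stored on that Future instead of being printed, so failed or slow downloads can go unnoticed. Below is a minimal diagnostic sketch (not the original code; the jobs list of (osm_id, file_name) pairs is a hypothetical stand-in for the Django query results) that collects the futures and calls result() so errors become visible:
import concurrent.futures

def run_jobs(jobs, place_type='cities'):
    # jobs: hypothetical list of (osm_id, file_name) pairs
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(get_cities, osm_id, file_name, place_type): file_name
                   for osm_id, file_name in jobs}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # re-raises any exception raised inside get_cities
            except Exception as exc:
                print(f"download failed for {futures[future]}: {exc}")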

Related

How can I make web resources available offline?

There is a folder on my PC (running Linux) which contains a website (web pages etc.). The web pages and other accompanying files in the folder use CDNs to load resources like jQuery, DataTables, etc.
I want to make these resources available offline. I know I can manually search all files for occurrences of "http", download the files from those URLs, keep them in a folder and change the source paths accordingly. But as there are too many files this seems troublesome. I want to ask whether there is a better and more elegant way of doing this. Thanks in advance.
I made a Python script to do the job:
import re
import os
import aiohttp
import asyncio
import pathlib
import string
import random
import chardet

# Decode a byte sequence using chardet to avoid decode/type errors
def decode_bytes(byte_sequence):
    result = chardet.detect(byte_sequence)
    encoding = result['encoding'] or 'utf-8'
    return byte_sequence.decode(encoding)

VALID_URL_REGEX = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Downloader. I lazily used resp.status as the success criterion; it has logical
# issues, so you can also include other checks.
async def download_file(session, url, local_path):
    async with session.get(url, allow_redirects=True, ssl=False) as resp:
        if resp.status == 200:
            print("Content path is " + str(local_path))
            with open(local_path, "wb") as f:
                while True:
                    chunk = await resp.content.read(4196)
                    if not chunk:
                        break
                    f.write(chunk)  # chunk is already bytes, no re-encoding needed

# Map of already downloaded URLs to their local paths, to avoid redownloading
downloaded_urls = {}

async def process_file(file_path, session):
    print("File during read " + str(file_path))
    with open(file_path, "rb") as f:
        raw = f.read()
    try:
        contents = decode_bytes(raw)
    except (UnicodeDecodeError, TypeError) as e:
        print(f"Error decoding file {file_path}: {e}")
        return
    urls = re.findall(VALID_URL_REGEX, contents)
    try:
        for url in urls:
            file_name = url.split("/")[-1]
            if len(file_name) == 0:
                continue
            if url in downloaded_urls:
                # Already downloaded: reuse the existing local path
                local_path = downloaded_urls[url]
            else:
                # Random prefix to avoid the same file name coming from different URLs
                res = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
                file_name = res + file_name
                local_path = os.path.join("downloaded", file_name)
                if not os.path.exists(local_path):
                    await download_file(session, url, local_path)
                downloaded_urls[url] = local_path
            contents = contents.replace(url, local_path)
    except Exception:
        pass
    print("File during write " + str(file_path))
    with open(file_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(contents)

async def process_directory(directory):
    if not os.path.exists("downloaded"):
        os.makedirs("downloaded")
    conn = aiohttp.TCPConnector(limit=2200, limit_per_host=20, ttl_dns_cache=22)
    async with aiohttp.ClientSession(connector=conn) as session:
        tasks = []
        try:
            for filepath in pathlib.Path(directory).glob('**/*'):
                fp = filepath.absolute()
                if str(fp).endswith(".md") or str(fp).endswith(".txt"):
                    continue
                if os.path.isfile(fp):
                    tasks.append(process_file(fp, session))
        except Exception:
            pass
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    directory = input("Enter root directory: ")
    asyncio.run(process_directory(directory))
I will also try the "substitution" module and update the answer accordingly.

How to set a destination for shutil.copyfileobj?

This code saves a Discord image to the folder the script is in. I tried to set a destination for the saved file, but I haven't found anything on the shutil website that sets the destination. I tried to put a destination in the shutil.copyfileobj brackets, but that didn't work. Also, I am relatively new to coding.
This is the code:
import uuid
import requests
import shutil
from discord.ext import commands

class filesaver:
    #bot.command()
    async def save(ctx):
        try:
            url = ctx.message.attachments[0].url
        except IndexError:
            print("Error: No Attachments")
            await ctx.send("No Attachments detected!")
        else:
            if url[0:26] == "https://cdn.discordapp.com":
                r = requests.get(url, stream=True)
                imageName = str(uuid.uuid4()) + '.jpg'
                with open(imageName, 'wb') as out_file:
                    print('saving image: ' + imageName)
                    shutil.copyfileobj(r.raw, out_file)
                await ctx.send(f"text")
Your imageName doesn't contain a path, so it opens in whatever is your current working directory. That's a bit unpredictable. It's also easy to fix.
from pathlib import Path
imageName = str(Path.home() / Path(str(uuid.uuid4()) + '.jpg'))
You can of course replace Path.home() with any destination path you'd prefer.
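For example, here is a small sketch of writing into a dedicated subfolder, creating it if needed (the "saved_images" folder name is just an assumption for illustration):
from pathlib import Path
import uuid

dest_dir = Path.home() / "saved_images"          # hypothetical destination folder; any path works
dest_dir.mkdir(parents=True, exist_ok=True)      # create the folder if it doesn't exist
imageName = str(dest_dir / (str(uuid.uuid4()) + '.jpg'))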

How to find where an S3 multipart upload is failing in Python?

I am implementing a cron job that will upload a large daily backup file to an S3 bucket. It works most of the time, but every once in a while I will check the bucket and the file size is significantly smaller than the actual size.
It should be roughly 50GB, but the last time it happened, it showed 34GB. My main problem is that I am unsure of what error to try/catch.
I am still learning Python as I go, so bear with me.
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig

bucket = "my-s3-backup"
s3 = boto3.resource('s3')

# Grabbing the last file, and removing the full path from the string
pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

# Multipart upload function
def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload))
        # Custom Slack notification to inform completion
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        # Custom Slack notification to inform of failure
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))

if __name__ == '__main__':
    multi_part_upload()
If the script is not "failing," but it's not uploading the complete file size, what exception am I trying to catch here? Should I log output somewhere?
I'm looking through the Botocore Exceptions documentation. I'm just unsure of what to try/catch with this.
For reference, here is the file size difference:
aws s3 ls --summarize --human-readable --recursive s3://my-s3-backup/physical_db_backups/
2022-05-07 14:31:28 50.7 GiB physical_db_backups/xb_202205070101.xb.zst
2022-05-08 12:48:07 50.8 GiB physical_db_backups/xb_202205080101.xb.zst
2022-05-09 01:30:04 34.2 GiB physical_db_backups/xb_202205090101.xb.zst <--- WRONG
Alright, since I was an idiot and didn't realize the backup file had not finished being written yet, I made a couple of changes.
I edited the cron to start later.
I have created logic to determine if the backup shell script is still running.
I may incorporate additional checks to make sure the file exists, but for now this is a working POC that has been tested (a possible size-verification sketch follows the code below).
from progress import ProgressPercentage  # class file progress.py
from slack import *  # function file for Slack notifications
import random
from time import sleep
import psutil
import glob
import os
import boto3
import botocore
from boto3.s3.transfer import TransferConfig
import logging

bucket = "fsn-s3-backup"
s3 = boto3.resource('s3')

pattern = "/path/to/backup/file/xb_*"
files = list(filter(os.path.isfile, glob.glob(pattern)))
files.sort(key=lambda x: os.path.getmtime(x))
file_to_upload = files[-1]
file_name = file_to_upload.replace('/path/to/backup/file/', '')
key_path = 'physical_db_backups/' + file_name

logging.basicConfig(filename='/var/log/s3-backup.log',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p', filemode='a')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def multi_part_upload():
    config = TransferConfig(multipart_threshold=1024 * 25,
                            max_concurrency=10,
                            multipart_chunksize=1024 * 25,
                            use_threads=True)
    try:
        s3.meta.client.upload_file(file_to_upload, bucket, key_path, Config=config,
                                   Callback=ProgressPercentage(file_to_upload),
                                   ExtraArgs={'ContentType': 'application/zstd'})
        logger.info("Physical Backup to S3 Complete")
        sendslacksuccess("Physical Backup to S3 Complete:\n" + file_name)
    except botocore.exceptions.ClientError as error:
        logger.error("Physical Backup to S3 Failed: " + str(error))
        sendslackerror("Physical Backup to S3 Failed:\n" + file_name + "\nError: " + str(error))

def checkIfProcessRunning(processName):
    for proc in psutil.process_iter():
        cmdline = proc.cmdline()
        if processName in cmdline:
            return True
    return False

if __name__ == '__main__':
    backuprunning = True
    while backuprunning:
        logger.info("Checking if backup shell script is running")
        if checkIfProcessRunning('/path/to/physical_backup.sh'):
            logger.info("Backup shell script still running. Sleeping for 60s")
            sleep(60)
        else:
            backuprunning = False
            logger.info("Beginning multipart upload")
            multi_part_upload()
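As a possible follow-up to the "additional checks" mentioned above, here is a minimal sketch (not part of the tested POC) that compares the local file size with what S3 reports for the uploaded object; it assumes the same bucket, key_path, file_to_upload and logger defined above:
def verify_upload_size():
    # Compare the local file size with the size S3 reports after the upload
    local_size = os.path.getsize(file_to_upload)
    head = s3.meta.client.head_object(Bucket=bucket, Key=key_path)
    remote_size = head['ContentLength']
    if local_size != remote_size:
        logger.error("Size mismatch: local %d bytes vs S3 %d bytes", local_size, remote_size)
    else:
        logger.info("Upload verified: %d bytes", local_size)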

Dynamically create file folder based on line 1 in csv

This script downloads images, renames them based on line[0], adds a number to the end of the file name, and saves them to a folder. My goal here is to create the folder name from line[0] of my CSV file. I'm new to Python and need to download/sort 15,000+ images. Any help would be appreciated! Using Python 3.8.6.
Note: one model may have many images, so the idea is to create the folder, place the images for that model inside, move on to the next model, etc.
CSV file content
RHV3-484-L,https://www.fisherpaykel.com/on/demandware.static/-/Sites-fpa-master-catalog/default/dw0c85e188/product-mugs/cooking/ranges/mug/retail/RHV3-484-N-RHV3-484-L-external-mug-rs-84.png
RHV3-484-L,https://www.fisherpaykel.com/on/demandware.static/-/Sites-fpa-master-catalog/default/dwcbd711e5/inspire/caitlin-wilson-portland-dk-339-rs-84.png
RHV3-484-L,https://www.fisherpaykel.com/on/demandware.static/-/Sites-fpa-master-catalog/default/dw3702e52a/inspire/caitlin-wilson-portland-dk-385-rs-84.jpg
RHV3-484-L,https://www.fisherpaykel.com/on/demandware.static/-/Sites-fpa-master-catalog/default/dw0c85e188/product-mugs/cooking/ranges/mug/retail/RHV3-484-N-RHV3-484-L-external-mug-rs-84.png
RHV3-484-L,https://www.fisherpaykel.com/on/demandware.static/-/Sites-fpa-master-catalog/default/dwf99a5a9d/inspire/david-berridge-project-brooklyn-mw-6457-rs-84.jpg
Python script
import sys
import urllib
import urllib.request
from csv import reader
import os.path
import os

csv_filename = "images"
with open(csv_filename + ".csv".format(csv_filename), 'r') as csv_file:
    n = 1
    for line in reader(csv_file):
        if not os.path.exists("ImageID"):
            os.makedirs("ImageID")
            print("Image skipped for {0}".format(line[0]))
        else:
            if line[1] != '' and line[0] != "ImageID":
                urllib.request.urlretrieve(line[1], "ImageID/" + line[0] + "-" + str(n) + ".jpg")
                n += 1
                print("Image saved for {0}".format(line[0]))
            else:
                print("No result for {0}".format(line[0]))
This seems to work as desired...
There are a couple of comments in the middle of the code. Notably, you need to respect whether the file is .jpg or .png. If you have file extensions longer than three characters you may need to split out the file name and then split it on "." (a sketch of that follows the code below).
Good luck!
import sys
import urllib
import urllib.request
from csv import reader
import os.path
import os

csv_filename = "images.csv"
with open(csv_filename, 'r') as csv_file:
    n = 1  # starting point
    for line in reader(csv_file):
        tgt_folder = line[0]
        if not os.path.exists(tgt_folder):
            os.makedirs(tgt_folder)
            n = 1  # restart n if you find a NEW folder
        # there should be no "else" clause here. Just test the folder name above, but don't waste a file
        if line[1] != '' and line[0] != "ImageID":  # not clear what this is for... ??
            filename = ''.join([line[0], '-', str(n), line[1][-4:]])
            destination = os.path.join(tgt_folder, filename)
            urllib.request.urlretrieve(line[1], destination)
            n += 1
            print("Image saved for {0}".format(line[1]))
        else:
            print("No result for {0}".format(line[1]))

Falcon and falcon-multipart + POST request for uploading files implementation

I'm trying to implement a POST request for uploading files with the Falcon framework (Python).
I have used falcon-multipart to handle multipart/form-data; this allows me to retrieve my file as a cgi.FieldStorage(), in which the file is in binary format, but now I need to write this file to a directory with its original extension.
This is the code I'm using.
app.py:
import falcon
from .files import Resource
from falcon_multipart.middleware import MultipartMiddleware
api = application = falcon.API(middleware=[MultipartMiddleware()])
files = Resource()
api.add_route('/files', files)
files.py:
import io
import os
import shutil
import falcon
import json

class Resource(object):
    _storage_path = './uploaded_files'

    def on_post(self, req, resp):
        """
        POST METHOD
        """
        # Retrieve file extension
        ext = req.get_param('extension')
        # Retrieve input_file
        input_file = req.get_param('file')
        # Read file as binary
        raw = input_file.file.read()
        # Retrieve filename
        filename = input_file.filename
        # Define file_path
        file_path = os.path.join(self._storage_path, filename)
        # Write to a temporary file to prevent incomplete files from
        # being used.
        temp_file_path = file_path + '~'
        # Finally write the data to a temporary file
        with open(temp_file_path, 'wb') as output_file:
            shutil.copyfileobj(raw, output_file)
        # Now that we know the file has been fully saved to disk
        # move it into place.
        os.rename(temp_file_path, file_path)
        resp.status = falcon.HTTP_201
I had to study cgi
cgi - File upload
cgi - Big file upload
This is the implementation I used:
def on_post(self, req, resp):
    """
    POST METHOD
    """
    # Retrieve input_file
    input_file = req.get_param('file')
    # Test if the file was uploaded
    if input_file.filename:
        # Retrieve filename
        filename = input_file.filename
        # Define file_path
        file_path = os.path.join(self._storage_path, filename)
        # Write to a temporary file to prevent incomplete files
        # from being used.
        temp_file_path = file_path + '~'
        open(temp_file_path, 'wb').write(input_file.file.read())
        # Now that we know the file has been fully saved to disk
        # move it into place.
        os.rename(temp_file_path, file_path)
        resp.status = falcon.HTTP_201
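For completeness, here is a small sketch of how such an endpoint could be exercised with the requests library; the localhost URL and port are just assumptions for wherever the Falcon app is served:
import requests

with open("example.png", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/files",                      # hypothetical local URL
        files={"file": ("example.png", f, "image/png")},    # field name matches req.get_param('file')
    )
print(resp.status_code)  # expect 201 on success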
Try this - more detail explained here
import io
import os
import uuid
import mimetypes
import falcon
import json

class Resource(object):
    _CHUNK_SIZE_BYTES = 4096

    def __init__(self, storage_path):
        self._storage_path = storage_path

    def on_post(self, req, resp):
        image = req.get_param("profilePic")
        # image_type = req.get_param("profilePic").type
        ext = mimetypes.guess_extension(req.content_type)
        filename = "{uuid}{ext}".format(uuid=uuid.uuid4(), ext=ext)
        image_path = os.path.join(self._storage_path, filename)
        with open(image_path, "wb") as image_file:
            while True:
                chunk = image.file.read(self._CHUNK_SIZE_BYTES)
                if not chunk:
                    break
                image_file.write(chunk)
        resp.status = falcon.HTTP_200
        resp.location = filename
        resp.body = json.dumps({"name": image_path})

import falcon
from falcon_multipart.middleware import MultipartMiddleware

api = application = falcon.API(middleware=[MultipartMiddleware()])
images = Resource('images')
api.add_route('/images', images)
