Falcon and falcon-multipart + POST request for uploading files implementation - python-3.x

I'm trying to implement a POST request for uploading files with the Falcon framework (Python).
I have used falcon-multipart in order to handle multipart/form-data; this allows me to retrieve my file as a cgi.FieldStorage(), in which the file content is in binary format. Now I need to write this file to a directory with its original extension.
This is the code I'm using.
app.py:
import falcon
from .files import Resource
from falcon_multipart.middleware import MultipartMiddleware
api = application = falcon.API(middleware=[MultipartMiddleware()])
files = Resource()
api.add_route('/files', files)
files.py:
import io
import os
import shutil
import falcon
import json


class Resource(object):

    _storage_path = './uploaded_files'

    def on_post(self, req, resp):
        """
        POST METHOD
        """
        # Retrieve file extension
        ext = req.get_param('extension')
        # Retrieve input_file
        input_file = req.get_param('file')
        # Read file as binary
        raw = input_file.file.read()
        # Retrieve filename
        filename = input_file.filename
        # Define file_path
        file_path = os.path.join(self._storage_path, filename)
        # Write to a temporary file to prevent incomplete files from
        # being used.
        temp_file_path = file_path + '~'
        # Finally write the data to a temporary file
        with open(temp_file_path, 'wb') as output_file:
            shutil.copyfileobj(raw, output_file)
        # Now that we know the file has been fully saved to disk
        # move it into place.
        os.rename(temp_file_path, file_path)
        resp.status = falcon.HTTP_201

I had to study the cgi module:
cgi - File upload
cgi - Big file upload
This is the implementation I used:
def on_post(self, req, resp):
    """
    POST METHOD
    """
    # Retrieve input_file
    input_file = req.get_param('file')
    # Test if the file was uploaded
    if input_file.filename:
        # Retrieve filename
        filename = input_file.filename
        # Define file_path
        file_path = os.path.join(self._storage_path, filename)
        # Write to a temporary file to prevent incomplete files
        # from being used.
        temp_file_path = file_path + '~'
        open(temp_file_path, 'wb').write(input_file.file.read())
        # Now that we know the file has been fully saved to disk
        # move it into place.
        os.rename(temp_file_path, file_path)
    resp.status = falcon.HTTP_201
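For reference, a minimal client-side sketch of how such a multipart request could be sent (the field names file and extension match what the handler reads with req.get_param; the host/port and the sample file name are assumptions):
import requests

# Hypothetical local address for the Falcon app.
url = 'http://localhost:8000/files'

with open('photo.jpg', 'rb') as f:
    # 'files=' makes requests encode the body as multipart/form-data,
    # which is what MultipartMiddleware expects; extra form fields go in 'data='.
    response = requests.post(url,
                             files={'file': ('photo.jpg', f)},
                             data={'extension': 'jpg'})

print(response.status_code)  # 201 on success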

Try this - more detail is explained here:
import io
import os
import uuid
import mimetypes
import falcon
import json


class Resource(object):

    _CHUNK_SIZE_BYTES = 4096

    def __init__(self, storage_path):
        self._storage_path = storage_path

    def on_post(self, req, resp):
        image = req.get_param("profilePic")
        # Use the uploaded part's own content type to pick an extension
        # (req.content_type would be multipart/form-data for the whole request).
        ext = mimetypes.guess_extension(image.type)
        filename = "{uuid}{ext}".format(uuid=uuid.uuid4(), ext=ext)
        image_path = os.path.join(self._storage_path, filename)
        with open(image_path, "wb") as image_file:
            while True:
                chunk = image.file.read(self._CHUNK_SIZE_BYTES)
                image_file.write(chunk)
                if not chunk:
                    break
        resp.status = falcon.HTTP_200
        resp.location = filename
        resp.body = json.dumps({"name": image_path})


# Application wiring (e.g. in a separate app module):
import falcon
from falcon_multipart.middleware import MultipartMiddleware

api = application = falcon.API(middleware=[MultipartMiddleware()])
images = Resource('images')
api.add_route('/images', images)
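And a short hedged sketch of exercising this endpoint from a client, including parsing the JSON body the handler returns (the /images route and profilePic field come from the code above; the host/port and file name are assumptions):
import requests

# Assumed local address for the Falcon app.
url = 'http://localhost:8000/images'

with open('avatar.png', 'rb') as f:
    resp = requests.post(url, files={'profilePic': ('avatar.png', f, 'image/png')})

print(resp.status_code)  # 200 per the handler above
print(resp.json())       # e.g. {'name': 'images/<uuid>.png'}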

Related

How can I make web resources available offline?

There is a folder on my PC (running Linux) that contains a website (web pages etc.). The web pages and other complementary files in the folder use CDNs to pull in resources like jQuery, DataTables etc.
I want to make these resources available offline. I know I could manually search all files for occurrences of "http", download the files from those URLs, keep them in a folder, and change the source paths accordingly, but with so many files that seems troublesome. Is there a better, more elegant way of doing this? Thanks in advance.
I made a python script to do the job:
import re
import os
import aiohttp
import asyncio
import pathlib
import string
import random
import chardet


# Decode a byte sequence using chardet, to avoid decode/type errors
def decode_bytes(byte_sequence):
    result = chardet.detect(byte_sequence)
    encoding = result['encoding']
    return byte_sequence.decode(encoding)


VALID_URL_REGEX = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


# Downloader. I have lazily used resp.status as the success criterion, which has
# logical issues; you can add other checks as well.
async def download_file(session, url, local_path):
    async with session.get(url, allow_redirects=True, ssl=False) as resp:
        if resp.status == 200:
            print("Content path is " + str(local_path))
            with open(local_path, "wb") as f:
                while True:
                    chunk = await resp.content.read(4096)
                    if not chunk:
                        break
                    f.write(chunk)


# Map of already-downloaded URLs to their local paths, to avoid re-downloading.
downloaded_urls = {}


async def process_file(file_path, session):
    print("File during read " + str(file_path))
    with open(file_path, "rb") as f:
        raw = f.read()
    try:
        contents = decode_bytes(raw)
    except (UnicodeDecodeError, TypeError) as e:
        print(f"Error decoding file {file_path}: {e}")
        return
    urls = re.findall(VALID_URL_REGEX, contents)
    try:
        for url in urls:
            file_name = url.split("/")[-1]
            if len(file_name) == 0:
                continue
            if url in downloaded_urls:
                # Reuse the local copy we already downloaded for this URL.
                local_path = downloaded_urls[url]
            else:
                # Prepend a random string to avoid clashes between different
                # URLs that end in the same file name.
                res = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
                file_name = res + file_name
                local_path = os.path.join("downloaded", file_name)
                if not os.path.exists(local_path):
                    await download_file(session, url, local_path)
                downloaded_urls[url] = local_path
            contents = contents.replace(url, local_path)
    except Exception:
        pass
    print("File during write " + str(file_path))
    with open(file_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(contents)


async def process_directory(directory):
    if not os.path.exists("downloaded"):
        os.makedirs("downloaded")
    conn = aiohttp.TCPConnector(limit=2200, limit_per_host=20, ttl_dns_cache=22)
    async with aiohttp.ClientSession(connector=conn) as session:
        tasks = []
        try:
            for filepath in pathlib.Path(directory).glob('**/*'):
                fp = filepath.absolute()
                if str(fp).endswith(".md") or str(fp).endswith(".txt"):
                    continue
                if os.path.isfile(fp):
                    tasks.append(process_file(fp, session))
        except Exception:
            pass
        await asyncio.gather(*tasks)


if __name__ == '__main__':
    directory = input("Enter root directory: ")
    asyncio.run(process_directory(directory))
I will also try the "substitution" module and update the answer accordingly.

Reading file from a different project gcloud -file not found error even though the file exists

I am trying to read a JSON file from project B in Google Cloud using a service account from project A. The service account in project A is granted a read role in project B. But when I try to open the JSON file I get:
"FileNotFoundError: [Errno 2] No such file or directory: 'gs://x.json'". The file x.json does exist.
I checked the list of privileges (storage.objects.get, storage.objects.list) needed to read a file, from https://cloud.google.com/storage/docs/access-control/iam-permissions.
Any help is appreciated. Thanks.
from google.cloud import bigquery
from analytics import Clients, ClientType
from datetime import datetime, date, timedelta
from pytz import timezone
from typing import List
from pyarrow import json as pyj
import pyarrow.parquet as pq
import newlinejson as nlj

bigquery_client = Clients.get_client(ClientType.STORAGE, name='w')
write_client = Clients.get_client(ClientType.BIGQUERY, name='w')
k_client = Clients.get_client(ClientType.BIGQUERY, name='w')

bucket = 'update'
file_name_prefix = "al_"
target_table = k_client.get_table("w.junk.json_table1")


def get_dates() -> List[str]:
    """
    Return dates for which log files have to be checked
    """
    end = date.fromisoformat(str(datetime.date(datetime.now(timezone("EST")))))
    return [str(end - timedelta(days=1)), str(end)]


def get_bucket_files(bucket, file_name_prefix):
    # if full_path:
    path = "gs://{}/{}"
    # path = "https://storage.googleapis.com/{}/{}"
    return [
        path.format(bucket, b.name)
        for b in bigquery_client.list_blobs(bucket, prefix=file_name_prefix)
    ]


def get_latest_file() -> str:
    """
    Get all files for the current prefix between start and end date
    """
    files = []
    files_json = []
    for d in get_dates():
        prefix = file_name_prefix + d[4:] + "-" + d[:4]
        files += get_bucket_files(bucket, file_name_prefix)
    for k in files:
        filename = k.split('/')[-1]
        if 'json' in filename:
            files_json.append(k)
    return max(files_json)


def pipeline():
    job_config = bigquery.LoadJobConfig(
        # schema=[
        #     bigquery.SchemaField("name", "STRING")
        # ],
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )
    f = get_latest_file()
    print(f)
    table = pyj.read_json(f)
    # pq.write_table(target_table, table.parquet)
    # with nlj.open(f) as src:
    #     with nlj.open('out.json', 'w') as dst:
    #         for line in src:
    #             dst.write(line)
    # k_client.load_table_from_uri(
    #     f, target_table, job_config=job_config
    # ).result()


pipeline()
You may consider and try the approach below for listing objects in buckets:
from google.cloud import storage

my_bucket = 'your-bucket-name'
my_prefix = 'al_'

client = storage.Client()


def get_bucket_files(bucket, file_name_prefix):
    for blob in client.list_blobs(bucket, prefix=file_name_prefix):
        print(str(blob))


get_bucket_files(my_bucket, my_prefix)
Output: one line per matching blob, showing each Blob object's string representation.
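If the underlying problem is that pyarrow.json.read_json cannot open a gs:// URI directly, one workaround is to download the blob with the storage client first and then parse it locally. A minimal sketch, assuming a storage.Client with read access to the bucket and hypothetical bucket/blob names:
import tempfile
from google.cloud import storage
from pyarrow import json as pyj

client = storage.Client()  # assumes credentials with read access to the other project's bucket


def read_json_blob(bucket_name, blob_name):
    # Download the object to a local temporary file, then parse it with pyarrow.
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    with tempfile.NamedTemporaryFile(suffix=".json") as tmp:
        blob.download_to_filename(tmp.name)
        return pyj.read_json(tmp.name)


# Hypothetical names, for illustration only.
table = read_json_blob('update', 'al_2021-01-01.json')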

How to parse an eml file and extract meta-data information

I have an eml file with some attachments. I want to read the text content of the eml file, and I want to extract meta-data information like (sender, from, cc, bcc, subject). I also want to download the attachments. With the code below I am only able to extract the text content of the body of the email.
import email
from email import policy
from email.parser import BytesParser
import glob

file_list = glob.glob('*.eml')  # returns list of files

with open(file_list[2], 'rb') as fp:  # select a specific email file from the list
    msg = BytesParser(policy=policy.default).parse(fp)
text = msg.get_body(preferencelist=('plain')).get_content()
print(text)
There was a module named emaildata, available for Python 2, which did the job.
Extracting MetaData Informations
import email
from emaildata.metadata import MetaData
message = email.message_from_file(open('message.eml'))
extractor = MetaData(message)
data = extractor.to_dict()
print data.keys()
Extracting Attachment Information
import email
from emaildata.attachment import Attachment

message = email.message_from_file(open('message.eml'))
for content, filename, mimetype, message in Attachment.extract(message):
    print filename
    with open(filename, 'w') as stream:
        stream.write(content)
    # If message is not None then it is an instance of email.message.Message
    if message:
        print "The file {0} is a message with attachments.".format(filename)
But this library is now deprecated and of no use. Is there any other library that can extract the meta-data and attachment-related information?
Meta-data information can be accessed using the code below in Python 3.x:
from email import policy
from email.parser import BytesParser

with open(eml_file, 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)
print('To:', msg['to'])
print('From:', msg['from'])
print('Subject:', msg['subject'])
The remaining header information can be accessed using msg.keys().
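For instance, a tiny sketch that dumps every header name/value pair (msg is the message parsed with BytesParser above):
# msg comes from the BytesParser snippet above.
for name, value in msg.items():
    print(f'{name}: {value}')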
For downloading attachments from an eml file you can use the code below:
import sys
import os
import os.path
from collections import defaultdict
from email.parser import Parser

eml_mail = 'your eml file'
output_dir = 'the directory where you want the files to be downloaded'


def parse_message(filename):
    with open(filename) as f:
        return Parser().parse(f)


def find_attachments(message):
    """
    Return a tuple of parsed content-disposition dict, message object
    for each attachment found.
    """
    found = []
    for part in message.walk():
        if 'content-disposition' not in part:
            continue
        cdisp = part['content-disposition'].split(';')
        cdisp = [x.strip() for x in cdisp]
        if cdisp[0].lower() != 'attachment':
            continue
        parsed = {}
        for kv in cdisp[1:]:
            key, val = kv.split('=')
            if val.startswith('"'):
                val = val.strip('"')
            elif val.startswith("'"):
                val = val.strip("'")
            parsed[key] = val
        found.append((parsed, part))
    return found


def run(eml_filename, output_dir):
    msg = parse_message(eml_filename)
    attachments = find_attachments(msg)
    print("Found {0} attachments...".format(len(attachments)))
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for cdisp, part in attachments:
        cdisp_filename = os.path.normpath(cdisp['filename'])
        # prevent malicious crap
        if os.path.isabs(cdisp_filename):
            cdisp_filename = os.path.basename(cdisp_filename)
        towrite = os.path.join(output_dir, cdisp_filename)
        print("Writing " + towrite)
        with open(towrite, 'wb') as fp:
            data = part.get_payload(decode=True)
            fp.write(data)


run(eml_mail, output_dir)
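As an alternative, with the Python 3 email API and policy.default (as used earlier in this answer), attachments can also be extracted without parsing Content-Disposition by hand. A minimal sketch, with hypothetical paths:
import os
from email import policy
from email.parser import BytesParser


def save_attachments(eml_path, output_dir):
    """Save every attachment of an .eml file into output_dir."""
    with open(eml_path, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
    os.makedirs(output_dir, exist_ok=True)
    for part in msg.iter_attachments():
        filename = part.get_filename()
        if not filename:
            continue
        # basename() guards against path components in the attachment name.
        target = os.path.join(output_dir, os.path.basename(filename))
        with open(target, 'wb') as out:
            out.write(part.get_payload(decode=True))


# Example usage with hypothetical paths.
save_attachments('message.eml', 'attachments')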
Have a look at ParsEML: it bulk-extracts attachments from all eml files in a directory (originally from Stephan Hügel). And I used a modified version of MeIOC to easily extract all metadata in JSON format; if you want, I can share that too.

How to write to text file in python?

I am a beginner in using Python. I have created a plain text file and have to encrypt it to an output file. But I am getting an error as below and am unable to write to the output file. The code runs, but the output file, which should be encrypted, is not created.
#!/usr/bin/env python3
import os
import binascii
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend
import argparse


def readfile_binary(file):
    with open(file, 'rb') as f:
        content = f.read()
    return content


def writefile_binary(file, content):
    with open(file, 'wb') as f:
        f.write(content)


def main():
    parser = argparse.ArgumentParser(description = 'Encryption and Decryption of the file')
    parser.add_argument('-in', dest = 'input', required = True)
    parser.add_argument('-out', dest = 'output', required = True)
    parser.add_argument('-K', dest = 'key', help = 'The key to be used for encryption must be in hex')
    parser.add_argument('-iv', dest = 'iv', help = 'The inintialisation vector, must be in hex')
    args = parser.parse_args()
    input_content = readfile_binary(args. input)
    output_content = writefile_binary(args. output)


if __name__ == "__main__":
    main()
The output file should be encrypted and it should be available in the directory.
These two lines:
input_content = readfile_binary(args. input)
output_content = writefile_binary(args. output)
There should not be a space in args.input. Here is an example:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()


# using type hints can help reasoning about code
def write(filename: str, content: str) -> None:
    with open(filename, 'wb') as f:
        f.write(str.encode(content))


# if the filename was successfully parsed from the command line
if args.filename == 'filename.txt':
    print(f"args: {args.filename}")
    # write to the appropriate output file
    write(filename=args.filename, content="content")
You might need to correct your code's indentation. Python requires indenting code within each function definition, loop, etc.
And as eric points out, there should be no spaces after the periods in args. input and args. output. Change those to args.input and args.output instead.
So:
#!/usr/bin/env python3
import os
import binascii
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend
import argparse


def readfile_binary(file):
    with open(file, 'rb') as f:
        content = f.read()
    return content


def writefile_binary(file, content):
    with open(file, 'wb') as f:
        f.write(content)


def main():
    parser = argparse.ArgumentParser(description = 'Encryption and Decryption of the file')
    parser.add_argument('-in', dest = 'input', required = True)
    parser.add_argument('-out', dest = 'output', required = True)
    parser.add_argument('-K', dest = 'key', help = 'The key to be used for encryption must be in hex')
    parser.add_argument('-iv', dest = 'iv', help = 'The inintialisation vector, must be in hex')
    args = parser.parse_args()
    input_content = readfile_binary(args.input)
    output_content = writefile_binary(args.output)


if __name__ == "__main__":
    main()
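Note that even with the typo fixed, the script only reads the input and never applies a cipher, and writefile_binary is called without any content. A minimal sketch of the missing encryption step using the cryptography primitives the question already imports (AES-CBC with PKCS7 padding is an assumption here, suggested by the -K and -iv hex arguments; it is not necessarily the scheme the author intended):
import binascii
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes


def encrypt_bytes(plaintext: bytes, key_hex: str, iv_hex: str) -> bytes:
    """Encrypt plaintext with AES-CBC, taking a hex-encoded key and IV."""
    key = binascii.unhexlify(key_hex)
    iv = binascii.unhexlify(iv_hex)
    # Pad the plaintext to the AES block size (128 bits).
    padder = padding.PKCS7(128).padder()
    padded = padder.update(plaintext) + padder.finalize()
    cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
    encryptor = cipher.encryptor()
    return encryptor.update(padded) + encryptor.finalize()


# Inside main(), after parsing the arguments, something like:
#     input_content = readfile_binary(args.input)
#     writefile_binary(args.output, encrypt_bytes(input_content, args.key, args.iv))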

Trying to download ~200 files using ThreadPoolExecutor -> some files are skipped

I need to download ~200 files. If I run the code below for the first time, it downloads ~100 files. Then I need to run the code a few more times in order to download the rest of the files (i.e. if I run the code a 2nd time I get +20-30 new files, a 3rd time again +20-30, and so on). Why does this happen, and how can I fix it? Maybe this is important: the server may take up to 10 seconds to generate some files.
import os
import multiprocessing
import concurrent.futures
import urllib.request
import shutil

# Countries, Languages, CountriesTranslations are Django models defined elsewhere in the project.


def get_cities(osm_id, file_name, place_type):
    file_folder = os.path.join(os.getcwd(), place_type)
    file_name = file_name + '_' + place_type
    file_path = os.path.join(file_folder, file_name)
    if not os.path.exists(file_folder):
        os.makedirs(file_folder)
    if not os.path.exists(file_path):
        area_id = str(3600000000 + osm_id)
        url = 'http://overpass-api.de/api/interpreter?data=(node["place"="city"](area:' + area_id + '););out;'
        with urllib.request.urlopen(url) as response, open(file_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)


def cities_controller(place_type='cities'):
    countries = Countries.objects.filter(status=1).exclude(osm_id=None)
    en_group_inst = Languages.objects.filter(iso_code='en').first().group
    with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        for country in countries:
            osm_id = country.osm_id
            file_name = CountriesTranslations.objects.get(
                country=country, lang_group=en_group_inst).common_name.lower().replace(' ', '_')
            executor.submit(get_cities, osm_id, file_name, place_type)


cities_controller()
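One common reason for silently missing files is that exceptions raised inside get_cities (e.g. HTTP errors or slow responses from the Overpass API) are stored in the futures returned by executor.submit and never inspected. A hedged sketch of collecting the futures and surfacing failures, reusing get_cities and the models from the code above:
import concurrent.futures


def cities_controller_checked(place_type='cities'):
    countries = Countries.objects.filter(status=1).exclude(osm_id=None)
    en_group_inst = Languages.objects.filter(iso_code='en').first().group
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        futures = {}
        for country in countries:
            file_name = CountriesTranslations.objects.get(
                country=country, lang_group=en_group_inst).common_name.lower().replace(' ', '_')
            fut = executor.submit(get_cities, country.osm_id, file_name, place_type)
            futures[fut] = file_name
        for fut in concurrent.futures.as_completed(futures):
            try:
                fut.result()  # re-raises any exception from get_cities
            except Exception as exc:
                # A failed download shows up here instead of being silently dropped;
                # these names could then be retried in a second pass.
                print('{} failed: {}'.format(futures[fut], exc))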
