fetch all files from S3 for the last N days in python

I need to get all files from an S3 directory, using Python, for the last N days. I am using the code below, which fetches all the files in the directory.
folder = 'main'
subfolder = ['test', 'prod']
base = os.path.join(current_directory, 'project')
for i in subfolder:
    bucket_list = bucket.list(prefix="{}/{}".format(folder, i))
    for l in bucket_list:
        keyString = str(l.key)
        d = base + "/" + keyString
        l.get_contents_to_filename(d)
The code below gives only the last modified file. Is there any way to get only the files that were modified or created in the last 2 days?
for i in subfolder:
    bucket_list = bucket.list(prefix="{}/{}".format(folder, i))
    sorted_objs = sorted(bucket_list, key=attrgetter('last_modified'))
    latest = sorted_objs.pop()
    print(latest)

You appear to be using an old version of boto. These days you should be using boto3.
Here is an example to get recent files using the resource method:
import boto3
from datetime import datetime, timedelta, timezone

check_timestamp = datetime.now(timezone.utc) - timedelta(days=5)

s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket('my-bucket')
objects = bucket.objects.filter(Prefix='my-prefix/')

recent_objects = [obj.key for obj in objects if obj.last_modified > check_timestamp]
print(recent_objects)
Here is the same code using the client method:
import boto3
from datetime import datetime, timedelta, timezone

check_timestamp = datetime.now(timezone.utc) - timedelta(days=5)

s3_client = boto3.client('s3')
response = s3_client.list_objects_v2(Bucket='my-bucket', Prefix='my-prefix/')

recent_objects = [obj['Key'] for obj in response['Contents'] if obj['LastModified'] > check_timestamp]
print(recent_objects)
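If you also want to download the matching objects locally, as the original code does, here is a minimal sketch building on the resource method above; the local 'downloads' directory name is just an assumption:

import os
import boto3
from datetime import datetime, timedelta, timezone

check_timestamp = datetime.now(timezone.utc) - timedelta(days=5)
s3_resource = boto3.resource('s3')
bucket = s3_resource.Bucket('my-bucket')

for obj in bucket.objects.filter(Prefix='my-prefix/'):
    if obj.last_modified > check_timestamp:
        # mirror the key path under a hypothetical local 'downloads' directory
        target = os.path.join('downloads', obj.key)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        bucket.download_file(obj.key, target)

Note that list_objects_v2 returns at most 1000 keys per call, so for larger prefixes you would use a paginator; the resource's objects.filter collection pages through results for you.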

Related

Python: Identify invalid online link for a zip file

I am trying to automate stock price data extraction from https://www.nseindia.com/. Data is stored as a zip file and the URL for the zip file varies by date. If the stock market is closed on a certain date, e.g. weekends and holidays, there is no file/URL.
I want to identify invalid links (links that don't exist) and skip to the next link.
This is a valid link -
path = 'https://archives.nseindia.com/content/historical/EQUITIES/2021/MAY/cm05MAY2021bhav.csv.zip'
This is an invalid link - (as 1st May is a weekend and stock market is closed for the day)
path2 = 'https://archives.nseindia.com/content/historical/EQUITIES/2021/MAY/cm01MAY2021bhav.csv.zip'
This is what I do to extract the data
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
import pandas as pd
import datetime

start_date = datetime.date(2021, 5, 3)
end_date = datetime.date(2021, 5, 7)
delta = datetime.timedelta(days=1)

final = pd.DataFrame()
while start_date <= end_date:
    print(start_date)
    day = start_date.strftime('%d')
    month = start_date.strftime('%b').upper()
    year = start_date.strftime('%Y')
    start_date += delta
    path = 'https://archives.nseindia.com/content/historical/EQUITIES/' + year + '/' + month + '/cm' + day + month + year + 'bhav.csv.zip'
    file = 'cm' + day + month + year + 'bhav.csv'
    try:
        with urlopen(path) as f:
            with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
                foofile = myzipfile.open(file)
                df = pd.read_csv(foofile)
                final.append(df)
    except:
        print(file + ' not there')
If the path is invalid, Python gets stuck and I have to restart it. I am not able to handle the error or identify an invalid link while looping over multiple dates.
This is what I have tried so far to differentiate between valid and invalid links:
# Attempt 1
import os
os.path.exists(path)
os.path.isfile(path)
os.path.isdir(path)
os.path.islink(path)
# output is False for both path and path2

# Attempt 2
import validators
validators.url(path)
# output is True for both path and path2

# Attempt 3
import requests
site_ping = requests.get(path)
site_ping.status_code < 400
# Output for path is True, but Python crashes/gets stuck when I run requests.get(path2) and I have to restart every time.
Thanks for your help in advance.
As suggested by SuperStormer, adding a timeout to the request solved the issue:
try:
    with urlopen(zipFileURL, timeout=5) as f:
        with BytesIO(f.read()) as b, ZipFile(b) as myzipfile:
            foofile = myzipfile.open(file)
            df = pd.read_csv(foofile)
            final.append(df)
except:
    print(file + ' not there')
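If you would rather detect a missing file explicitly than rely on a bare except, one option (a sketch, assuming the same path variable as above) is to keep the timeout and catch urllib's specific exceptions:

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

try:
    with urlopen(path, timeout=5) as f:
        data = f.read()
except HTTPError as e:
    # the server answered, but e.g. with 404 because there is no file for that date
    print(path, 'returned HTTP status', e.code)
except URLError as e:
    # no usable response at all (DNS failure, connection refused, timeout, ...)
    print(path, 'failed:', e.reason)

This keeps real download errors visible instead of silently treating every exception as a missing file.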

Unable to infer schema when loading file

The code below works fine when run in the pyspark shell, but it fails when executed via spark-submit in yarn master mode.
What am I doing wrong here?
from datetime import date, timedelta
import pandas as pd
import os, sys
from pyspark.sql import SparkSession

startd = '20140101'
endd = str(sys.argv[1])
currd = str(sys.argv[2])

spark = SparkSession.builder.getOrCreate()

base = "s3://metadata_v1/DATE="
dstart = pd.to_datetime(startd).date()
dend = pd.to_datetime(endd).date()

s3 = []
days = [dstart + timedelta(days=x) for x in range((dend - dstart).days + 1) if (dstart + timedelta(days=x)).weekday() == 5]
for i in days:
    s3.append(base + i.strftime('%Y-%m-%d'))

data = spark.read.option("header", "True").option("delimiter", "|").option("basePath", "s3://metadata_v1/").csv(s3)
Error:
pyspark.sql.utils.AnalysisException: u'Unable to infer schema for CSV. It must be specified manually.;'
This can happen when some of your CSV files contain a header row, so some columns cannot be loaded when Spark tries to infer and convert the data types. You can try removing the header row from each CSV file before reading it into a dataframe.
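Alternatively, since the error says the schema must be specified manually, you can pass an explicit schema so Spark never has to infer one. This is only a sketch; the column names and types below are placeholders for whatever your files actually contain:

from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# hypothetical columns - replace with the real layout of your files
schema = StructType([
    StructField("id", StringType(), True),
    StructField("value", DoubleType(), True),
])

data = spark.read.option("header", "True") \
    .option("delimiter", "|") \
    .option("basePath", "s3://metadata_v1/") \
    .schema(schema) \
    .csv(s3)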

Downloaded GZ files is showing 0 byte

I am using the OCI Python SDK, and when I try to download an object (from an OCI bucket) in GZ format, it downloads, but the file size is zero bytes. Attaching the code below.
Any help is much appreciated.
import os
import oci
import io
import sys

reporting_namespace = 'xygabcdef'
prefix_file = "abc/xyz"

# Update these values
destination_path = 'downloaded_reports'

# Make a directory to receive reports
if not os.path.exists(destination_path):
    os.mkdir(destination_path)

# Get the list of reports
config = oci.config.from_file(oci.config.DEFAULT_LOCATION, oci.config.DEFAULT_PROFILE)
reporting_bucket = sys.argv[1]
object_storage = oci.object_storage.ObjectStorageClient(config)
report_bucket_objects = object_storage.list_objects(reporting_namespace, reporting_bucket, prefix=prefix_file)

#def download_audit():
for o in report_bucket_objects.data.objects:
    print('Found file ' + o.name)
    object_details = object_storage.get_object(reporting_namespace, reporting_bucket, o.name)
    print(object_details)
    filename = o.name.rsplit('/', 1)[-1]
    with open(destination_path + '/' + filename, 'wb') as f:
        for chunk in object_details.data.raw.stream(1024 * 1024, decode_content=False):
            f.write(chunk)
Please see the example here. Does this work for you? Namely:
get_obj = object_storage.get_object(namespace, bucket_name, example_file_object_name)
with open('example_file_retrieved', 'wb') as f:
    for chunk in get_obj.data.raw.stream(1024 * 1024, decode_content=False):
        f.write(chunk)
In your example destintation_path seems to be undefined, and seems to have a typo (destintation -> destination). Could this be the problem?
Lastly, what does object_details report the file size / content-length as? It could be that the file size of the object in Object Storage is itself 0 bytes.
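For example, a quick way to check (a sketch, assuming the same loop variables as in your code) is to print the content-length header before writing anything:

object_details = object_storage.get_object(reporting_namespace, reporting_bucket, o.name)
# if this prints 0, the object stored in Object Storage is itself empty
print(o.name, 'content-length:', object_details.headers.get('content-length'))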
The .content from the .data of get_object should give you the file data (binary or text/json/...), so here is a modified version of your code:
import os
import sys
import oci

reporting_namespace = 'xygabcdef'
prefix_file = "abc/xyz"

# Update these values
destination_path = 'downloaded_reports'

# Get the list of reports
config = oci.config.from_file(oci.config.DEFAULT_LOCATION, oci.config.DEFAULT_PROFILE)
reporting_bucket = sys.argv[1]
object_storage = oci.object_storage.ObjectStorageClient(config)
objects = object_storage.list_objects(reporting_namespace, reporting_bucket, prefix=prefix_file).data

# def download_audit():
for obj in objects.objects:
    print('Found file ' + obj.name)
    object_response = object_storage.get_object(reporting_namespace, reporting_bucket, obj.name).data
    print(object_response)

    file_path = os.path.join(destination_path, obj.name)
    # Make sure parent dirs up to the file level are created
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as file:
        file.write(object_response.content)

Rename by Appending a prefix to a file name

I would appreciate it if someone could give me a hint. I have to rename a batch of files by adding a prefix (date) to each file name, so the files are organized in the folder from older to newer.
The date itself is contained inside the file; therefore, my script has to open the file, find the date, and use it as a prefix to add to the file name.
from datetime import datetime
import re
import os
file = open('blog_entry.txt', 'r', encoding='utf-8')
source_code = file.read()
<...>
# convert the date:
date = datetime.strptime(date_only, "%d-%b-%Y")
new_date = date.strftime('%Y_%m_%d')
The new_date variable should be used as a prefix, so the new file name looks like "yyyy_mm_dd blog_entry.txt".
I cannot wrap my head around how to generate the new name using this prefix, so that I can apply the os.rename(old_name, new_name) command to the file.
Here is one way, using string concatenation to build the new filename you want:
from datetime import datetime
import re
import os
file = open('blog_entry.txt', 'r', encoding='utf-8')
source_code = file.read()
# read the date from the file contents
date = datetime.strptime(date_only, "%d-%b-%Y")
new_date = date.strftime('%Y_%m_%d')
path = "/path/to/your/file/"
os.rename(path + 'blog_entry.txt', path + new_date + ' ' + 'blog_entry.txt')
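A slightly more robust variant builds both paths with os.path.join instead of manual concatenation; it assumes the same date_only value read from the file contents:

import os
from datetime import datetime

date = datetime.strptime(date_only, "%d-%b-%Y")
new_date = date.strftime('%Y_%m_%d')

folder = "/path/to/your/file"
old_name = os.path.join(folder, 'blog_entry.txt')
new_name = os.path.join(folder, new_date + ' ' + 'blog_entry.txt')

# e.g. renames blog_entry.txt to "2021_05_03 blog_entry.txt"
os.rename(old_name, new_name)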

How do I sort a text file by column numerically?

from lxml import html
import operator
import discord
import yaml
import csv
import json
import requests

raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]

new = []
for i in range(0, 197):
    price = (stuff[i]['Last'])
    name1 = (stuff[i]['MarketName'])
    name = name1.replace("BTC-", "")
    prev = (stuff[i]['PrevDay'])
    diff = price - prev
    change = round(((price - prev) / price) * 100, 2)
    final = ('{0},{1}'.format(name, change))
    new.append(final)

butFirst = new[0:]
this1 = ("\n".join(butFirst))

text_file = open("Sort.txt", "w")
text_file.write(this1)
text_file.close()
I'm having problems sorting this output by the second column. I get base 10 errors, integer errors, etc. I think the problem is how the number is stored, but I can't figure it out. The output looks like this:
1ST,-5.94
2GIVE,3.45
ABY,2.44
ADA,0.0
ADT,-4.87
ADX,-13.09
AEON,-2.86
AGRS,-2.0
You should avoid changing your data to text earlier than you need to. If you operate with a list of dictionaries it's very easy to sort the list.
import json
import csv
import requests

raw_json = requests.get('https://bittrex.com/api/v1.1/public/getmarketsummaries').text
json_dict = json.loads(raw_json)
stuff = json_dict["result"]

new = []
for i in range(0, 197):
    price = float(stuff[i]['Last'])
    prev = float(stuff[i]['PrevDay'])
    # Use a dictionary to hold the data
    d = {
        'name': stuff[i]['MarketName'].replace("BTC-", ""),
        'change': round(((price - prev) / price) * 100, 2)
    }
    new.append(d)

# The actual sorting part, sorting by change
sorted_list = sorted(new, key=lambda k: k['change'])

# Writing the dictionaries to file
with open("Sort.txt", "w") as text_file:
    dict_writer = csv.DictWriter(text_file, sorted_list[0].keys())
    # include the line below, if you want headers
    # dict_writer.writeheader()
    dict_writer.writerows(sorted_list)
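If you already have Sort.txt on disk in the name,change format shown above and just want to sort it numerically by the second column, a short sketch would be:

import csv

with open("Sort.txt", newline="") as f:
    rows = [row for row in csv.reader(f) if row]

# sort on the second column as a number, not as a string
rows.sort(key=lambda row: float(row[1]))

with open("Sort.txt", "w", newline="") as f:
    csv.writer(f).writerows(rows)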
