Download all CSV files of TensorBoard at once - python-3.x

I wanted to download the data for all of my runs in TensorBoard at once,
but there doesn't seem to be a way to download them all in one click. Does anyone know a solution to this problem?

This may lead you to your answer:
https://stackoverflow.com/a/73409436/11657898
That's for a single file, but it's ready to be put into a loop.

I came up with the following to solve my problem. First, you'll need to run TensorBoard on localhost, then scrape the data it serves to the browser.
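(Typically that means launching something like tensorboard --logdir <your log dir> --port 6006, so that the scalar endpoints used below are reachable at localhost:6006.)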
import pandas as pd
import requests
from csv import reader
import os

def URLs(fold, trial):
    # CSV endpoints exposed by the TensorBoard scalars plugin for one fold/trial run.
    URLs_dict = {
        'train_accuracy': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_accuracy&run=fold{fold}%5C{trial}%5Cexecution0%5Ctrain&format=csv',
        'val_accuracy': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_accuracy&run=fold{fold}%5C{trial}%5Cexecution0%5Cvalidation&format=csv',
        'val_loss': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_loss&run=fold{fold}%5C{trial}%5Cexecution0%5Cvalidation&format=csv',
        'train_loss': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_loss&run=fold{fold}%5C{trial}%5Cexecution0%5Ctrain&format=csv'
    }
    return URLs_dict

def tb_data(log_dir, mode, fold, num_trials):
    trials = os.listdir(log_dir)
    fdf = {}
    for i, trial in enumerate(trials[:num_trials]):
        r = requests.get(URLs(fold, trial)[mode], allow_redirects=True)
        data = r.text
        data_csv = reader(data.splitlines())
        data_csv = list(data_csv)
        df = pd.DataFrame(data_csv)
        headers = df.iloc[0]
        df = pd.DataFrame(df.values[1:], columns=headers)
        if i == 0:
            fdf['Step'] = df['Step']
        fdf[f'trial {trial}'] = df['Value']
    fdf = pd.DataFrame(fdf)
    return fdf
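For reference, a minimal usage sketch (the log directory, fold number and trial count below are placeholders, not taken from the answer), assuming TensorBoard is already serving the logs:

# Hypothetical example: collect validation accuracy for fold 0 across the first 5 trials
# and dump everything into a single CSV file.
val_acc = tb_data(log_dir='logs/fold0', mode='val_accuracy', fold=0, num_trials=5)
val_acc.to_csv('fold0_val_accuracy.csv', index=False)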
P.S.: It might need a little tweaking for a different directory layout.

Related

How to get a download link which requires checking checkboxes in an additional dialog box

I want to download the last publicly available file from https://sam.gov/data-services/Exclusions/Public%20V2?privacy=Public
When downloading manually, the real download links look like:
https://falextracts.s3.amazonaws.com/Exclusions/Public%20V2/SAM_Exclusions_Public_Extract_V2_22150.ZIP?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T143743Z&X-Amz-SignedHeaders=host&X-Amz-Expires=2699&X-Amz-Credential=AKIAY3LPYEEXWOQWHCIY%2F20220530%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=3eca59f75a4e1f6aa59fc810da8f391f1ebfd8ca5a804d56b79c3eb9c4d82e32
My function only gets the initial link, which refers to the real one:
import json
import requests
from operator import itemgetter

files_url = 'https://sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=1653676394983&domain=Exclusions/Public%20V2&privacy=Public'

def get_file():
    response = requests.get(files_url, stream=True)
    links_resp = json.loads(response.text)
    links_dicts = [d for d in links_resp['_embedded']['customS3ObjectSummaryList'] if d['displayKey'].count('SAM_Exclus')]
    sorted_links = sorted(links_dicts, key=itemgetter('dateModified'), reverse=True)
    return sorted_links[0]['_links']['self']['href']

get_file()
Result:
'https://s3.amazonaws.com/falextracts/Exclusions/Public V2/SAM_Exclusions_Public_Extract_V2_22150.ZIP'
But following the above link, I get Access denied, so I would appreciate any hints on how to get the real download links.
I've edited your code as little as possible so you can follow it. The requests library can convert the response to JSON itself,
and imports that are not at the beginning of the code do not read well.
import requests as req
from operator import itemgetter

files_url = "https://sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=1653676394983&domain=Exclusions/Public%20V2&privacy=Public"
down_url = "https://sam.gov/api/prod/fileextractservices/v1/api/download/Exclusions/Public%20V2/{}?privacy=Public"

def get_file():
    response = req.get(files_url, stream=True).json()
    links_dicts = [d for d in response["_embedded"]["customS3ObjectSummaryList"]]
    sorted_links = sorted(links_dicts, key=itemgetter('dateModified'), reverse=True)
    key = sorted_links[0]['displayKey']
    down = req.get(down_url.format(key))
    if not down.status_code == 200:
        return False
    print(key)
    open(key, 'wb').write(down.content)
    return True

get_file()
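Presumably the reason the bare s3.amazonaws.com href fails is that it is unsigned: the links that work in the browser carry the X-Amz-Signature query parameters, and the sam.gov download endpoint used in down_url above appears to take care of issuing that signed request for you.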

boto3 read bucket files using concurrency method

I am trying to read bucket files without saving them to a file:
import boto3
import botocore
from io import StringIO
import pandas as pd

s3 = boto3.resource('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
bucket = s3.Bucket('deutsche-boerse-xetra-pds')
objects = bucket.objects.filter(Prefix=date)
for obj in objects:
    file = pd.read_csv(StringIO(bucket.Object(key=obj.key).get().get('Body').read().decode('utf-8')))
This code works quite well. However, I would like to use concurrency (Python asyncio) to speed up the reading process. I searched the documentation, but I could only find something for the download function, not for the get function.
Do you have any suggestions?
Thanks in advance.
I found a solution that works with multiprocessing, since my final goal was to reduce the processing time.
The code is as follows:
import multiprocessing
from io import StringIO
from typing import List

import boto3
import botocore
import pandas as pd

def generate_bucket():
    s3_resource = boto3.resource('s3', config=botocore.config.Config(signature_version=botocore.UNSIGNED))
    xetra_bucket = s3_resource.Bucket('deutsche-boerse-xetra-pds')
    return s3_resource, xetra_bucket

def read_csv(object_key):
    s3local, bucket_local = generate_bucket()
    return pd.read_csv(StringIO(bucket_local.Object(key=object_key).get().get('Body').read().decode('utf-8')))

def import_raw_data(date: List[str]) -> pd.DataFrame:
    s3local, bucket_local2 = generate_bucket()
    objects = [i.key for i in list(bucket_local2.objects.filter(Prefix=date[0]))]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        df = pd.concat(p.map(read_csv, objects))
    return df
This works for me, but I am sure there is room to improve this code. I'm open to suggestions.
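As a side note (not part of the original answer): since the bottleneck here is network I/O rather than CPU, a thread pool can give a similar speed-up without spawning processes. A minimal sketch under that assumption, reusing the public bucket name from above:

from concurrent.futures import ThreadPoolExecutor
from io import StringIO

import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config

def read_key(key):
    # Create the resource per call, mirroring generate_bucket() above,
    # since boto3 resources are not guaranteed to be thread-safe.
    s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))
    bucket = s3.Bucket('deutsche-boerse-xetra-pds')
    body = bucket.Object(key=key).get()['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(body))

def import_raw_data_threaded(prefix, max_workers=16):
    s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))
    bucket = s3.Bucket('deutsche-boerse-xetra-pds')
    keys = [obj.key for obj in bucket.objects.filter(Prefix=prefix)]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        frames = list(pool.map(read_key, keys))
    return pd.concat(frames, ignore_index=True)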

How to use spark.DataFrameReader from Foundry Transforms

I have a file with non-standard formatting, so I need to use spark.DataFrameReader (spark.read.csv) on the raw file directly so that I can set the appropriate parsing configurations.
How can I do this?
You'll want to follow the methodology described here. I strongly recommend using unit-test-based methods to iterate on your code while recovering the file contents.
Your compute function code will look like:
from transforms.api import transform, Output, Input
from transforms.verbs.dataframes import union_many

def read_files(spark_session, paths):
    parsed_dfs = []
    for file_name in paths:
        parsed_df = spark_session.read.option("header", "true").csv(file_name)
        parsed_dfs += [parsed_df]
    output_df = union_many(*parsed_dfs)
    return output_df

@transform(
    the_output=Output("ri.foundry.main.dataset.my-awesome-output"),
    the_input=Input("ri.foundry.main.dataset.my-awesome-input"),
)
def my_compute_function(the_input, the_output, ctx):
    session = ctx.spark_session
    input_filesystem = the_input.filesystem()
    hadoop_path = input_filesystem.hadoop_path
    files = input_filesystem.ls('**/*.csv.gz').map(lambda file_name: hadoop_path + file_name)
    output_df = read_files(session, files)
    the_output.write_dataframe(output_df)
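Since the question is about non-standard formatting, one addition that is not part of the original answer: the parsing configuration goes on that same reader chain. The options below are standard Spark CSV reader options and are only examples; adjust them to the actual file:

parsed_df = (
    spark_session.read
    .option("header", "true")
    .option("delimiter", ";")      # e.g. semicolon-separated values
    .option("quote", '"')
    .option("inferSchema", "true")
    .csv(file_name)
)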

Saving data in Selenium

I am trying to save the output data after running a successful script in Python using Selenium, but I am not able to save the result at the end of my run. My code runs fine; the only problem is that I cannot save the output to a file, which could be .json, .csv or text. I need serious help on this one.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests

# saving data in bangkok_vendor.text
def copy_info():
    with open('bangkok_vendor.text', 'a') as wt:
        for x in script3:
            wt.write(x)
        wt.close()
    return

contents = []
filename = 'link_business_filter.csv'

with open(filename, 'rt') as f:
    data = csv.reader(f)
    for row in data:
        links = row[0]
        contents.append(links)

for link in contents:
    url_html = requests.get(link)
    print(link)
    browser = webdriver.Chrome('chromedriver')
    open = browser.get(link)
    source = browser.page_source
    data = bs(source, "html.parser")
    body = data.find('body')
    script = body
    x_path = '//*[@id="react-root"]/section/main/div'
    script2 = browser.find_element_by_xpath(x_path)
    script3 = script2.text
    #script2.send_keys(keys.COMMAND + 't')
    browser.close()
    print(script3)
    copy_info()
Did you try using csv.writer for CSV files? Please check out the following link; hope it helps.
Save results to csv file with Python
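For instance, a minimal sketch of appending each scraped block of text as a row of a CSV file (the file name and one-column layout are placeholders, not taken from the question):

import csv

def save_row(text, path='bangkok_vendor.csv'):
    # Append one scraped result per row; newline='' avoids blank lines on Windows.
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([text])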

Saving selenium results/output at run time in text file using Python

I am running a script in Python 3 using Selenium and I am getting the output I expected. Now I want to save that output to a text, csv or json file. When I try to run my script and save the result to a file, I get an error at with open('bangkok_vendor.txt','a') as wt:
TypeError: 'NoneType' object is not callable
This means the loop in the program runs only one time and does not store data in the file called bangkok_vendor.txt. In normal Python scraper programs I wouldn't have any problem storing data, but this is the first time I am using Selenium. Can you please help me with a solution, thanks.
I am trying to run this script from my terminal, and the output is what I want to save to any file format:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests

contents = []
filename = 'link_business_filter.csv'

def copy_json():
    with open("bangkok_vendor.text", 'w') as wt:
        for x in script2:
            wt.writer(x)
        wt.close()

with open(filename, 'rt') as f:
    data = csv.reader(f)
    for row in data:
        links = row[0]
        contents.append(links)

for link in contents:
    url_html = requests.get(link)
    print(link)
    browser = webdriver.Chrome('chromedriver')
    open = browser.get(link)
    source = browser.page_source
    data = bs(source, "html.parser")
    body = data.find('body')
    script = body
    x_path = '//*[@id="react-root"]/section/main/div'
    script2 = browser.find_element_by_xpath(x_path)
    script3 = script2.text
    #script2.send_keys(keys.COMMAND + 't')
    browser.close()
    print(script3)
You need to pass script2 as a parameter to the copy_json function and call it when you extract the data from the page.
Change the write mode to append, otherwise the file is reset every time you call copy_json.
Don't overwrite built-in functions like open, otherwise you won't be able to open a file to write data once you move on to the second iteration.
I refactored your code a bit:
import csv

from selenium import webdriver

LINK_CSV = 'link_business_filter.csv'
SAVE_PATH = 'bangkok_vendor.txt'

def read_links():
    links = []
    with open(LINK_CSV) as f:
        reader = csv.reader(f)
        for row in reader:
            links.append(row[0])
    return links

def write_data(data):
    with open(SAVE_PATH, mode='a') as f:
        f.write(data + "\n")

if __name__ == '__main__':
    browser = webdriver.Chrome('chromedriver')
    links = read_links()
    for link in links:
        browser.get(link)

        # You may have to wait a bit here
        # until the page is loaded completely
        html = browser.page_source

        # Not sure what you're trying to do with body
        # soup = BeautifulSoup(html, "html.parser")
        # body = soup.find('body')

        x_path = '//*[@id="react-root"]/section/main/div'
        main_div = browser.find_element_by_xpath(x_path)
        text = main_div.text
        write_data(text)

    # close browser after every link is processed
    browser.quit()
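One extra note that is not part of the original answer: recent Selenium 4 releases remove find_element_by_xpath in favour of the By locator, so on a current version the lookup would be:

from selenium.webdriver.common.by import By

main_div = browser.find_element(By.XPATH, x_path)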
