How to use spark.DataFrameReader from Foundry Transforms - apache-spark

I have a file with non-standard formatting, so I need to use spark.DataFrameReader (spark.read.csv) on the raw file directly so that I can set the appropriate parsing configurations.
How can I do this?

You'll want to follow the raw-file access methodology described here. I strongly recommend using unit-test-based methods to iterate on your code to recover the file contents.
Your compute function code will look like this:
from transforms.api import transform, Output, Input
from transforms.verbs.dataframes import union_many


def read_files(spark_session, paths):
    # Read each raw file with the parsing options you need, then union the results
    parsed_dfs = []
    for file_name in paths:
        parsed_df = spark_session.read.option("header", "true").csv(file_name)
        parsed_dfs += [parsed_df]
    output_df = union_many(*parsed_dfs)
    return output_df


@transform(
    the_output=Output("ri.foundry.main.dataset.my-awesome-output"),
    the_input=Input("ri.foundry.main.dataset.my-awesome-input"),
)
def my_compute_function(the_input, the_output, ctx):
    session = ctx.spark_session
    input_filesystem = the_input.filesystem()
    hadoop_path = input_filesystem.hadoop_path
    # ls() yields file entries; build fully qualified paths for spark.read
    files = [hadoop_path + "/" + f.path for f in input_filesystem.ls('**/*.csv.gz')]
    output_df = read_files(session, files)
    the_output.write_dataframe(output_df)
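Since the recommendation above is to iterate with unit tests, here is a minimal pytest sketch for exercising read_files locally. The fixture, sample data, and assertions are illustrative assumptions, and it presumes the transforms imports resolve in your repository's test environment.
import pytest
from pyspark.sql import SparkSession
# read_files would be imported from your transform module in a real test


@pytest.fixture(scope="session")
def spark_session():
    # Small local session for fast iteration (an assumption, not a Foundry-specific API)
    return SparkSession.builder.master("local[1]").appName("csv-parse-test").getOrCreate()


def test_read_files_recovers_rows(spark_session, tmp_path):
    sample = tmp_path / "sample.csv"
    sample.write_text("col_a,col_b\n1,foo\n2,bar\n")
    df = read_files(spark_session, [str(sample)])
    assert df.columns == ["col_a", "col_b"]
    assert df.count() == 2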

Related

Download all CSV files of TensorBoard at once

I wanted to download the data of all my runs at once from TensorBoard, but it seems there's no way to download all of them in one click. Does anyone know a solution to this problem?
This can lead to your answer:
https://stackoverflow.com/a/73409436/11657898
That covers one file, but it's ready to be put into a loop.
I came up with this to solve my problem. First, you'll need to run the TensorBoard on a local host and then scrape the data from the browser.
import pandas as pd
import requests
from csv import reader
import os


def URLs(fold, trial):
    URLs_dict = {
        'train_accuracy': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_accuracy&run=fold{fold}%5C{trial}%5Cexecution0%5Ctrain&format=csv',
        'val_accuracy': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_accuracy&run=fold{fold}%5C{trial}%5Cexecution0%5Cvalidation&format=csv',
        'val_loss': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_loss&run=fold{fold}%5C{trial}%5Cexecution0%5Cvalidation&format=csv',
        'train_loss': f'http://localhost:6006/data/plugin/scalars/scalars?tag=epoch_loss&run=fold{fold}%5C{trial}%5Cexecution0%5Ctrain&format=csv'
    }
    return URLs_dict


def tb_data(log_dir, mode, fold, num_trials):
    trials = os.listdir(log_dir)
    fdf = {}
    for i, trial in enumerate(trials[:num_trials]):
        r = requests.get(URLs(fold, trial)[mode], allow_redirects=True)
        data = r.text
        data_csv = reader(data.splitlines())
        data_csv = list(data_csv)
        df = pd.DataFrame(data_csv)
        headers = df.iloc[0]
        df = pd.DataFrame(df.values[1:], columns=headers)
        if i == 0:
            fdf['Step'] = df['Step']
        fdf[f'trial {trial}'] = df['Value']
    fdf = pd.DataFrame(fdf)
    return fdf
P.S.: It might need a little tweaking for a different directory layout.
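For reference, a hypothetical call (the directory name, fold, and trial count are assumptions) that collects validation accuracy for one fold and saves everything to a single CSV:
# Collect 'val_accuracy' for the first 5 trials of fold 0, then write one CSV
val_acc = tb_data(log_dir='logs/fold0', mode='val_accuracy', fold=0, num_trials=5)
val_acc.to_csv('fold0_val_accuracy.csv', index=False)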

Does creating mid-step temp variables matter? How is dynamic memory allocation done in Python for strings?

Sample name of file uploaded to watchdog folder: 300_processtest_20201008_092912_rx.csv
I get file path from watchdog handler as follow:
elif event.event_type == 'created':
    temp = {}
    print("Received created event - %s." % event.src_path)
    file = event.src_path
    req_id, d_type = get_requestID(file)
    temp['requestID'] = req_id
    temp['data_type'] = d_type
    with open(file) as csv_file:
        csv_reader = csv.DictReader(csv_file, delimiter='|')
        temp['data'] = list(csv_reader)
    r = requests.post(url=UPDATE_ENDPOINT, json=json.loads(json.dumps(temp)))


def get_requestID(file_path):
    split_string = file_path.rpartition("/")
    file_name = split_string[2]
    remove_extension = file_name.rpartition(".")
    split_name = remove_extension[0].rsplit("_", 2)
    data_type = split_name[2]
    request_id = split_name[1]
    print(request_id, data_type)
    # Does doing the above in one line save memory?
    # split_name = file_path.rpartition("/")[2].rpartition(".")[0].rsplit("_", 2)
    # data_type = split_name[2]
    # request_id = split_name[1]
    return request_id, data_type
I am wondering which would be the better way to write this code. I am not sure how memory allocation works in Python, especially for strings (string interning), as mentioned here.
If that is true:
Method 1, using temp variables, takes much more memory but is easily readable.
Method 2, doing it in one line, is a bit harder to read and understand.
Initially I thought both methods would require the same memory, i.e. that creating temp variables doesn't require additional memory. Does it? It makes me realize I never really paid attention to how Python works.
Take advantage of the pathlib module:
from pathlib import Path


def get_requestID(file_path):
    # Path.stem gives the file name without its extension
    stem = Path(file_path).stem          # '300_processtest_20201008_092912_rx'
    split_name = stem.rsplit("_", 2)     # ['300_processtest_20201008', '092912', 'rx']
    request_id = split_name[1]
    data_type = split_name[2]
    return request_id, data_type
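As a small check of the memory intuition (a sketch; the sample name is taken from the question): the chained one-liner still creates the same intermediate tuples and strings, it just discards them as soon as the expression finishes, while the named variables only keep those small objects alive until the function returns.
import sys

file_path = "300_processtest_20201008_092912_rx.csv"

split_string = file_path.rpartition("/")
file_name = split_string[2]
remove_extension = file_name.rpartition(".")
split_name = remove_extension[0].rsplit("_", 2)

# Each intermediate is only a small tuple or short string, not a second copy of everything
for name, obj in [("split_string", split_string), ("file_name", file_name),
                  ("remove_extension", remove_extension), ("split_name", split_name)]:
    print(name, sys.getsizeof(obj), "bytes")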

Wrapping WriteToText Within DoFn

I'm trying to wrap WriteToText within a DoFn to allow for some customization/flexibility in how I write files. Specifically, I want to write different files based on an argument/input (a value provider argument). This is the code I have so far:
class WriteCustomFile(beam.DoFn):
    def __init__(self, input, output):
        self.input = input
        self.output = output

    def process(self, element):
        import re

        def FileVal(path):
            File1Regex = re.compile(r"[^\w](testfile)[\w]+(\.csv|\.txt)$")
            File2Regex = re.compile(r"[^\w](tester)[\w-]+(\.csv|\.txt)$")
            PathStr = str(path)
            if File1Regex.search(PathStr) != None:
                return "file1"
            elif File2Regex.search(PathStr) != None:
                return "file2"

        File1Header = "Header1,Header2,Header3,Header4,Header5"
        File2Header = "Header1,Header2,Header3,Header4,Header5,Header6,Header7,Header8"
        if FileVal(self.input.get()) == "file1":
            yield WriteToText(self.output.get(), shard_name_template='', header=File1Header)
        elif FileVal(self.input.get()) == "file2":
            yield WriteToText(self.output.get(), shard_name_template='', header=File2Header)
When I call this DoFn from within the pipeline, it does not write a file. What can I do to get this DoFn to work or is there a better way to handle this?
Thank you!
The best thing to do here is probably to partition your input into multiple PCollections (either using Partition or a DoFn with multiple outputs) and write each one out separately.
More generally one can use Dynamic Destinations, but this is not yet supported for Python.
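A minimal sketch of that partition-then-write approach, assuming plain text rows read from files; the paths, headers, and the by_file_type predicate are illustrative assumptions, not part of the original pipeline:
import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText

FILE1_HEADER = "Header1,Header2,Header3,Header4,Header5"
FILE2_HEADER = "Header1,Header2,Header3,Header4,Header5,Header6,Header7,Header8"


def by_file_type(element, num_partitions):
    # Illustrative predicate: rows with more columns go to the "file2" output
    return 1 if element.count(",") >= 7 else 0


with beam.Pipeline() as p:
    rows = p | "ReadRows" >> ReadFromText("gs://my-bucket/input/*.csv")  # hypothetical path
    file1_rows, file2_rows = rows | "SplitByType" >> beam.Partition(by_file_type, 2)
    file1_rows | "WriteFile1" >> WriteToText("gs://my-bucket/out/file1", header=FILE1_HEADER)
    file2_rows | "WriteFile2" >> WriteToText("gs://my-bucket/out/file2", header=FILE2_HEADER)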

Execute multiple Steps for entries in a List in Python

I'm trying to load a list from a .txt file and then execute multiple tasks on every single entry. Unfortunately, the tasks are executed on only one entry instead of all of them.
I load the list from the .txt file with this function:
def load_dir_file():
    directory = os.path.dirname(__file__)
    filename = os.path.join(directory, "law_dir")
    with open(filename, "r", encoding="utf-8") as fin:
        dir_file = fin.readlines()
    return dir_file
This is the code to execute those tasks
def create_html():
    dir_lst = load_dir_file()
    for dir_link_dirty in dir_lst:
        dir_link = dir_link_dirty.replace('"', "").replace(",", "").replace("\n", "")
        dir_link_code = urllib.request.urlopen(dir_link)
        bs_dir_link_code = BeautifulSoup(dir_link_code, "html5lib")
        h2_a_tag = bs_dir_link_code.h2.a
        html_link = str(dir_link) + "/" + str(h2_a_tag["href"])
        print(dir_lst)
        return html_link
The .txt file currently looks like this:
"https://www.gesetze-im-internet.de/ao_1977",
"https://www.gesetze-im-internet.de/bbg_2009",
"https://www.gesetze-im-internet.de/bdsg_2018"
I am new to programming and am probably missing some very basic points here. If you want to add any recommendations on how I can improve in general, I would more than appreciate it.
Based on your comment above, it sounds like you want to return a list of html links, not just one. To do that, you need the function to build a list and return that list. You have a lot going on in create_html, so for illustration purposes I split that function into two: create_html_link_list and create_html_link.
def create_html_link(dir_link_dirty):
    dir_link = dir_link_dirty.replace('"', "").replace(",", "").replace("\n", "")
    dir_link_code = urllib.request.urlopen(dir_link)
    bs_dir_link_code = BeautifulSoup(dir_link_code, "html5lib")
    h2_a_tag = bs_dir_link_code.h2.a
    html_link = str(dir_link) + "/" + str(h2_a_tag["href"])
    return html_link


def create_html_link_list():
    dir_lst = load_dir_file()
    html_link_list = [
        create_html_link(dir_link_dirty)
        for dir_link_dirty in dir_lst
    ]
    return html_link_list
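A possible way to call it (the __main__ guard is an assumption added for illustration):
if __name__ == "__main__":
    # Build the full list once, then work with every resolved link
    for link in create_html_link_list():
        print(link)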

python3 get nested dictionary/property from yaml file

I'm trying to figure out how to get nested data as a dictionary/property from a YAML file.
The code below works if I provide the function with only one level, for example:
result = parse_yaml_file(config_yaml_file, 'section')
but it fails if I try something like:
result = parse_yaml_file(yaml_file, 'section.sub-section')
or
result = parse_yaml_file(yaml_file, '[\'section\'][\'sub-section\']')
python3 code :
def parse_yaml_file(yml_file, section):
    print('section : ' + section)
    data_dict = {}
    try:
        with open(yml_file) as f:
            data_dict = yaml.load(f)
    except (FileNotFoundError, IOError):
        exit_with_error('Issue finding/opening ' + yml_file)
    if not section:
        return data_dict
    else:
        return data_dict.get(section)


result = parse_yaml_file(yaml_file, 'section.sub-section.property')
print(json.dumps(result, indent=4))
Is it possible to parse only one part/section of the YAML file?
Or to just retrieve one sub-section/property from the parsed result?
I know I can get it from the dictionary like:
data_dict['section']['sub-section']['property']
but I want it to be flexible and not hardcoded, since the data to grab is provided as an argument to the function.
Thanks a lot for your help.
You could try using a library to help search the parsed YAML file, e.g. dpath:
https://pypi.org/project/dpath/
import yaml
import dpath.util


def parse_yaml(yml_file, section):
    with open(yml_file, 'r') as f:
        data_dict = yaml.safe_load(f)  # safe_load avoids the PyYAML loader warning
    return dpath.util.search(data_dict, section)


parse_yaml('file.yml', 'section/sub-section')
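If you would rather keep the dotted 'section.sub-section.property' syntax from the question and avoid an extra dependency, a minimal sketch is to split the path and walk the parsed dict (the get_nested name is an illustrative assumption):
from functools import reduce
import yaml


def get_nested(yml_file, dotted_path):
    with open(yml_file) as f:
        data = yaml.safe_load(f)
    # Walk one key at a time; returns None if any key along the path is missing
    return reduce(lambda d, key: d.get(key) if isinstance(d, dict) else None,
                  dotted_path.split('.'),
                  data)


result = get_nested('file.yml', 'section.sub-section.property')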
