Why can't I split files when generating some TFRecord files? - python-3.x

I'm working on predicting protein structures. As you may know, one protein molecule can have several strands, so I need to split the list of atoms into different TFRecord files by strand name.
The problem is that this code ends up generating several TFRecord files with nothing written in them. All blank.
Alternatively, is there a way to split the strands while training my model? Then I could ignore this problem and store the strand name in the TFRecords as a feature.
'''
All modules are imported and no errors are raised.
'''
def generate_TFrecord(intPosition, endPosition, path):
    CrtS = x  # x is the name of the current strand
    path = path + CrtS
    writer = tf.io.TFRecordWriter('%s.tfrecord' % path)
    for i in range(intPosition, endPosition):
        if identifyCoreCarbon(i):
            vectors = getVectors(i)
            features = {}
            '''
            feeding this dict
            '''
            tf_features = tf.train.Features(feature=features)
            tf_example = tf.train.Example(features=tf_features)
            tf_serialized = tf_example.SerializeToString()
            writer.write(tf_serialized)
            '''
            if checkStrand(i) == False:
                writer.write(tf_serialized)
                intPosition = i
            '''
    writer.close()
'''
strand_index is a list of the start positions of all the strands
'''
for loop in strand_index:
    generate_TFrecord(loop, endPosition, path)
'''
________division___________
The code below works, but it only generates a single tfrecord containing all the atom information.

writer = tf.io.TFRecordWriter('%s.tfrecord' % path)
for i in range(0, endPosition):
    if identifyCoreCarbon(i):
        vectors = getVectors(i)
        features = {}
        # feeding features
        tf_features = tf.train.Features(feature=features)
        tf_example = tf.train.Example(features=tf_features)
        tf_serialized = tf_example.SerializeToString()
        writer.write(tf_serialized)
writer.close()
'''
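For reference, a minimal sketch of one way to split examples by strand: keep one TFRecordWriter per strand in a dict and route each serialized example to the writer for its strand. The helpers getStrandName and build_features below are hypothetical stand-ins for the question's own functions; identifyCoreCarbon is the one from the question.

import tensorflow as tf

def write_per_strand(endPosition, path):
    writers = {}  # one open TFRecordWriter per strand name
    for i in range(endPosition):
        if not identifyCoreCarbon(i):          # helper from the question
            continue
        strand = getStrandName(i)              # hypothetical: returns the strand name of atom i
        if strand not in writers:
            writers[strand] = tf.io.TFRecordWriter('%s%s.tfrecord' % (path, strand))
        features = build_features(i)           # hypothetical: returns a dict of tf.train.Feature
        example = tf.train.Example(features=tf.train.Features(feature=features))
        writers[strand].write(example.SerializeToString())
    for w in writers.values():                 # close every writer so the records are flushed to disk
        w.close()

Note that TFRecordWriter buffers its output, so files can look empty until writer.close() (or a with-block exit) runs; unclosed writers are one common cause of blank .tfrecord files.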

Related

Method reads properly but the written text file only has 1 line. Is \n not working?

The goal is to extract specific data from a text file in one folder
and then write that data into another file in a different folder.
The extraction part works: the values are saved to variables and can even be printed.
The problem arises when trying to write them to a file:
the file is empty.
I need to write in this format:
{self.title};;;{self.author};;;{self.release_date};;;
{self.last_update_date};;;{self.language};;;{self.producer};;;{self.book_path}
# This class includes all the operations related to a book
class Operation:
    """
    Need to include these class variables
    book_title_list (List of all book titles such as "[title1, title2, title3, ...]")
    book_info_dict = "{title1: obj1, title2: obj2, title3: obj3, ...}"
    """
    book_folder_path = './data/books_data/'
    book_info_path = './data/result_data/books.txt'

    def extract_book_info(self):
        directory_files = os.listdir(self.book_folder_path)  # Stores the .txt files under the books_data folder
        try:
            for i in directory_files:
                with open(f'{self.book_folder_path}/{i}', 'r', encoding='utf8') as f:
                    f_contents = f.readlines()
                    f_line_free = list(map(lambda x: x.strip(), f_contents))
                    f_lists = f_line_free[10:22]  # Slicing only the required elements of the list
                    """
                    Extracting only the necessary part and storing it
                    under proper variables
                    """
                    title = f_lists[0]
                    author = f_lists[2]
                    release_date = f_lists[4]
                    last_update_date = f_lists[5]
                    language = f_lists[7]
                    producer = f_lists[11]
                    """
                    Extracting the desired values
                    """
                    title_data = title[7:]
                    author_data = author[8:]
                    release_date_data = release_date[14:]
                    last_update_date_data = last_update_date[24:-1]
                    language_data = language[10:]
                    producer_data = producer[13:]
                    print(title_data)
                    with open(self.book_info_path, 'w', encoding="utf8") as wf:
                        wf.write(f'{title_data};;;{author_data};;;{release_date_data};;;'
                                 f'{last_update_date_data};;;{language_data};;;{producer_data};;;{self.book_info_path}\n')
            return True
        except FileNotFoundError:
            return False
        except Exception:
            return False
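One detail worth noting: the output file is opened with mode 'w' inside the loop, and 'w' truncates the file on every open, so each book overwrites the previous one. Below is a minimal sketch of opening the result file once and appending one line per book, assuming the same folder layout; the field offsets shown are illustrative, not the question's exact ones.

import os

BOOK_FOLDER_PATH = './data/books_data/'            # same folders as in the question
BOOK_INFO_PATH = './data/result_data/books.txt'

def extract_book_info():
    # Open the result file once, so mode 'w' does not truncate it for every book.
    with open(BOOK_INFO_PATH, 'w', encoding='utf8') as wf:
        for name in os.listdir(BOOK_FOLDER_PATH):
            with open(f'{BOOK_FOLDER_PATH}/{name}', 'r', encoding='utf8') as f:
                lines = [line.strip() for line in f]
            fields = lines[10:22]                  # same slice used in the question
            title_data = fields[0][7:]             # illustrative offsets only
            author_data = fields[2][8:]
            wf.write(f'{title_data};;;{author_data};;;\n')
    return True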

Working with multiple large datasets where each dataset contains multiple values - PyTorch

I'm training a neural network and have more than 15 GB of data in a folder. The folder contains multiple pickle files, and each file contains two lists that each hold multiple values.
This looks like the following:
dataset_folder:\
    file.pickle
    file_2.pickle
    ...
    file_n.pickle
Each file_*.pickle contains a variable-length list (list x and list y).
How do I load all the data to train the model without running into memory issues?
By implementing a custom Dataset class, as provided by PyTorch, we need to implement three methods so the PyTorch loader can work with your data:
__len__
__getitem__
__init__
Let's go through how to implement each one of them separately.
__init__
def __init__(self):
    # Original data has the following format
    """
    dict_object =
    {
        "x":[],
        "y":[]
    }
    """
    DIRECTORY = "data/raw"
    self.dataset_file_name = os.listdir(DIRECTORY)
    self.dataset_file_name_index = 0
    self.dataset_length = 0
    self.prefix_sum_idx = list()
    # Loop over each file and calculate the length of the overall dataset
    # (you might need to check that file_name is a file)
    for file_name in os.listdir(DIRECTORY):
        with open(f'{DIRECTORY}/{file_name}', "rb") as openfile:
            dict_object = pickle.load(openfile)
            curr_page_sum = len(dict_object["x"]) + len(dict_object["y"])
            self.prefix_sum_idx.append(curr_page_sum)
            self.dataset_length += curr_page_sum
    # Prefix sum, so we know which file each index falls into.
    for i in range(1, len(self.prefix_sum_idx)):
        self.prefix_sum_idx[i] = self.prefix_sum_idx[i] + self.prefix_sum_idx[i - 1]
    assert self.prefix_sum_idx[-1] == self.dataset_length
    self.x = []
    self.y = []
As you can see above, the main idea is to use a prefix sum to treat the whole dataset as one, so that whenever you need access to a specific index later, you simply look into prefix_sum_idx to see which file that index falls into.
For example, say we need to access index 150. Thanks to the prefix sum, we now know that index 150 lives in the second .pickle file. We still need a fast mechanism to find where that index sits within prefix_sum_idx; this is explained in __getitem__.
__getitem__
def read_pickle_file(self, idx):
    file_name = self.dataset_file_name[idx]
    dict_object = dict()
    with open(f'{YOUR_DIRECTORY}/{file_name}', "rb") as openfile:
        dict_object = pickle.load(openfile)
    self.x = dict_object['x']
    self.y = dict_object['y']  # some logic here
    # ...
    # Some logic here...

def __getitem__(self, idx):
    # Similar to C++ std::upper_bound - O(log n)
    temp = bisect.bisect_right(self.prefix_sum_idx, idx)
    self.read_pickle_file(temp)
    # Offset of idx inside the file found above (0 for the first file).
    local_idx = idx if temp == 0 else idx - self.prefix_sum_idx[temp - 1]
    return self.x[local_idx], self.y[local_idx]
Check the bisect_right() docs for the details of how it works; put simply, it returns the rightmost position in a sorted list at which the given element could be inserted while keeping the list sorted. In our approach, we are interested only in one question: "which file should I access in order to get the appropriate data?". More importantly, it does so in O(log n).
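As a quick illustration of the lookup (with a made-up prefix_sum_idx), bisect_right maps a global index to the position of the first cumulative count that exceeds it:

import bisect

prefix_sum_idx = [100, 250, 300]   # hypothetical cumulative counts for three files

# Index 150 falls past the first file's 100 items, so it lives in file 1 (the second file).
file_idx = bisect.bisect_right(prefix_sum_idx, 150)   # -> 1
local_idx = 150 - prefix_sum_idx[file_idx - 1]        # -> 50, the offset inside that file
print(file_idx, local_idx)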
__len__
def __len__(self):
    return self.dataset_length
In order to get the length of our dataset, we loop through each file and accumulate the results, as shown in __init__.
The full code sample goes like this:
import pickle
import torch
import torch.nn as nn
import numpy
import os
import bisect
from torch.utils.data import Dataset, DataLoader
from src.data.make_dataset import main
from torch.nn import functional as F


class dataset(Dataset):
    def __init__(self):
        # Original data has the following format
        """
        dict_object =
        {
            "x":[],
            "y":[]
        }
        """
        DIRECTORY = "data/raw"
        self.dataset_file_name = os.listdir(DIRECTORY)
        self.dataset_file_name_index = 0
        self.dataset_length = 0
        self.prefix_sum_idx = list()
        # Loop over each file and calculate the length of the overall dataset
        # (you might need to check that file_name is a file)
        for file_name in os.listdir(DIRECTORY):
            with open(f'{DIRECTORY}/{file_name}', "rb") as openfile:
                dict_object = pickle.load(openfile)
                curr_page_sum = len(dict_object["x"]) + len(dict_object["y"])
                self.prefix_sum_idx.append(curr_page_sum)
                self.dataset_length += curr_page_sum
        # Prefix sum, so we know which file each index falls into.
        for i in range(1, len(self.prefix_sum_idx)):
            self.prefix_sum_idx[i] = self.prefix_sum_idx[i] + self.prefix_sum_idx[i - 1]
        assert self.prefix_sum_idx[-1] == self.dataset_length
        self.x = []
        self.y = []

    def read_pickle_file(self, idx):
        file_name = self.dataset_file_name[idx]
        dict_object = dict()
        with open(f'{YOUR_DIRECTORY}/{file_name}', "rb") as openfile:
            dict_object = pickle.load(openfile)
        self.x = dict_object['x']
        self.y = dict_object['y']  # some logic here
        # ...
        # Some logic here...

    def __getitem__(self, idx):
        # Similar to C++ std::upper_bound - O(log n)
        temp = bisect.bisect_right(self.prefix_sum_idx, idx)
        self.read_pickle_file(temp)
        # Offset of idx inside the file found above (0 for the first file).
        local_idx = idx if temp == 0 else idx - self.prefix_sum_idx[temp - 1]
        return self.x[local_idx], self.y[local_idx]

    def __len__(self):
        return self.dataset_length


large_dataset = dataset()
train_size = int(0.8 * len(large_dataset))
validation_size = len(large_dataset) - train_size
train_dataset, validation_dataset = torch.utils.data.random_split(large_dataset, [train_size, validation_size])
validation_loader = DataLoader(validation_dataset, batch_size=64, num_workers=4, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=64, num_workers=4, shuffle=False)
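As a quick sanity check (purely illustrative; the batch contents depend on what the pickle files actually hold), one batch can be pulled from each loader:

x_batch, y_batch = next(iter(train_loader))
print(type(x_batch), len(x_batch))

x_val, y_val = next(iter(validation_loader))
print(type(x_val), len(x_val))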

Simple prediction from frozen .pb saved model

I have been trying for days to use a TF-exported .pb model file for prediction. The model was generated with a BestExporter as follows:
features_specs = tf.feature_column.make_parse_example_spec(serving_features)
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
    feature_spec=features_specs, default_batch_size=None)
exporter[n] = tf.estimator.BestExporter(
    name="best_exporter",
    serving_input_receiver_fn=serving_input_receiver_fn,
    event_file_pattern='eval/*.tfevents.*',
    exports_to_keep=1)

if train_params["use_early_stop"] == True:
    hookModel[n] = tf.estimator.experimental.stop_if_no_decrease_hook(
        model[n],
        metric_name='average_loss',
        max_steps_without_decrease=train_params["early_stop_max_steps_without_decrease"],
        min_steps=train_params["early_stop_min_steps"],
        run_every_secs=train_params["early_stop_run_every_secs"],
        run_every_steps=train_params["early_stop_run_every_steps"])
else:
    hookModel[n] = None

train_spec[n] = tf.estimator.TrainSpec(input_fn=input_fn_["train" + m], hooks=[hookModel[n]])
eval_spec[n] = tf.estimator.EvalSpec(
    input_fn=input_fn_["test" + m],
    start_delay_secs=train_params["eval_specs_start_delay_secs"],
    throttle_secs=train_params["eval_specs_throttle_secs"],
    exporters=[exporter[n]])

tf.estimator.train_and_evaluate(model[n], train_spec[n], eval_spec[n])
I think this is how the input dict names are referenced...
I successfully load the model with:
model_[model_stage+"_"+model_type] = tf.saved_model.load(model_path)
but I don't know how to correctly pass my features dictionary to the wrapped model_XX['prediction'](example) function.
I saw this topic, but it didn't help: TensorFlow v2: Replacement for tf.contrib.predictor.from_saved_model
There's no equivalent of the old tf.contrib.predictor.from_saved_model I used before...
Thanks for any answer.
I found the solution for passing a dict to the wrapped model. This is a slightly modified synthesis of the solutions given here, with modifications for TF 2.4 / Python 3.7:
TensorFlow v2: Replacement for tf.contrib.predictor.from_saved_model
https://www.programcreek.com/python/example/90440/tensorflow.Example
The second is particularly complete and shows a lot of cases.
So:
my_dict = {"feature_1": str(something), "feature_2": int(an_int), "feature_3": float(a_float), ...}

# Load the model
my_model = tf.saved_model.load(model_path)

# Creates a serialized tf.train.Example from a dict
def create_serialized_example(name_to_values):
    example = tf.train.Example()
    for name, values in name_to_values.items():
        feature = example.features.feature[name]
        if isinstance(values, str):
            values = values.encode()  # Modified: in new TF versions strings have to be encoded
            add = feature.bytes_list.value.extend
        elif isinstance(values, float):
            add = feature.float_list.value.extend  # Modified: float_list instead of float_32 in TF 2
        elif isinstance(values, int):
            add = feature.int64_list.value.extend
        else:
            raise AssertionError('Unsupported type: %s' % type(values))
        add([values])  # Modified: has to be a list, not a bare value
    return example.SerializeToString()

# Predict function
pred = my_model.signatures["predict"](examples=tf.constant([create_serialized_example(my_dict)]))
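In TF 2, calling a signature like this returns a dict of output tensors. A hedged example of inspecting the result (the output keys depend on the exported estimator head, so the names are not guaranteed):

# pred maps output names (e.g. 'probabilities', 'classes', ...) to tensors
for key, tensor in pred.items():
    print(key, tensor.numpy())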

How do I load a pre-batched dataset in PyTorch

I have a huge dataset that cannot be stored in memory, so I pre-batched it into several files. How do I write my Dataset and DataLoader classes so that they load one batch at a time?
All the files have the same base name plus a unique batch number. An example file would be called o3_batch_1.hdf5 or o3_batch_2.hdf5; the largest batch number is o3_batch_102.hdf5.
Here is what I have tried so far. Would it work?
length would be the total length of the data.
batchNum would be the number at the end of the file name.
base is the common name shared by the files.
class Data(Dataset):
    # Constructor
    def __init__(self, base, batchNum, length):
        name = base + str(batchNum)
        with h5py.File(name, "r") as f:
            puzz = np.array(f.get('puzzle'))
            sol = np.array(f.get('Sol'))
            self.puzz = torch.from_numpy(puzz)
            self.sol = torch.from_numpy(sol)
        self.len = length

    # Getter
    def __getitem__(self, batchNum, index):
        return self.puzz[index], self.sol[index]

    # Get length
    def __len__(self):
        return self.len
I think you can iterate over an index array and get your data through that iteration.
Suppose your files are organized in the following manner:
/yourFileDir
    o3_batch_1.hdf5
    o3_batch_2.hdf5
    ...
    o3_batch_102.hdf5
And your batch index runs over 1, 2, ..., 102:
h5_dir = '/yourFileDir'
for index in range(1, 103):
    with h5py.File(f'{h5_dir}/o3_batch_{index}.hdf5', 'r') as f:
        puzz = np.array(f['puzzle'])
        sol = np.array(f['Sol'])  # this depends on how you save your data
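If the goal is still a Dataset/DataLoader that loads one batch file at a time, a minimal sketch (assuming each o3_batch_N.hdf5 holds one batch with 'puzzle' and 'Sol' datasets, as in the question) could treat each file as one item and open it lazily in __getitem__:

import h5py
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class PreBatchedData(Dataset):
    """Each item is one pre-batched file, read from disk only when requested."""

    def __init__(self, h5_dir, num_batches):
        self.h5_dir = h5_dir
        self.num_batches = num_batches          # e.g. 102 for o3_batch_1 .. o3_batch_102

    def __len__(self):
        return self.num_batches

    def __getitem__(self, idx):
        # Files are numbered from 1, so shift the zero-based index.
        path = f'{self.h5_dir}/o3_batch_{idx + 1}.hdf5'
        with h5py.File(path, 'r') as f:
            puzz = torch.from_numpy(np.array(f['puzzle']))
            sol = torch.from_numpy(np.array(f['Sol']))
        return puzz, sol

# batch_size=None hands each pre-batched file through unchanged instead of re-collating it.
loader = DataLoader(PreBatchedData('/yourFileDir', 102), batch_size=None, shuffle=True)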

Split dataset based on file names in pytorch Dataset

Is there a way to divide the dataset into training and testing based on the filenames? I have a folder containing two folders: Input and Output. The Input folder has the images and the Output folder has the labels for those images. The file names in the Input folder are something like input01_train.png and input01_test.png, as shown below.
            Dataset
           /       \
       Input        Output
         |            |
input01_train.png   output01_train.png
        .                 .
        .                 .
input01_test.png     output01_test.png
The code I have only divides the dataset into inputs and labels, not test and train.
class CancerDataset(Dataset):
    def __init__(self, dataset_folder):  # , label_folder):
        self.dataset_folder = torchvision.datasets.ImageFolder(
            dataset_folder, transform=transforms.Compose([transforms.Resize(512), transforms.ToTensor()]))
        self.label_folder = torchvision.datasets.ImageFolder(
            dataset_folder, transform=transforms.Compose([transforms.Resize(512), transforms.ToTensor()]))

    def __getitem__(self, index):
        img = self.dataset_folder[index]
        label = self.label_folder[index]
        return img, label

    def __len__(self):
        return len(self.dataset_folder)


trainset = CancerDataset(dataset_folder='/content/drive/My Drive/cancer_data/')
trainsetloader = DataLoader(trainset, batch_size=1, shuffle=True, num_workers=0, pin_memory=True)
I would like to be able to divide the train and test sets by their names, if that is possible.
You could load the images yourself in __getitem__, selecting only those that contain '_train.png' or '_test.png'.
import os
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

class CancerDataset(Dataset):
    def __init__(self, datafolder, datatype='train',
                 transform=transforms.Compose([transforms.Resize(512), transforms.ToTensor()])):
        self.datafolder = datafolder
        self.image_files_list = [s for s in os.listdir(datafolder)
                                 if '_%s.png' % datatype in s]
        # Same for the labels files
        self.label_files_list = ...
        self.transform = transform

    def __len__(self):
        return len(self.image_files_list)

    def __getitem__(self, idx):
        img_name = os.path.join(self.datafolder, self.image_files_list[idx])
        image = Image.open(img_name)
        image = self.transform(image)
        # Same for the labels files
        label = ...  # Load in etc.
        label = self.transform(label)
        return image, label
Now you could make two datasets (trainset and testset).
trainset = CancerDataset(datafolder='/content/drive/My Drive/cancer_data/', datatype='train')
testset = CancerDataset(datafolder='/content/drive/My Drive/cancer_data/', datatype='test')
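From there, each split can be wrapped in its own DataLoader, mirroring the question's trainsetloader (the batch size and worker settings below are just the question's values):

from torch.utils.data import DataLoader

train_loader = DataLoader(trainset, batch_size=1, shuffle=True, num_workers=0, pin_memory=True)
test_loader = DataLoader(testset, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)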
