How to implement a custom Pytorch Dataset with map() method - pytorch

Many of the Pytorch examples use the Dataset map() method. For example:
https://huggingface.co/voidful/wav2vec2-large-xlsr-53-tw-gpt
ds = load_dataset("common_voice", 'zh-TW', split="test")
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
def map_to_array(batch):
audio = batch["audio"]
batch["speech"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
batch["sampling_rate"] = audio["sampling_rate"]
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")
return batch
ds = ds.map(map_to_array)
def map_to_pred(batch):
features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], padding=True, return_tensors="pt")
input_values = features.input_values.to(device)
attention_mask = features.attention_mask.to(device)
with torch.no_grad():
logits = model(input_values, attention_mask=attention_mask).logits
pred_ids = torch.argmax(logits, dim=-1)
batch["predicted"] = processor.batch_decode(pred_ids)
batch["target"] = batch["sentence"]
return batch
result = ds.map(map_to_pred, batched=True, batch_size=3, remove_columns=list(ds.features.keys()))
However, implementing a custom Map style dataset only requires __len__() and __getitem__()
What is the correct way to convert a custom Dataset into one with all the useful methods needed by the examples?

Related

How to apply a function to convert the paths to arrays using cv2 in tensorflow data pipeline?

Any help will be highly appreciated
I'm trying to load two lists containing image paths and their corresponding labels. Something like this:
p0 = ['a','b',....] #paths to images .tif format
p1 = [1,2,3,......] #paths to images .tif format
labels = [0,1,1,...] #corresponding labels w.r.t both the lists
I used a tf.data in the following way:
def TFData(p_0, p_1, batch_size, labels=None, is_train=True):
dset = tf.data.Dataset.from_tensor_slices((p_0,p_1))
if labels is not None:
label = tf.data.Dataset.from_tensor_slices(labels)
AUTO = tf.data.experimental.AUTOTUNE
final_dset = tf.data.Dataset.zip((dset, label))
final_dset = final_dset.batch(batch_size, drop_remainder=is_train).prefetch(AUTO)
return final_dset
This returns:
<PrefetchDataset shapes: (((64,), (64,)), (64,)), types: ((tf.string, tf.string), tf.int32)>
My question is how to apply a function to convert the paths to arrays using cv2 as the images are .tif files? such that the result will be:
<PrefetchDataset shapes: (((64,256,256,3), (64,256,256,3)), (64,)), types: ((tf.float64, tf.float64), tf.int32)>
I'm using a dataset.map. However it's throwing error:
def to_array(p_0):
im_1 = cv2.imread(p_0,1)
#im = tfio.experimental.image.decode_tiff(paths)
im_1 = cv2.resize(im_1,(img_w,img_h)) #img_w=img_h=256
im_1 = np.asarray(im_1, dtype=np.float64)
im_1 /= 255
return im_1
def parse_fn(p_0):
[p_0,] = tf.py_function(to_array, [p_0], [tf.float64])
return p_0
def TFData(p_0, p_1, batch_size, labels=None, is_train=True):
dset_1 = tf.data.Dataset.from_tensor_slices(p_0)
dset_1 = dset_1.map(parse_fn)
dset_2 = tf.data.Dataset.from_tensor_slices(p_1)
dset_2 = dset_2.map(parse_fn)
if labels is not None:
label = tf.data.Dataset.from_tensor_slices(labels)
AUTO = tf.data.experimental.AUTOTUNE
final_dset = tf.data.Dataset.zip((dset_1, dset_2, label))
final_dset = final_dset.batch(batch_size, drop_remainder=is_train).prefetch(AUTO)
return final_dset
print(train_data) #where train_data is defined as TFData()
<PrefetchDataset shapes: ((<unknown>, <unknown>), (64,)), types: ((tf.float64, tf.float64), tf.int32)>
This throws an error:
for (t,p),l in train_data.as_numpy_iterator():
print(t)
print(p)
print(l)
print(type(t))
break
SystemError: <built-in function imread> returned NULL without setting an error
[[{{node EagerPyFunc}}]] [Op:IteratorGetNext]
Any help will be highly appreciated
I think your problem is in cv2.imread.
Have you checked outside the functions to see if it is reading and plotting the data accordingly?
Please, try with -1 instead:
im_1 = cv2.imread(p_0,-1)

getting TypeError: Expected int32, got None of type 'NoneType' instead

I have implemented sequence to sequence model with attention layer if I 300000 data points I'm not getting any error if I use all of my data points I'm getting following error model.fit
TypeError: Expected int32, got None of type 'NoneType' instead.
what would be the reason for this?
the code before model.fit is
class encoder_decoder(tf.keras.Model):
def __init__(self,embedding_size,encoder_inputs_length,output_length,vocab_size,output_vocab_size,score_fun,units):
super(encoder_decoder,self).__init__()
self.vocab_size = vocab_size
self.enc_units = units
self.embedding_size = embedding_size
self.encoder_inputs_length = encoder_inputs_length
self.output_length = output_length
self.lstm_output = 0
self.state_h = 0
self.state_c = 0
self.output_vocab_size = output_vocab_size
self.dec_units = units
self.score_fun = score_fun
self.att_units = units
self.encoder=Encoder(self.vocab_size,self.embedding_size,self.enc_units,self.encoder_inputs_length)
self.decoder = Decoder(self.output_vocab_size, self.embedding_size, self.output_length, self.dec_units ,self.score_fun ,self.att_units)
# self.dense = Dense(self.output_vocab_size,activation = "softmax")
def call(self,data):
input,output = data[0],data[1]
encoder_hidden = self.encoder.initialize_states(input.shape[0])
encoder_output,encoder_hidden,encoder_cell = self.encoder(input,encoder_hidden)
decoder_hidden = encoder_hidden
decoder_cell =encoder_cell
decoder_output = self.decoder(output,encoder_output,decoder_hidden,decoder_cell)
return decoder_output
Inside the call function I'm initializing states for the encoder where I'm getting
the number of rows from input using the following line of code
encoder_hidden = self.encoder.initialize_states(input.shape[0])
If I print input, I'm getting shape as (None,55)
That's the reason I'm getting this error.
Here my total number data points is 330614 when I use all my data I getting this
error, when I use only 330000 data points I'm getting this error,
if I print batch inside def method I'm getting shape as (64,55)
Please find my below code for creating dataset for my sequence to sequence model
the function to reprocess the data and the function to create the dataset
and a function the load the dataset
def preprocess_sentence(w):
# w = unicode_to_ascii(w.lower().strip())
w = re.sub(r"([?.!,¿])", r" \1 ", w)
w = re.sub(r'[" "]+', " ", w)
w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
w = w.strip()
w = '<start> ' + w + ' <end>'
return w
def create_dataset(path, num_examples):
lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
# lines1 = lines[330000:]
# lines = lines[0:323386]+lines1
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines[:num_examples]]
word_pairs = [[i[0],i[1]] for i in word_pairs]
return zip(*word_pairs)
def tokenize(lang):
lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(
filters='')
lang_tokenizer.fit_on_texts(lang)
tensor = lang_tokenizer.texts_to_sequences(lang)
tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
return tensor, lang_tokenizer
def load_dataset(path, num_examples=None):
# creating cleaned input, output pairs
targ_lang, inp_lang = create_dataset(path, num_examples)
input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer,targ_lang,inp_lang
# Try experimenting with the size of that dataset
num_examples = None
input_tensor, target_tensor, inp_lang, targ_lang,targ_lang_text,inp_lang_text = load_dataset(path, num_examples)
# Calculate max_length of the target tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]
max_length_targ,max_length_inp
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
the shape of datasets as follows
shape of input train (269291, 55)
shape of target train (269291, 53)
shape of input test (67323, 55)
shape of target test (67323, 53)
You can share the code block before the model.fit.
NoneType error is indicating that the final array which is passed to the model is for some reason empty. You can add print statements at previous steps to understand where along the way your array became empty.
Compare the scenario to the case where you are taking all your data points so that you can understand where the array is changing and how it is handled prior to passing it through model.fit.

Select multiple target variables in keras tensorflow

I am trying to follow a kaggle kernel for BERT implementation :
https://www.kaggle.com/hiromoon166/save-bert-fine-tuning-model
But i am not able to select target variables. I have to select multiple target variables as my y variable as it is a multi-label classification.
This is the line of code i am stuck on:
train_lines, train_labels = train_df['comment_text'].values, train_df.target.values
def convert_lines(example, max_seq_length,tokenizer):
max_seq_length -=2
all_tokens = []
longer = 0
for i in range(example.shape[0]):
tokens_a = tokenizer.tokenize(example[i])
if len(tokens_a)>max_seq_length:
tokens_a = tokens_a[:max_seq_length]
longer += 1
one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
all_tokens.append(one_token)
print(longer)
return np.array(all_tokens)
nb_epochs=1
bsz = 32
dict_path = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
tokenizer = tokenization.FullTokenizer(vocab_file=dict_path, do_lower_case=True)
print('build tokenizer done')
train_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
train_df = train_df.sample(frac=0.01,random_state = 42)
#train_df['comment_text'] = train_df['comment_text'].replace({r'\s+$': '', r'^\s+': ''}, regex=True).replace(r'\n', ' ', regex=True)
train_lines, train_labels = train_df['comment_text'].values, train_df.target.values
print('sample used',train_lines.shape)
token_input = convert_lines(train_lines,maxlen,tokenizer)
seg_input = np.zeros((token_input.shape[0],maxlen))
mask_input = np.ones((token_input.shape[0],maxlen))
print(token_input.shape)
print(seg_input.shape)
print(mask_input.shape)
print('begin training')
model3.fit([token_input, seg_input, mask_input],train_labels,batch_size=bsz,epochs=nb_epochs)
Please help me understand how to select target variables here?

How to implement TensorBoard v2 (tf.contrib.summary) with while_loop?

I want to log the loss and accuracy value evaluated inside a nested body function of a while_loop during training. This is the structure: I have a class, a method of this class builds the graph using a while_loop (build_graph()). Another method calls build_graph() and then runs the session. It works. Or it seems to work. However, I would like to use TensorBoard to check if loss and accuracy are actually improving, but I'm not able to summary those tensors. I've tried to define a tf.contrib.summary.create_file_writer('summary') and a graph and pass them to build_graph() as parameters, so that the body function can see them. I have checked the list during graph execution coming from tf.contrib.summary.all_summary_ops() and it isn't empty. However, when I open TensorBoard I get "No dashboards are active for the current data set.". Neither the graph. I am aware that tf.summary does not work in while_loop but it seems that tf.contrib.summary works.
Here is a working example
import tensorflow as tf
import sys
import datamanagement
class myNet:
def __init__(self):
self.varlist = ["x", "y"]
self.data = []
self.hsize = [10, 10]
self.batch_size = 10
self.tr_mainsteps = 1000
self.learnrate = 0.001
self.sourcedatafile = "XYfit.csv" # source file
# Dataset parameters
self.seq_params = {'dim': len(self.varlist),
'batch_size': self.batch_size,
'shuffle': True,
'filepath': self.sourcedatafile}
# Dataset from CSV file
self.dataset = datamanagement.CSVDataSet(**self.seq_params).finaldataset
# Iterator on the CSV file
self.dataiterator = self.dataset.make_initializable_iterator()
# Optimizer
self.optim = tf.train.RMSPropOptimizer(learning_rate=self.learnrate)
# Official creation of the graph
self.graph = tf.get_default_graph()
with self.graph.as_default():
# Writer creation
self.writer = tf.contrib.summary.create_file_writer('./summary')
with self.writer.as_default():
tf.contrib.summary.always_record_summaries()
def mymodel(self, Zinp, reuse=False):
# This function builds the graph of the network
with tf.variable_scope("mymod/net", reuse=reuse):
h1 = tf.layers.dense(Zinp, self.hsize[0], activation=tf.nn.leaky_relu, name='h1')
h2 = tf.layers.dense(h1, self.hsize[1], activation=tf.nn.leaky_relu, name='h2')
out = tf.layers.dense(h2, len(self.varlist), activation=None, name='final') # None means linear activation
return out
def _trainepoch(self, ind):
with self.writer.as_default():
# Real data tensor from CSV file
self.realdata = self.dataiterator.get_next()
# random input vector
self.Znoise = tf.random_uniform([self.batch_size, len(self.varlist)], minval=-1., maxval=1.)
# Model and output tensor
self.output = self.mymodel(self.Znoise, reuse=tf.AUTO_REUSE)
# Loss
self.loss = tf.losses.mean_squared_error(self.realdata, self.output)
tf.contrib.summary.scalar("loss", self.loss)
#Trainable variables
t_vars = tf.trainable_variables()
# Evaluation of the weight gradients
grad = self.optim.compute_gradients(self.loss, var_list=t_vars)
# Update weights based on gradients
return self.optim.apply_gradients(grad), tf.contrib.summary.all_summary_ops()
def _train_buildgraph(self):
def body(ind, ops):
train_up, ops = self._trainepoch(ind)
# Ensure that the update is applied before continuing.
with tf.control_dependencies([train_up]):
ind = ind + 1
return ind, ops
def cond(ind, ops):
return ind < self.tr_mainsteps
return tf.while_loop(cond, body, [tf.constant(0), [tf.Variable(False)]])
def config_run(self, trepoch=50, testNet=False):
self.tr_mainsteps = trepoch # Number of adversarial training epoch
with self.graph.as_default():
with self.writer.as_default():
tr_loop, summary_ops = self._train_buildgraph()
# Graph execution
with self.graph.as_default():
with self.writer.as_default():
with tf.Session() as sess:
sess.run(tf.initializers.global_variables())
sess.run(self.dataiterator.initializer)
tf.contrib.summary.initialize(
graph=tf.get_default_graph()
)
sess.run([summary_ops, tr_loop, summary_ops])
def main(argv):
hmodel = myNet()
hmodel.config_run()
if __name__ == "__main__":
main(sys.argv[1:])
Can someone help me?

Split dataset based on file names in pytorch Dataset

Is there a way to divide the dataset into training and testing based on the filenames. I have a folder containing two folders: input and output. Input folder has the images and output are the labels for that image. The file names in the input folder are something like input01_train.png and input01_test.png like shown below.
Dataset
/ \
Input Output
| |
input01_train.png output01_train.png
. .
. .
input01_test.png output01_test.png
The code I have only divides the dataset into inputs and labels not test and train.
class CancerDataset(Dataset):
def __init__(self, dataset_folder):#,label_folder):
self.dataset_folder = torchvision.datasets.ImageFolder(dataset_folder ,transform = transforms.Compose([transforms.Resize(512),transforms.ToTensor()]))
self.label_folder = torchvision.datasets.ImageFolder(dataset_folder ,transform = transforms.Compose([transforms.Resize(512),transforms.ToTensor()]))
def __getitem__(self,index):
img = self.dataset_folder[index]
label = self.label_folder[index]
return img,label
def __len__(self):
return len(self.dataset_folder)
trainset = CancerDataset(dataset_folder = '/content/drive/My Drive/cancer_data/')
trainsetloader = DataLoader(trainset,batch_size = 1, shuffle = True,num_workers = 0,pin_memory = True)
I would like to be able to divide the train and test set by their names if that is possible .
You could load the images yourself in __getitem__, selecting only those that contain '_train.png' or '_test.png'.
class CancerDataset(Dataset):
def __init__(self, datafolder, datatype='train', transform = transforms.Compose([transforms.Resize(512),transforms.ToTensor()]):
self.datafolder = datafolder
self.image_files_list = [s for s in os.listdir(datafolder) if
'_%s.png' % datatype in s]
# Same for the labels files
self.label_files_list = ...
self.transform = transform
def __len__(self):
return len(self.image_files_list)
def __getitem__(self, idx):
img_name = os.path.join(self.datafolder,
self.image_files_list[idx])
image = Image.open(img_name)
image = self.transform(image)
# Same for the labels files
label = .... # Load in etc
label = self.transform(label)
return image, label
Now you could make two datasets (trainset and testset).
trainset = CancerDataset(dataset_folder = '/content/drive/My Drive/cancer_data/', datatype='train')
testset = CancerDataset(dataset_folder = '/content/drive/My Drive/cancer_data/', datatype='test')

Resources