tf.GradientTape gradient() returns None - keras

I am trying to train my keras model using TensorFlow, so far I can build the model,
def Model(input_shape, num_of_layers):
num_of_layers = 5
mod = keras.models.Sequential()
mod.add(keras.layers.Dense(1, input_shape = (input_shape,)))
for i in range(num_of_layers - 1):
mod.add(keras.layers.Dense(16, activation = 'tanh'))
mod.add(keras.layers.Dense(1, activation = 'tanh'))
return mod
and loss function.
def loss(u_pred, u_true):
return tf.reduce_mean(tf.keras.losses.mean_squared_error(u_pred, u_true))
Then I create a train function to train the model.
def train(model, X, epoch = 500, lr = 1e-3):
trainable_params = [tf.Variable(model.get_weights()[i]) for i in range(len(model.get_weights()))]
loss_array = []
optim = tf.keras.optimizers.Adam(learning_rate = lr)
for i in range(epoch):
with tf.GradientTape() as g:
g.watch(trainable_params)
loss_val = loss(model(X), tf.zeros_like(X))
grad = g.gradient(loss_val, trainable_params)
...
The grad returns a vector of None when I print it. What went wrong with my train function? I have converted my model's weights and biases to tensor object using tf.Variable. Using tf.cast or tf.convert_to_tensor doesn't help either.

Related

Different training result obtained from training simple LSTM in Keras and Pytorch

I’m trying to implement my LSTM model from Keras to Pytorch, but the results in Pytorch seem really bad at the moment. The network is really simple as below.
model = Sequential()
model.add(LSTM(10, input_length=shape[1], input_dim=shape[2]))
# output shape: (1, 1)
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(1,activation="linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()
And I migrate it to the Pytorch framework,
class LSTM(nn.Module):
def __init__(self, input_dim, hidden_dim, num_layers, output_dim,bilstm=False):
super(LSTM, self).__init__()
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.isBi = bilstm
self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True,bidirectional=bilstm).double()
# for name, param in self.lstm.named_parameters():
# if name.startswith("weight"):
# nn.init.orthogonal_(param)
# else:
# pass
self.fc1 = nn.Sequential(nn.Linear(hidden_dim, 10).double(),nn.Tanh())
self.final_layer1 = nn.Sequential(nn.Linear(10,10).double(),nn.Tanh())
self.final_layer2 = nn.Sequential(nn.Linear(10,10).double(),nn.Tanh())
self.final_layer3 = nn.Sequential(nn.Linear(10,10).double(),nn.Tanh())
self.final_layer4 = nn.Sequential(nn.Linear(10,output_dim).double())
def forward(self, x):
out, (hn, cn) = self.lstm(x)
out = out[:, -1, :]
out = self.fc1(out)
out = self.final_layer1(out)
out = self.final_layer2(out)
out = self.final_layer3(out)
out = self.final_layer4(out)
return out
The result is really bad. I was wondering if the initializing methods/activation functions used in Keras are different from the one I used in Pytorch(Keras seems to be using hard_sigmoid where Pytorch uses sigmoid?).
Would really appreciate it if somebody could help me with this problem!
UPDATED
My training code in Pytorch.
criterion = nn.MSELoss()
model = LSTM(input_dim,hidden_dim,num_layers,output_dim,bilstm)
model = model.cuda()
optimizer = optim.Adam(model.parameters(),lr=0.001)
for epoch in range(1,epoch_number+1):
model.train()
iteration = 0
for i,data in enumerate(train_loader):
dat, label = data
dat = dat.double()
label = label.double()
if torch.cuda.is_available():
dat = dat.cuda()
label = label.cuda()
else:
dat = Variable(dat)
label = Variable(label)
out = model(dat)
optimizer.zero_grad()
loss = criterion(out, label)
loss.backward()
optimizer.step()

I get something wrong when use model.train() and model.eval() on pytorch

I have prepare features and their labels as blow; I want to build a model which is constructed by transformers' encoder and then add a linear layer to predict a value. but I got some error when I use the model to predict after its training.
At first I run below code:
import torch
from torch import nn
features = torch.rand(bach_size, channels, lenght)
labels = torch.rand(batch_size)
class TransformerModel(nn.Module):
def __init__(self):
super(TransformerModel, self).__init__()
encoder_layer = nn.TransformerEncoderLayer(d_model=8, nhead=8, dropout=0.5)
self.transformer_encoder = nn.TransformerEncoder(encoder_layer, 6)
self.decoder = nn.Linear(40, 1)
def forward(self, src):
encoded = self.transformer_encoder(src.transpose(1, 0)).transpose(1, 0)
pred = self.decoder(encoded.reshape(encoded.shape[0], -1))
return pred
model = TransformerModel()
criterion = nn.MSELoss()
lr = 0.3 # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
def train():
model.train() # Turn on the train mode
optimizer.zero_grad()
output = model(features)
loss = criterion(output.view(-1, 1), labels.view(-1, 1))
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
optimizer.step()
return loss.item()
for _ in range(100):
train()
After that, I predict features by the below codes:
model.eval()
output = model(features)
I get all values of 'output' are the same, and if use 'model.train()', the 'output' seems Ok; so what is the problem? or the model was built wrong?

How to fix 'No gradients provided for any variable' error when using ctc_loss in Tensorflow

I am trying to make Baidu's Deep Speech 2 model in Tensorflow 2.0.0alpha0. I am having trouble optimizing the Tensorflow ctc_loss using a tf.GradientTape() object for calculating the gradients.
I am currently passing a tensor of shape (batch_size, max_step, feats) to my model and then passing the computed logits to the loss function. I have also tried passing a sparse tensor but this also does not work.
Here is the code for creating my model
import tensorflow as tf
class DeepSpeech2(tf.keras.Model):
def __init__(self, vocab_size, conv_filters=[11], conv_kernel_sizes=[1280], conv_strides=[2],
recur_sizes=[100], rnn_type='gru', bidirect_rnn=False, batch_norm=True,
learning_rate=1e-3, name='DeepSpeech2'):
super(DeepSpeech2, self).__init__()
self._vocab_size = vocab_size
self._conv_filters = conv_filters
self._conv_kernel_sizes = conv_kernel_sizes
self._conv_strides = conv_strides
self._recur_sizes = recur_sizes
self._rnn_type = rnn_type
self._bidirect_rnn = bidirect_rnn
self._batch_norm = batch_norm
self._learning_rate = learning_rate
self._name = name
self._conv_batch_norm = None
with tf.name_scope(self._name):
self._convolution = [tf.keras.layers.Conv1D(filters=conv_filters[i],
kernel_size=conv_kernel_sizes[i], strides=conv_strides[i],
padding='valid', activation='relu',
name='conv1d_{}'.format(i)) for i in range(len(self._conv_filters))]
if self._batch_norm:
self._conv_batch_norm = tf.keras.layers.BatchNormalization(name='bn_conv_1d')
if self._rnn_type == 'gru':
rnn_init = tf.keras.layers.GRU
elif self._rnn_type == 'lstm':
rnn_init = tf.keras.layers.LSTM
else:
raise Exception("Invalid rnn_type: '{}' (must be 'lstm' or 'gru')"
.format(self._rnn_type))
self._rnn = []
for i, r in enumerate(self._recur_sizes):
layer = rnn_init(r, activation='relu', return_sequences=True,
name='{}_{}'.format(self._rnn_type, i))
if self._bidirect_rnn:
layer = tf.keras.layers.Bidirectional(layer)
self._rnn.append(layer)
if self._batch_norm:
self._rnn.append(tf.keras.layers.BatchNormalization())
self._fc = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(
self._vocab_size, name='fc', activation='linear'))
self._optimizer = tf.keras.optimizers.Adam(lr=self._learning_rate)
def __call__(self, specs):
with tf.name_scope(self._name):
feats = specs
for layer in self._convolution:
feats = layer(feats)
if self._conv_batch_norm:
feats = self._conv_batch_norm(feats)
rnn_outputs = feats
for layer in self._rnn:
rnn_outputs = layer(rnn_outputs)
outputs = self._fc(rnn_outputs)
return tf.transpose(outputs, (1, 0, 2))
#tf.function
def train_step(self, specs, spec_lengths, labels, label_lengths):
with tf.GradientTape() as tape:
logits = self.__call__(specs)
loss = tf.nn.ctc_loss(labels=labels, logits=logits,
label_length=label_lengths, logit_length=spec_lengths)
cost = tf.reduce_sum(loss)
decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(logits, label_lengths)
gradients = tape.gradient(cost, self.trainable_variables)
self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return (decoded[0].indices, decoded[0].values, decoded[0].dense_shape), cost
I am currently getting the following error
ValueError: No gradients provided for any variable: ['DeepSpeech2/conv1d_0/kernel:0', 'DeepSpeech2/conv1d_0/bias:0', 'DeepSpeech2/bn_conv_1d/gamma:0', 'DeepSpeech2/bn_conv_1d/beta:0', 'DeepSpeech2/gru_0/kernel:0', 'DeepSpeech2/gru_0/recurrent_kernel:0', 'DeepSpeech2/gru_0/bias:0', 'DeepSpeech2/batch_normalization_v2/gamma:0', 'DeepSpeech2/batch_normalization_v2/beta:0', 'DeepSpeech2/time_distributed/kernel:0', 'DeepSpeech2/time_distributed/bias:0'].
The error occurs at the line where the gradients are applied to the optimizer. When I print out my gradients variable, it is just a list of None
From what I understand, this error is indicating that there is no path from the variables to the loss in the graph but I'm not sure why I am getting this. Any help would be greatly appreciated!

Multiple parameterized metrics in Keras

Let's say I have a custom parameterized metric:
def my_metric_at_k(k):
def my_metric(y_true, y_pred):
'''
Do magic here with k
to get metric_value
'''
return metric_value
return my_metric
I want to evaluate my model with my metric on multiple values of k, say k=1 and k=2, so naturally, I'm inclined to feed [my_metric_at_k(1), my_metric_at_k(2)] into metrics input of the compile method.
When I go to fit my model, I run into this error
NotFoundError: FetchOutputs node metrics/my_metric_1/...: not found
I'm using Keras with a TensorFlow backend.
For a particular example, one metric I use is the following:
def variety_first_k(k):
def variety(y_true, y_pred):
y_pred = K.tf.cast(y_pred, dtype=K.tf.float64)
sort = K.tf.nn.top_k(y_pred, k=k, sorted=True).indices
unique, _ = K.tf.unique(K.flatten(sort))
num_unique = K.shape(unique)[0]
return K.tf.cast(num_unique, dtype=K.tf.float64) /
K.tf.cast(K.tf.shape(y_pred)[1], dtype=K.tf.float64)
return variety
I'm running a multi-label multi-class problem.
A simplified version of my model is:
metrics = [variety_first_k(1), variety_first_k(2)]
inputs = layers.Input(shape=(my_data.shape[1],))
merged_layer = layers.BatchNormalization()(inputs)
merged_layer = layers.Dense(256, activation='relu',
kernel_regularizer=l1_l2(l1=0.5, l2=0.5))(merged_layer)
merged_layer = layers.BatchNormalization()(merged_layer)
predictions = layers.Dense(len(mlb.classes_))(merged_layer)
predictions = layers.Activation('sigmoid')(predictions)
model = keras.Model(inputs=inputs, outputs=predictions)
EPOCHS = 100
INIT_LR = 1e-2
BS = 128
opt = keras.optimizers.Adadelta(lr=INIT_LR, decay=INIT_LR / EPOCHS)
model.compile(loss=multilabel,
optimizer=opt,
metrics=metrics)
H = model.fit(x=my_data,
y=Y,
validation_split=0.02,
epochs=EPOCHS,
batch_size=BS)
The result gives me:
Train on 260563 samples, validate on 5318 samples
Epoch 1/100
...
...
NotFoundError: FetchOutputs node metrics/...: not found

Failing to train SkipGram word embedding in Pytorch

I am training the skipgram word embeddings using the famous model described in https://arxiv.org/abs/1310.4546. I want to train it in PyTorch but I am getting errors and I can't figure out where they are coming from. Below I have provided my model class, training loop, and batching method. Does anyone have any insight into whats going on?
I am getting an error on the output = loss(data, target) line. It is having a problem with <class 'torch.LongTensor'> which is weird because CrossEntropyLoss takes a long tensor. The output shape might be wrong which is: torch.Size([1000, 100, 1000]) after the feedforward.
I have my model defined as:
import torch
import torch.nn as nn
torch.manual_seed(1)
class SkipGram(nn.Module):
def __init__(self, vocab_size, embedding_dim):
super(SkipGram, self).__init__()
self.embeddings = nn.Embedding(vocab_size, embedding_dim)
self.hidden_layer = nn.Linear(embedding_dim, vocab_size)
# Loss needs to be input: (minibatch (N), C) target: (minibatch, 1), each label is a class
# Calculate loss in training
def forward(self, x):
embeds = self.embeddings(x)
x = self.hidden_layer(embeds)
return x
My training is defined as:
import torch.optim as optim
from torch.autograd import Variable
net = SkipGram(1000, 300)
optimizer = optim.SGD(net.parameters(), lr=0.01)
batch_size = 100
size = len(train_ints)
batches = batch_index_gen(batch_size, size)
inputs, targets = build_tensor_from_batch_index(batches[0], train_ints)
for i in range(100):
running_loss = 0.0
for batch_idx, batch in enumerate(batches):
data, target = build_tensor_from_batch_index(batch, train_ints)
# if (torch.cuda.is_available()):
# data, target = data.cuda(), target.cuda()
# net = net.cuda()
data, target = Variable(data), Variable(target)
optimizer.zero_grad()
output = net.forward(data)
loss = nn.CrossEntropyLoss()
output = loss(data, target)
output.backward()
optimizer.step()
running_loss += loss.data[0]
optimizer.step()
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
i, batch_idx * len(batch_size), len(size),
100. * (batch_idx * len(batch_size)) / len(size), loss.data[0]))
If useful my batching is:
def build_tensor_from_batch_index(index, train_ints):
minibatch = []
for i in range(index[0], index[1]):
input_arr = np.zeros( (1000,1), dtype=np.int )
target_arr = np.zeros( (1000,1), dtype=np.int )
input_index, target_index = train_ints[i]
input_arr[input_index] = 1
target_arr[input_index] = 1
input_tensor = torch.from_numpy(input_arr)
target_tensor = torch.from_numpy(target_arr)
minibatch.append( (input_tensor, target_tensor) )
# Concatenate all tensors into a minibatch
#x = [tensor[0] for tensor in minibatch]
#print(x)
input_minibatch = torch.cat([tensor[0] for tensor in minibatch], 1)
target_minibatch = torch.cat([tensor[1] for tensor in minibatch], 1)
#target_minibatch = minibatch[0][1]
return input_minibatch, target_minibatch
I'm not sure about that since I did not read the paper, but seems weird that you are computing the loss with the original data and the targets:
output = loss(data, target)
Considering that the output of the network is output = net.forward(data) I think you should compute your loss as:
error = loss(output, target)
If this doesn't help, briefly point me out what the paper says about the loss function.

Resources