I wonder: if I want to implement dropout myself, is something like the following sufficient (taken from Implementing dropout from scratch)?
import torch
import torch.nn as nn

class MyDropout(nn.Module):
    def __init__(self, p: float = 0.5):
        super(MyDropout, self).__init__()
        if p < 0 or p > 1:
            raise ValueError("dropout probability has to be between 0 and 1, "
                             "but got {}".format(p))
        self.p = p

    def forward(self, X):
        if self.training:
            # Sample a Bernoulli mask with keep probability 1 - p and rescale,
            # so the expected activation matches evaluation mode (inverted dropout).
            binomial = torch.distributions.binomial.Binomial(probs=1 - self.p)
            return X * binomial.sample(X.size()) * (1.0 / (1 - self.p))
        return X
My concern is that even if the unwanted weights are masked out (either this way or by using a mask tensor), gradients can still flow through the zeroed weights (https://discuss.pytorch.org/t/custom-connections-in-neural-network-layers/3027/9). Is my concern valid?
Dropout does not mask the weights - it masks the features.
For a linear layer computing y = <w, x>, the gradient of y w.r.t. the parameters w is x. Therefore, if you set entries of x to zero, there will be no update for the corresponding weights in the adjacent linear layer.
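A quick way to see this is a minimal standalone sketch (the shapes and loss below are arbitrary and not from the original question):

import torch
import torch.nn as nn

torch.manual_seed(0)
linear = nn.Linear(4, 3)
x = torch.randn(8, 4)

# Zero out feature 2 for every sample, as dropout would (ignoring the rescaling).
mask = torch.ones(4)
mask[2] = 0.0

out = linear(x * mask)
out.sum().backward()

# The weight column that multiplies the dropped feature receives zero gradient.
print(linear.weight.grad[:, 2])   # tensor([0., 0., 0.])
print(linear.weight.grad[:, 0])   # generally non-zero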
I'm trying to train a network in PyTorch along the lines of this idea.
The author creates a simple MLP (4 hidden layers) and then explicitly works out the partial derivatives of the output with respect to the inputs. He then trains the network on the training labels as well as on the gradients of the output w.r.t. the input data (which are also part of the training data).
To replicate the idea in PyTorch, my training loop looks like this:
import torch
import torch.nn.functional as F

class vanilla_net(torch.nn.Module):
    def __init__(self,
                 input_dim,      # dimension of inputs, e.g. 10
                 hidden_units,   # units in hidden layers, assumed constant, e.g. 20
                 hidden_layers): # number of hidden layers, e.g. 4
        super(vanilla_net, self).__init__()
        self.input = torch.nn.Linear(input_dim, hidden_units)
        self.hidden = torch.nn.ModuleList()
        for hl in range(hidden_layers):
            layer = torch.nn.Linear(hidden_units, hidden_units)
            self.hidden.append(layer)
        self.output = torch.nn.Linear(hidden_units, 1)

    def forward(self, x):
        x = self.input(x)
        x = F.softplus(x)
        for h in self.hidden:
            x = h(x)
            x = F.softplus(x)
        x = self.output(x)
        return x
....
def lossfn(x, y, dx, dy):
    # some loss function involving both sets of training data (y and dy)
    # the network outputs x and what's needed is an efficient way of calculating dx - the partial
    # derivatives of x wrt the batch inputs.
    pass
def train(net, x_train, y_train, dydx_train, batch_size=256):
    m, n = x_train.shape
    first = 0
    last = min(batch_size, m)
    while first < m:
        xi = x_train[first:last]
        yi = y_train[first:last]
        zi = dydx_train[first:last]
        xi.requires_grad_()

        # Perform forward pass
        outputs = net(xi)

        # First backward pass: gradients of the outputs w.r.t. the batch inputs
        minimizer.zero_grad()
        outputs.backward(torch.ones_like(outputs), create_graph=True)
        xi_grad = xi.grad

        # Compute loss
        loss = lossfn(outputs, yi, xi_grad, zi)
        minimizer.zero_grad()

        # Perform backward pass
        loss.backward()

        # Perform optimization
        minimizer.step()

        first = last
        last = min(first + batch_size, m)

net = vanilla_net(4, 10, 4)
minimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
...
This seems to work, but is there a more elegant/efficient way to achieve the same thing? Also, I'm not sure where the best place is to put the minimizer.zero_grad() calls.
Thanks
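For what it's worth, a common alternative pattern (just a sketch under the same setup as above, assuming lossfn returns a differentiable scalar) is to use torch.autograd.grad to obtain the input gradients directly; this avoids the first zero_grad()/backward() on the outputs and makes the placement of the single remaining zero_grad() unambiguous:

import torch

def train_step(net, minimizer, lossfn, xi, yi, zi):
    # xi, yi, zi: one mini-batch of inputs, labels and target input-gradients.
    xi = xi.detach().clone().requires_grad_(True)
    outputs = net(xi)

    # Gradients of the outputs w.r.t. the inputs, kept in the graph so that the
    # gradient-matching part of the loss can itself be backpropagated.
    xi_grad, = torch.autograd.grad(outputs, xi,
                                   grad_outputs=torch.ones_like(outputs),
                                   create_graph=True)

    loss = lossfn(outputs, yi, xi_grad, zi)
    minimizer.zero_grad()   # zero once, right before the backward pass for the parameters
    loss.backward()
    minimizer.step()
    return loss.detach()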
I am trying to reproduce a U-Net result on the Carvana dataset using Ternausnet in PyTorch with Lightning.
I am using DiceLoss with a sigmoid activation function. I think I am running into a vanishing-gradient issue, because all of the weight gradients are 0, and the minimum value of the network output is on the order of 10^8.
What could be the issue here? How can I address the vanishing gradient? Also, if I use a different criterion, the loss keeps decreasing into negative values without stopping (for BCE with logits, for instance).
Here is the code for my Dice loss:
import torch
import torch.nn as nn

class DiceLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, logits, targets, eps=0, threshold=None):
        # comment out if your model contains a sigmoid or
        # equivalent activation layer
        proba = torch.sigmoid(logits)
        proba = proba.view(proba.shape[0], 1, -1)
        targets = targets.view(targets.shape[0], 1, -1)
        if threshold:
            proba = (proba > threshold).float()
        # flatten label and prediction tensors
        intersection = torch.sum(proba * targets, dim=1)
        summation = torch.sum(proba, dim=1) + torch.sum(targets, dim=1)
        dice = (2.0 * intersection + eps) / (summation + eps)
        # print(intersection, summation, dice)
        return (1 - dice).mean()
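As a first diagnostic, here is a generic sketch (not tied to the Carvana/Ternausnet setup; the tiny conv layer stands in for the real network) that runs one forward/backward pass on random data and prints the gradient magnitudes. Note also that passing a threshold to this loss hard-binarizes proba, which blocks all gradients, so it should only be used for evaluation:

import torch
import torch.nn as nn

model = nn.Conv2d(3, 1, kernel_size=3, padding=1)   # stand-in for the real segmentation net
criterion = DiceLoss()

images = torch.randn(2, 3, 64, 64)
masks = torch.randint(0, 2, (2, 1, 64, 64)).float()

logits = model(images)
loss = criterion(logits, masks, eps=1e-7)           # small eps guards against 0/0
loss.backward()

print("loss:", loss.item())
for name, param in model.named_parameters():
    # All-zero gradients here point at saturation or a detached graph,
    # rather than at the optimizer or the training loop.
    print(name, param.grad.abs().max().item())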
I have a neural network. For simplicity, there's only one layer and the weight matrix is of shape 2-by-2. I need the output of the network to be the rotated version of the input, i.e., the matrix should be a valid rotation matrix. I have tried the following:
import tensorflow as tf  # TF 1.x API

def rotate(val):
    # cos45 and sin45 are cos(45°) and sin(45°), i.e. sqrt(2)/2
    w1 = tf.constant_initializer([[cos45, -sin45], [sin45, cos45]])
    return tf.layers.dense(inputs=val, units=2, kernel_initializer=w1, activation=tf.nn.tanh)
While training, I do not want to lose the properties of the rotation matrix. In other words, I need the layer(s) to estimate only the angle (argument) of the trigonometric functions in the matrix.
I read that kernel_constraint can help here, by normalizing the values. But applying kernel_constraint does not guarantee that the diagonal entries are equal and that the off-diagonal entries are negatives of each other (in this case). In general, the two properties that need to be satisfied are: the determinant should be 1 and R^T*R = I.
Is there any other way to achieve this?
You could define your custom Keras layer. Something along the lines of:
from tensorflow.keras.layers import Layer
import tensorflow as tf

class Rotate(Layer):
    def build(self, input_shape):
        sh = input_shape[0]
        shape = [sh, sh]
        # Initial weight matrix
        w = self.add_weight(shape=shape,
                            initializer='random_uniform')
        # Set upper diagonal elements to negative of lower diagonal elements
        mask = tf.cast(tf.linalg.band_part(tf.ones(shape), -1, 0), tf.float32)
        w = mask * w
        w -= tf.transpose(w)
        # Set the same weight to the diagonal
        diag_mask = 1 - tf.linalg.diag(tf.ones(sh))
        w = diag_mask * w
        diag_w = self.add_weight(shape=(1,),
                                 initializer='random_uniform')
        diagonal = tf.linalg.diag(tf.ones(sh)) * diag_w
        self.kernel = w + diagonal

    def call(self, inputs, **kwargs):
        return tf.matmul(inputs, self.kernel)
Note that the matrix of learnable weights self.kernel has the form [[D, -L], [L, D]].
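As a quick sanity check on the target structure (a standalone sketch, independent of the layer above): with D = cos(theta) and L = sin(theta), so that D^2 + L^2 = 1, the matrix [[D, -L], [L, D]] satisfies both properties from the question:

import numpy as np

theta = 0.3                                 # arbitrary angle
D, L = np.cos(theta), np.sin(theta)
R = np.array([[D, -L],
              [L,  D]])

print(np.allclose(R.T @ R, np.eye(2)))      # True: R^T R = I
print(np.isclose(np.linalg.det(R), 1.0))    # True: det(R) = 1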
Given a simple 2-layer neural network, the traditional approach is to compute the gradient of the error w.r.t. the weights/model parameters. For an experiment, I want to compute the gradient of the error w.r.t. the input. Are there existing PyTorch methods that allow me to do this?
More concretely, consider the following neural network:
import torch.nn as nn
import torch.nn.functional as F

class NeuralNet(nn.Module):
    def __init__(self, n_features, n_hidden, n_classes, dropout):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(n_features, n_hidden)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(n_hidden, n_classes)
        self.dropout = dropout

    def forward(self, x):
        x = self.sigmoid(self.fc1(x))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
I instantiate the model and an optimizer for the weights as follows:
import torch.optim as optim

model = NeuralNet(n_features=args.n_features,
                  n_hidden=args.n_hidden,
                  n_classes=args.n_classes,
                  dropout=args.dropout)
optimizer_w = optim.SGD(model.parameters(), lr=0.001)
While training, I update the weights as usual. Now, given that I have values for the weights, I should be able to use them to compute the gradient w.r.t. the input. I am unable to figure out how.
def train(epoch):
    t = time.time()
    model.train()
    optimizer_w.zero_grad()
    output = model(features)
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer_w.step()
    # grad_features = loss_train.backward() w.r.t. features
    # features -= 0.001 * grad_features

for epoch in range(args.epochs):
    train(epoch)
It is possible: just set input.requires_grad = True for each input batch you're feeding in, and then after loss.backward() you should see that input.grad holds the expected gradient. In other words, if your input to the model (which you call features in your code) is some M x N x ... tensor, features.grad will be a tensor of the same shape, where each element of grad holds the gradient with respect to the corresponding element of features. In my comments below, I use i as a generalized index - if your features tensor has, for instance, 3 dimensions, replace features.grad[i] with features.grad[i, j, k], etc.
Regarding the error you're getting: PyTorch operations build a tree representing the mathematical operation they are describing, which is then used for differentiation. For instance c = a + b will create a tree where a and b are leaf nodes and c is not a leaf (since it results from other expressions). Your model is the expression, and its inputs as well as parameters are the leaves, whereas all intermediate and final outputs are not leaves. You can think of leaves as "constants" or "parameters" and of all other variables as of functions of those. This message tells you that you can only set requires_grad of leaf variables.
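As a tiny illustration of the leaf/non-leaf distinction (a standalone sketch, not from the original answer):

import torch

a = torch.randn(3, requires_grad=True)   # leaf: created directly by the user
b = torch.randn(3, requires_grad=True)   # leaf
c = a + b                                # not a leaf: computed from other tensors

print(a.is_leaf, c.is_leaf)              # True False
c.sum().backward()
print(a.grad)                            # populated: gradients accumulate on leaves
print(c.grad)                            # None (PyTorch warns about reading .grad of a non-leaf)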
Your problem is that at the first iteration, features is random (or however else you initialize) and is therefore a valid leaf. After your first iteration, features is no longer a leaf, since it becomes an expression calculated based on the previous ones. In pseudocode, you have
f_1 = initial_value # valid leaf
f_2 = f_1 + your_grad_stuff # not a leaf: f_2 is a function of f_1
To deal with that you need to use detach(), which breaks the links in the tree and makes autograd treat the tensor as if it were constant, no matter how it was created. In particular, no gradient calculations will be backpropagated through detach. So you need something like
features = features.detach() - 0.01 * features.grad
Note: perhaps you need to sprinkle a couple more detaches here and there, which is hard to say without seeing your whole code and knowing the exact purpose.
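Putting the pieces together, here is a minimal sketch of the pattern described above (the model, data, loss, and learning rate are placeholders, not from the original code):

import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 3)                       # placeholder model
features = torch.randn(32, 10)                 # placeholder input batch
labels = torch.randint(0, 3, (32,))

for step in range(5):
    features.requires_grad_(True)              # valid, because features is a leaf here
    model.zero_grad()
    loss = F.cross_entropy(model(features), labels)
    loss.backward()                            # fills features.grad as well as the parameter grads

    # Gradient step on the inputs; detach() so the next iteration's features is a leaf again.
    features = (features - 0.001 * features.grad).detach()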
I am trying to implement a custom dropout layer. During forward propagation, I'd like my inputs to pass through as-is, without any dropout. During the backward pass, I'd like to update the gradients of only some inputs while freezing the gradients of the others, based on a probability that decides which gradients to update and which to freeze.
I have implemented a custom layer, but since the modification is subtle, it is difficult to verify that it is correct: it is possible to get reasonable output even with an incorrect implementation. I have modified the existing Dropout implementation in Keras.
import tensorflow as tf
import keras
from keras.layers import Layer

class MyDropout(Layer):
    """Applies Dropout to the input.

    Dropout consists in randomly setting
    a fraction `rate` of input units to 0 at each update during training time,
    which helps prevent overfitting.

    # Arguments
        rate: float between 0 and 1. Fraction of the input units to drop.
        noise_shape: 1D integer tensor representing the shape of the
            binary dropout mask that will be multiplied with the input.
            For instance, if your inputs have shape
            `(batch_size, timesteps, features)` and
            you want the dropout mask to be the same for all timesteps,
            you can use `noise_shape=(batch_size, 1, features)`.
        seed: A Python integer to use as random seed.

    # References
        - [Dropout: A Simple Way to Prevent Neural Networks from Overfitting](
           http://www.jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf)
    """
    def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
        super(MyDropout, self).__init__(**kwargs)
        self.rate = min(1., max(0., rate))
        self.noise_shape = noise_shape
        self.seed = seed
        self.supports_masking = True

    def _get_noise_shape(self, inputs):
        if self.noise_shape is None:
            return self.noise_shape
        symbolic_shape = keras.backend.shape(inputs)
        noise_shape = [symbolic_shape[axis] if shape is None else shape
                       for axis, shape in enumerate(self.noise_shape)]
        return tuple(noise_shape)

    def call(self, inputs, training=None):
        if 0. < self.rate < 1.:
            noise_shape = self._get_noise_shape(inputs)
            # generate random numbers of the same shape as the input
            uniform_random_number = keras.backend.random_normal(shape=keras.backend.shape(inputs))
            # check where the random number is greater than the dropout rate
            indices_greater_than = tf.greater(uniform_random_number, self.rate, name='stoppedGradientLocations')
            indices_greater_than = tf.cast(indices_greater_than, dtype=tf.float32)
            inputs_copy = tf.identity(inputs)
            out1 = tf.stop_gradient(inputs_copy * indices_greater_than)
            indices_less_than = 1 - indices_greater_than
            out2 = inputs * indices_less_than
            out_total = out1 + out2
            return out_total

    def get_config(self):
        config = {'rate': self.rate,
                  'noise_shape': self.noise_shape,
                  'seed': self.seed}
        base_config = super(MyDropout, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return input_shape
What is the best way to verify my implementation - is the code working as intended?
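One way to check this, as a sketch (assuming the layer works with tf.keras under TF 2.x eager execution; the shapes and rate are arbitrary), is to take the gradient of the output with respect to the input with tf.GradientTape: the forward pass should be the identity, and the input gradient should be exactly 0 or 1 per element, zero wherever tf.stop_gradient was applied:

import tensorflow as tf

layer = MyDropout(rate=0.5)
x = tf.random.normal((4, 6))

with tf.GradientTape() as tape:
    tape.watch(x)
    y = layer(x, training=True)
    loss = tf.reduce_sum(y)

grad = tape.gradient(loss, x)

# Forward pass leaves the inputs untouched.
print(tf.reduce_max(tf.abs(y - x)).numpy())   # expected: 0.0
# Gradient w.r.t. the input is a 0/1 mask: 1 where gradients may flow, 0 where stopped.
print(tf.reduce_all(tf.logical_or(tf.equal(grad, 0.0), tf.equal(grad, 1.0))).numpy())
print(tf.reduce_mean(grad).numpy())           # fraction of entries whose gradient still flows

Note also that the mask is drawn with keras.backend.random_normal despite the variable name; if a uniform draw was intended, random_uniform would make the fraction of flowing gradients approximately 1 - rate.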