Simple vanilla RNN doesn't pass gradient check - python-3.x

I recently tried to implement a vanilla RNN from scratch. I implemented everything and even ran a seemingly OK example, yet I noticed that the gradient check is not successful: only some parts (specifically the weight and bias for the output) pass the gradient check, while the other weights (Whh, Whx) don't.
I followed Karpathy's/Coursera's implementation and made sure everything is implemented. Yet Karpathy's/Coursera's code passes the gradient check and mine doesn't. At this point I have no clue what is causing this!
Here are the snippets responsible for the backward pass in the original code:
def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """Backpropagate one time step of the RNN, accumulating into ``gradients``.

    Args:
        dy: Gradient of the loss with respect to the output scores at this step.
        gradients: Dict holding the running sums 'dWya', 'dby', 'db', 'dWax'
            and 'dWaa', plus 'da_next' (the hidden-state gradient coming from
            the following time step). It is updated in place.
        parameters: Dict holding at least the weight matrices 'Wya' and 'Waa'.
        x: Input at this time step.
        a: Hidden state at this time step.
        a_prev: Hidden state at the previous time step.

    Returns:
        The same ``gradients`` dict; 'da_next' now holds the gradient to pass
        on to the previous time step.

    Note:
        The dot products below presumably expect column vectors (e.g. ``a`` of
        shape (n_a, 1)) -- confirm against the forward pass.
    """
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy
    # The hidden state gets gradient from the output layer and from step t+1.
    da = np.dot(parameters['Wya'].T, dy) + gradients['da_next']
    daraw = (1 - a * a) * da  # backprop through the tanh nonlinearity
    gradients['db'] += daraw
    gradients['dWax'] += np.dot(daraw, x.T)
    gradients['dWaa'] += np.dot(daraw, a_prev.T)
    # Note the transpose of Waa when sending the gradient back to a_prev.
    gradients['da_next'] = np.dot(parameters['Waa'].T, daraw)
    return gradients
def rnn_backward(X, Y, parameters, cache):
    """Backpropagate through time over the whole sequence.

    Args:
        X: Input sequence; only its length is used here.
        Y: Target indices, one per time step.
        parameters: Dict with 'Waa', 'Wax', 'Wya', 'by' and 'b'.
        cache: Tuple ``(y_hat, a, x)`` produced by the forward pass, each
            indexable by time step.

    Returns:
        Tuple ``(gradients, a)``: the accumulated gradient dict and the hidden
        states taken from the cache.
    """
    # Initialize gradients as an empty dictionary
    gradients = {}
    # Retrieve from cache and parameters
    (y_hat, a, x) = cache
    Waa, Wax, Wya, by, b = parameters['Waa'], parameters['Wax'], parameters['Wya'], parameters['by'], parameters['b']
    # Each gradient starts as zeros with the same shape as its parameter
    gradients['dWax'], gradients['dWaa'], gradients['dWya'] = np.zeros_like(Wax), np.zeros_like(Waa), np.zeros_like(Wya)
    gradients['db'], gradients['dby'] = np.zeros_like(b), np.zeros_like(by)
    gradients['da_next'] = np.zeros_like(a[0])
    ### START CODE HERE ###
    # Backpropagate through time
    for t in reversed(range(len(X))):
        dy = np.copy(y_hat[t])
        # Subtract 1 from the predicted value at the correct index Y[t],
        # i.e. dy = y_hat[t] - one_hot(Y[t]).
        dy[Y[t]] -= 1
        # NOTE(review): at t == 0 this reads a[-1]; presumably the forward pass
        # stores the initial hidden state under key -1 -- confirm.
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t - 1])
    ### END CODE HERE ###
    return gradients, a
and this is my implementation:
def rnn_cell_backward(self, xt, h, h_prev, output, true_label, dh_next):
    """
    Runs the backward pass for a single timestep.

    Inputs:
    - xt: The input data of shape (Batch_size, input_dim_size)
    - h: The hidden state at timestep t (which comes from the forward pass)
    - h_prev: The previous hidden state at timestep t-1
    - output: The output at the current timestep, one row per sample
    - true_label: The label for the current timestep; the position of the
      largest entry in each row is taken as the correct class (e.g. a
      one-hot row)
    - dh_next: The gradient flowing into h from timestep t+1. It is zero for
      the last timestep; for every earlier timestep it is the 'dh_prev'
      returned by the call for the following timestep, because the backward
      pass walks the sequence from the end to the beginning.

    Returns:
    - dW1: The gradient for W1
    - dW2: The gradient for W2
    - dW3: The gradient for W3
    - dbh: The gradient for bh
    - dbo: The gradient for bo
    - dh_prev: The gradient for the previous hidden state at timestep t-1,
      to be passed as 'dh_next' to the call for timestep t-1.
    - per_ts_loss: The sum over the batch of the output entries at the
      correct class index. This is not a loss yet; nothing here takes a log.
    """
    e = np.copy(output)
    # index of the correct class for each row (sample)
    idxs = np.argmax(true_label, axis=1)
    # row indices of the samples in the batch
    rows = np.arange(e.shape[0])
    # Vectorized error_t = output_t - label_t: subtract 1 at the correct index.
    e[rows, idxs] -= 1
    # Summed output value at the correct class, used by the caller to track
    # how well training is going.
    per_ts_loss = output[rows, idxs].sum()
    # must have the shape of W3: (vocabsize_or_output_dim_size, hidden_state_size)
    dW3 = np.dot(e.T, h)
    # dbo sums the error over the batch
    dbo = np.sum(e, axis=0)
    # dh receives gradient from the output and from the next timestep
    # (dh_next is zero at the last timestep).
    dh = np.dot(e, self.W3) + dh_next
    # backprop through the tanh nonlinearity
    dtanh = (1 - h * h) * dh
    # dbh sums over the batch
    dbh = np.sum(dtanh, axis=0)
    # The gradient with respect to xt is not needed: only the tunable
    # parameters W1, W2, W3, bh and bo are updated.
    # must have the shape of W1
    dW1 = np.dot(xt.T, dtanh)
    # Gradient with respect to the previous hidden state.
    # NOTE: this is the line the answer further down identifies as the bug --
    # it should use self.W2.T. Because W2 is square, both forms have valid
    # shapes, so a shape check does not catch it.
    dh_prev = np.dot(dtanh, self.W2)
    # shape must be (HiddenSize, HiddenSize)
    dW2 = np.dot(h_prev.T, dtanh)
    return dW1, dW2, dW3, dbh, dbo, dh_prev, per_ts_loss
def rnn_layer_backward(self, Xt, labels, H, O):
    """
    Runs a full backward pass over all timesteps and returns the gradients.

    Inputs:
    - Xt: The input data of shape (Batch_size, timesteps, input_dim_size)
    - labels: The labels for the input data, indexed as [:, t, :]
    - H: The hidden states for the current layer produced in the forward pass,
      of shape (Batch_size, timesteps, HiddenStateSize)
    - O: The output for the current layer of shape (Batch_size, timesteps, outputsize)

    Returns:
    - dW1: The gradient for W1, summed over timesteps
    - dW2: The gradient for W2, summed over timesteps
    - dW3: The gradient for W3, summed over timesteps
    - dbh: The gradient for bh, summed over timesteps
    - dbo: The gradient for bo, summed over timesteps
    - dh_next: The hidden-state gradient returned by the call for timestep 0,
      i.e. the gradient with respect to the state before the first timestep
    - loss: The accumulated value of -log(e), where e is the last value
      returned by rnn_cell_backward for each timestep
    """
    dW1 = np.zeros_like(self.W1)
    dW2 = np.zeros_like(self.W2)
    dW3 = np.zeros_like(self.W3)
    dbh = np.zeros_like(self.bh)
    dbo = np.zeros_like(self.bo)
    dh_next = np.zeros_like(H[:, 0, :])
    _, T_x, _ = Xt.shape
    loss = 0
    for t in reversed(range(T_x)):
        # This guard is needed: at t == 0, H[:, t - 1, :] would wrap around to
        # the LAST timestep. Zeros are used instead, presumably matching the
        # initial hidden state of the forward pass -- confirm.
        if t > 0:
            hprev = H[:, t - 1, :]
        else:
            hprev = np.zeros_like(H[:, 0, :])
        dw_1, dw_2, dw_3, db_h, db_o, dh_prev, e = self.rnn_cell_backward(Xt[:, t, :],
                                                                          H[:, t, :],
                                                                          hprev,
                                                                          O[:, t, :],
                                                                          labels[:, t, :],
                                                                          dh_next)
        # The gradient for h at t-1 becomes the incoming gradient of the next iteration.
        dh_next = dh_prev
        dW1 += dw_1
        dW2 += dw_2
        dW3 += dw_3
        dbh += db_h
        dbo += db_o
        # Update the loss by subtracting the log term of this timestep.
        # NOTE(review): e is a sum over the batch, so this is -log(sum p), which
        # equals the summed cross-entropy only for a batch of one sample.
        loss -= np.log(e)
    return dW1, dW2, dW3, dbh, dbo, dh_next, loss
I have commented everything and provided a minimal example to demonstrate this here:
My code (doesn't pass gradient check)
And here is the implementation that I used as my guide. This is from karpathy/Coursera and passes all the gradient checks!: original code
At this point I have no idea why this is not working. I'm a beginner in Python, so this could be why I can't find the issue.

Two months later, I think I found the culprit! I should have changed the following line:
# compute the gradient with respect to W2
dh_prev = np.dot(dtanh, self.W2)
to
# compute the gradient with respect to W2
# note the transpose here!
dh_prev = np.dot(dtanh, self.W2.T)
When I was initially writing the backward pass, I only paid attention to the dimensions, and that led me to make this mistake. This is an example of how features can get mixed up by mindless/blind reshaping or transposing (or by failing to do so!).
In order to get what has gone wrong here let me give an example.
Suppose we have a matrix of peoples features and we dedicated each row to each person, therefore our matrix would look like this :
Features | Age | height(cm) | weight(kg) |
matrix = | 20 | 185 | 75 |
| 85 | 155 | 95 |
| 40 | 205 | 120 |
Now if we make this into a numpy array we will have the following :
m = np.array([[20, 185, 75],
[85, 155, 95],
[40, 205, 120]])
A simple 3x3 array right?
Now, the way we interpret our matrix is very important: here each row and each column has a specific meaning. Each person is described by a row, and each column holds one specific feature.
So, you see there is a "structure" in the matrix we represent our data with.
In other words, each data item is represented as a row, and each column specifies a single feature. When multiplying with another matrix, this meaning must be respected: when two matrices are multiplied, the rows and columns being combined must refer to the same kind of quantity.
Lets have an example and make this more clear :
suppose we have two matrices :
m1 = np.array([[20, 185, 75],
[85, 155, 95],
[40, 205, 120]])
m2 = np.array([[0.9, 0.8, 0.85],
[0.1, 0.5, 0.4],
[0.6, 0.9, 0.8]])
These two matrices contain data arranged in rows, so multiplying them gives the correct answer. However, altering the layout of the data (with a transpose, for example) destroys that meaning, and we end up multiplying unrelated data!
In my case I needed to transpose the second matrix to get the layout right
for the operation at hand, and that fixed the gradient check. Because W2 is square (HiddenSize x HiddenSize), both versions have valid shapes, which is why checking dimensions alone did not reveal the mistake.

Related

Pytorch: Custom thresholding activation function - gradient

I created an activation function class Threshold that should operate on one-hot-encoded image tensors.
The function performs min-max feature scaling on each channel followed by thresholding.
class Threshold(nn.Module):
    """Min-max scale every channel of an N x C x H x W tensor, then binarise it.

    Args:
        threshold: Cut-off in [0, 1]; scaled values >= threshold map to 1.0,
            all others to 0.0.

    Raises:
        ValueError: If ``threshold`` lies outside [0, 1].
    """

    def __init__(self, threshold=.5):
        super().__init__()
        if threshold < 0.0 or threshold > 1.0:
            raise ValueError("Threshold value must be in [0,1]")
        self.threshold = threshold

    def min_max_fscale(self, input):
        r"""
        applies min max feature scaling to input. Each channel is treated individually.
        input is assumed to be N x C x H x W (one-hot-encoded prediction)

        NOTE: ``input`` is overwritten in place, channel by channel, and the
        same tensor object is returned. A channel whose values are all equal
        gives max - min == 0 and therefore a division by zero.
        """
        for i in range(input.shape[0]):
            # N
            for j in range(input.shape[1]):
                # C
                lo = torch.min(input[i][j])
                hi = torch.max(input[i][j])
                input[i][j] = (input[i][j] - lo) / (hi - lo)
        return input

    def forward(self, input):
        """Return a 0.0/1.0 tensor marking where the scaled input reaches the threshold."""
        assert (len(input.shape) == 4), f"input has wrong number of dims. Must have dim = 4 but has dim {input.shape}"
        input = self.min_max_fscale(input)
        # The comparison yields a boolean tensor; multiplying by 1.0 makes it float.
        return (input >= self.threshold) * 1.0
When I use the function I get the following error; I assume this is because the gradients are not calculated automatically.
Variable._execution_engine.run_backward(RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I already had a look at How to properly update the weights in PyTorch? but could not get a clue how to apply it to my case.
How is it possible to calculate the gradients for this function?
Thanks for your help.
The issue is that you are manipulating and overwriting elements in place; this type of operation can't be tracked by autograd. Instead, you should stick with built-in functions. Your example is not that tricky to tackle: you want to retrieve the minimum and maximum values for each of the input.shape[0] x input.shape[1] channels. Then you scale your whole tensor in one go, i.e. in vectorized form. No for loops involved!
One way to compute min/max along multiple axes is to flatten those:
>>> x_f = x.flatten(2)
Then, find the min-max on the flattened axis while retaining all shapes:
>>> x_min = x_f.min(axis=-1, keepdim=True).values
>>> x_max = x_f.max(axis=-1, keepdim=True).values
The resulting min_max_fscale function would look something like:
class Threshold(nn.Module):
    def min_max_fscale(self, x):
        r"""
        Applies min max feature scaling to input. Each channel is treated individually.
        Input is assumed to be N x C x H x W (one-hot-encoded prediction)

        Returns a new tensor with the same shape as ``x``; ``x`` itself is not
        modified.
        """
        # Collapse H and W so each channel becomes one row to reduce over.
        x_f = x.flatten(2)
        # keepdim=True keeps a trailing axis of size 1 so the result broadcasts
        # back over each channel.
        x_min = x_f.min(dim=-1, keepdim=True).values
        x_max = x_f.max(dim=-1, keepdim=True).values
        x_f = (x_f - x_min) / (x_max - x_min)
        return x_f.reshape_as(x)
Important note:
You will notice that you can now backpropagate through min_max_fscale... but not through forward. This is because forward applies a boolean condition, which is not a differentiable operation.

how can I insert a Tensor into another Tensor in pytorch

I have pytorch Tensor with shape (batch_size, step, vec_size), for example, a Tensor(32, 64, 128), let's call it A.
I have another Tensor(batch_size, vec_size), e.g. Tensor(32, 128), let's call it B.
I want to insert B into a certain position at axis 1 of A. The insert positions are given in a Tensor(batch_size), named P.
I understand there is no empty tensor (like an empty list) in PyTorch, so I initialize A as zeros and add B at a certain position along axis 1 of A.
A = Variable(torch.zeros(batch_size, step, vec_size))
What I'm doing is like:
# Add row B[i] into A at the per-sample position P[i] along axis 1.
for i in range(batch_size):
    pos = P[i]
    A[i][pos] = A[i][pos] + B[i]
But I get an Error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
Then I make a clone of A inside each iteration of the loop:
# Same insertion, but writing into a fresh clone of A on every iteration.
for i in range(batch_size):
    A_clone = A.clone()
    pos = P[i]
    A_clone[i][pos] = A_clone[i][pos] + B[i]
This is very slow for autograd. I wonder if there are any better solutions? Thank you.
You can use a mask instead of cloning.
See the code below
# setup
batch, step, vec_size = 64, 10, 128
A = torch.rand((batch, step, vec_size))
B = torch.rand((batch, vec_size))
# one insert position per sample, drawn from [0, step)
pos = torch.randint(step, (batch,))
# computations
# create a mask that is 0 at the position to be replaced and 1 elsewhere;
# the trailing axis of size 1 broadcasts over vec_size
mask = torch.ones(batch, step, 1)
mask[torch.arange(batch), pos] = 0
# expand B to have the same dimensions as A and compute the result:
# A is kept where mask == 1, B is written where mask == 0
result = A * mask + B.unsqueeze(dim=1).expand([-1, step, -1]) * (1 - mask)
This way you avoid using for loops and cloning as well.

How to make this 1 cell LSTM network?

I would like to make an LSTM network that learns to give back the first value of the sequence each time there is a 0 in the sequence, and to output 0 for any other value.
Example:
x = 9 8 3 1 0 3 4
y = 0 0 0 0 9 0 0
The network memorizes a value and gives it back when it receives a special signal.
I think I can do that with one LSTM cell like this:
in red the weights and inside the gray area the biases.
Here is my model:
model2=Sequential()
model2.add(LSTM(input_dim=1, output_dim=1, return_sequences = True))
model2.add(TimeDistributed(Dense(output_dim=1, activation='linear')))
model2.compile(loss = "mse", optimizer = "rmsprop")
and here is how I set the weights of my cell; however, I am not at all sure about the order:
# w : weights of x_t
# u : weights of h_{t-1}
# order of array: input_gate, new_input, forget_gate, output_gate
# (Tensorflow order)
w = np.array([[0, 1, 0, -100]], dtype=np.float32)
u = np.array([[1, 0, 0, 0]], dtype=np.float32)
biases = np.array([0, 0, 1, 1], dtype=np.float32)
model2.get_layer('lstm').set_weights([w, u, biases])
Am I right about the weights? Do they match what I put in the figure?
To work, it needs to have the right initial values. How do I set the initial values of the cell state c and of the previous output h? I saw this in the source code:
h_tm1 = states[0] # previous memory state
c_tm1 = states[1] # previous carry state
but I couldn't find how to use that.
Why not do this manually? It's so easy and it's an exact calculation. You don't need weights for that, and that is certainly not differentiable regarding weights.
Given an input tensor with shape (batch, steps, features):
def processSequence(x):
    """Return the first step's value wherever ``x`` equals 0, and 0 elsewhere.

    ``x`` is assumed to have shape (batch, steps, features), as stated in the
    surrounding text.
    """
    # Slice 0:1 (rather than index 0) keeps the steps axis so that the first
    # step broadcasts against every other step.
    initial = x[:, 0:1]
    # 1.0 where the input is exactly zero, 0.0 everywhere else
    zeros = K.cast(K.equal(x, 0), K.floatx())
    return initial * zeros
model.add(Lambda(processSequence))
Warning: if you're intending to use this with inputs from other layers, the probability of finding a zero will be so small that this layer will be useless.

How to set up the number of inputs neurons in sklearn MLPClassifier?

Given a dataset of n samples, m features, and using [sklearn.neural_network.MLPClassifier][1], how can I set hidden_layer_sizes to start with m inputs? For instance, I understand that if hidden_layer_sizes= (10,10) it means there are 2 hidden layers each of 10 neurons (i.e., units) but I don't know if this also implies 10 inputs as well.
Thank you
This classifier/regressor, as implemented, does this automatically when fit is called.
This can be seen in its code here.
Excerpt:
n_samples, n_features = X.shape
# Ensure y is 2D
if y.ndim == 1:
    y = y.reshape((-1, 1))
self.n_outputs_ = y.shape[1]
# The input and output layer sizes come from the data and are wrapped
# around the user-supplied hidden layer sizes.
layer_units = ([n_features] + hidden_layer_sizes +
               [self.n_outputs_])
You see that the hidden_layer_sizes you pass in is surrounded by layer dimensions derived from your data within .fit(). This is the reason the signature reads like this, with a subtraction of 2:
Parameters
hidden_layer_sizes : tuple, length = n_layers - 2, default (100,)
The ith element represents the number of neurons in the ith hidden layer.

LSTMLayer produces NaN values even before training it

I'm currently trying to construct a LSTM network with Lasagne to predict the next step of noisy sequences. I first trained a stack of 2 LSTM layers for a while, but had to use an abysmally small learning rate (1e-6) because of divergence issues (that ultimately produced NaN values). The results were kind of disappointing, as the network produced smooth, out-of-phase versions of the input.
I then came to the conclusion I should use better parameter initialization than what is given by default. The goal was to start from a network that just mimics identity, since for strongly auto-correlated signal it should be a good first estimation of the next step (x(t) ~ x(t+1)), and to sprinkle a bit of noise on top of it.
import theano, numpy, lasagne
from theano import tensor as T
from lasagne.layers.recurrent import LSTMLayer, InputLayer, Gate
from lasagne.layers import DropoutLayer
from lasagne.nonlinearities import sigmoid, tanh, leaky_rectify
from lasagne.layers import get_output
from lasagne.init import GlorotNormal, Normal, Constant
floatX = 'float32'
# function to create a lstm that ~ propagate the input from start to finish off the bat
# should be a good start for a predictive lstm with high one-step autocorrelation
def create_identity_lstm(input, shape, orig_inp=None, noiselvl=0.01, G=10., mask_input=None):
    """Build an LSTMLayer whose initial parameters are meant to pass the input through.

    Args:
        input: Incoming layer, passed straight to ``LSTMLayer``.
        shape: Pair ``(inp, out)``. ``out`` is the number of units given to
            ``LSTMLayer``; ``inp`` is only used as the default for ``orig_inp``.
        orig_inp: Number of leading units that get the identity treatment;
            the remaining units should produce ~0 activation. Defaults to ``inp``.
        noiselvl: Value passed to the ``GlorotNormal`` / ``Normal`` initialisers.
        G: Magnitude added to (or subtracted from) the gate biases.
        mask_input: Forwarded to ``LSTMLayer``.

    Returns:
        The configured ``LSTMLayer``.
    """
    inp, out = shape
    # orig_inp limits the number of units that actually carry the input
    # information from one layer to the other.
    if orig_inp is None:
        orig_inp = inp
    # input gate
    inputgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    # forget gate
    forgetgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    # cell gate
    cell = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=None,
        b=Constant(0.),
        nonlinearity=leaky_rectify
    )
    # output gate
    outputgate = Gate(
        W_in=GlorotNormal(noiselvl),
        W_hid=GlorotNormal(noiselvl),
        W_cell=Normal(noiselvl),
        b=Constant(0.),
        nonlinearity=sigmoid
    )
    lstm = LSTMLayer(input, out, ingate=inputgate, forgetgate=forgetgate, cell=cell, outgate=outputgate, nonlinearity=leaky_rectify, mask_input=mask_input)
    # change matrices and biases
    # ingate - should return ~1 (matrices ~ 0, big bias)
    b_i = lstm.b_ingate.get_value()
    b_i[:orig_inp] += G
    lstm.b_ingate.set_value(b_i)
    # forgetgate - should return 0 (matrices ~ 0, big negative bias)
    b_f = lstm.b_forgetgate.get_value()
    b_f[:orig_inp] -= G
    # to help learning future features, keep a large positive bias on the
    # "unused" units to help them remember stuff
    b_f[orig_inp:] += G
    lstm.b_forgetgate.set_value(b_f)
    # cell - should return x(t) (W_xc = identity, rest is ~0)
    W_xc = lstm.W_in_to_cell.get_value()
    for i in xrange(orig_inp):
        W_xc[i, i] += 1.
    lstm.W_in_to_cell.set_value(W_xc)
    # outgate - should return 1 (same as ingate)
    b_o = lstm.b_outgate.get_value()
    b_o[:orig_inp] += G
    lstm.b_outgate.set_value(b_o)
    # done
    return lstm
I then use this lstm generation code to generate the following network:
# layers
#input + dropout
input = InputLayer((None, None, 7), name='input')
mask = InputLayer((None, None), name='mask')
drop1 = DropoutLayer(input, p=0.33)
#lstm1 + dropout
lstm1 = create_identity_lstm(drop1, (7, 1024), mask_input=mask)
drop2 = DropoutLayer(lstm1, p=0.33)
#lstm2 + dropout
lstm2 = create_identity_lstm(drop2, (1024, 128), orig_inp=7, mask_input=mask)
drop3 = DropoutLayer(lstm2, p=0.33)
#lstm3
lstm3 = create_identity_lstm(drop3, (128, 7), orig_inp=7, mask_input=mask)
# symbolic variables and prediction
x = input.input_var
ma = mask.input_var
ma_reshape = ma.dimshuffle((0,1,'x'))
yhat = get_output(lstm3, deterministic=False)
yhat_det = get_output(lstm3, deterministic=True)
y = T.ftensor3('y')
predict = theano.function([x, ma], yhat_det)
Problem is, even without any training, this network produces garbage values and sometimes even a bunch of NaNs, right from the very first LSTM layer:
X = numpy.random.random((5, 10000, 7)).astype('float32')
Masks = numpy.ones(X.shape[:2], dtype='float32')
# NOTE(review): the keyword here is spelled `determistic`, whereas the network
# definition above uses `deterministic=True`. If get_output ignores unknown
# keyword arguments, dropout stays active for this call -- TODO confirm.
hid1 = get_output(lstm1, determistic=True)
get_hid1 = theano.function([x, ma], hid1)
h1 = get_hid1(X, Masks)
print numpy.isnan(h1).sum(axis=1).sum(axis=1)
array([6379520, 6367232, 6377472, 6376448, 6378496])
# even the first output value is garbage!
print h1[:,0,0] - X[:,0,0]
array([-0.03898358, -0.10118812, 0.34877831, -0.02509735, 0.36689138], dtype=float32)
I don't get why. I checked each matrix and their values are fine, just as I wanted them to be. I even tried to recreate each gate activation and the resulting hidden activations using the actual numpy arrays, and they reproduce the input just fine. What did I do wrong there?

Resources