Reinitiate the weights and biases values - pytorch

I have a MLP network and i want to use the same MLP with different seed value. I want to know if i can reinitiate the weights and biases values as it should be in start while calling the MLP in loop.
class MLP(nn.Module):
def __init__(self, layers):
super(MLP_pre,self).__init__()
'activation function'
self.activation = nn.Tanh()
'loss function'
self.loss_function = nn.MSELoss(reduction ='mean')
'Initialise neural network as a list using nn.Modulelist'
self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1)])
'foward pass'
def forward(self,x):
for i in range(len(layers)-2):
z = self.linears[i](x)
x = self.activation(z)
x = self.linears[-1](x)
#print(x)
return x
NN = MLP(layers)
for seed in np.arange(0,3,1):
train_loss = NN(m2,max_iter,seed)
train_loss.append(train_loss)

Related

tf.GradientTape gradient() returns None

I am trying to train my keras model using TensorFlow, so far I can build the model,
def Model(input_shape, num_of_layers):
num_of_layers = 5
mod = keras.models.Sequential()
mod.add(keras.layers.Dense(1, input_shape = (input_shape,)))
for i in range(num_of_layers - 1):
mod.add(keras.layers.Dense(16, activation = 'tanh'))
mod.add(keras.layers.Dense(1, activation = 'tanh'))
return mod
and loss function.
def loss(u_pred, u_true):
return tf.reduce_mean(tf.keras.losses.mean_squared_error(u_pred, u_true))
Then I create a train function to train the model.
def train(model, X, epoch = 500, lr = 1e-3):
trainable_params = [tf.Variable(model.get_weights()[i]) for i in range(len(model.get_weights()))]
loss_array = []
optim = tf.keras.optimizers.Adam(learning_rate = lr)
for i in range(epoch):
with tf.GradientTape() as g:
g.watch(trainable_params)
loss_val = loss(model(X), tf.zeros_like(X))
grad = g.gradient(loss_val, trainable_params)
...
The grad returns a vector of None when I print it. What went wrong with my train function? I have converted my model's weights and biases to tensor object using tf.Variable. Using tf.cast or tf.convert_to_tensor doesn't help either.

LSTM multi-sequence input to one sequence output

I am new with neural networks and am currently trying to make an LSTM model that predicts an output sequence based on multiple parameters. Excuse my ignorance and dummyness in advance.
I have obtained training and validation datasets, which look somewhat like the following:
For every ID four rows are recorded, which uses columns holding certain parameters and the corresponding Y output. Practically, there are thus ~122,000 / 4 = ~30,500 samples (I mistakenly put 122,000 as ID, it is in fact the number of rows). Since the parameter values and the corresponding Y values follow temporal patterns, I am interested if a model such as LSTM improves the prediction.
I want to predict the Y in my validation dataset (~73,000/4 = ~18,000 samples), based on the temporal patterns of the parameters. But is this possible? Most tutorials I followed use a single sequence, for which an LSTM is used to extend a similar input sequence. I thus want an LSTM with 'multi-sequence' input, which outputs one sequence. How do I go about this?
I'm using PyTorch as framework. A simple LSTM model I created using a tutorial, which would not incorporate the parameters:
training_y = traindf.reset_index()['Y']
validation_y = traindf.reset_index()['Y']
Then create a dataset for this:
class YDataset(Dataset):
def __init__(self, data, seq_len = 100):
self.data = data
self.data = torch.from_numpy(data).float().view(-1)
self.seq_len = seq_len
def __len__(self):
return len(self.data)-self.seq_len-1
def __getitem__(self,index):
return self.data[index : index+self.seq_len] , self.data[index+self.seq_len]
train_y = YDataset(training_y_df)
vali_y = YDataset(validation_y_df)
batch_size = 64
train_dataloader = DataLoader(train_y, batch_size, drop_last=True)
vali_dataloader = DataLoader(vali_y, batch_size, drop_last=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
Then create the model:
class Lstm_model(nn.Module):
def __init__(self, input_dim, hidden_size, num_layers):
super(Lstm_model, self).__init__()
self.num_layers = num_layers
self.input_size = input_dim
self.hidden_size = hidden_size
self.lstm = nn.LSTM(input_size=input_dim, hidden_size = hidden_size, num_layers = num_layers)
self.fc = nn.Linear(hidden_size, 1)
def forward(self,x,hn,cn):
out , (hn,cn) = self.lstm(x, (hn, cn))
final_out = self.fc(out[-1])
return final_out, hn,cn
def predict(self,x):
hn, cn = self.init()
final_out = self.fc(out[-1])
return final_out
def init(self):
h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
c0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
return h0 , c0
input_dim = 1
hidden_size = 50
num_layers = 3
model = Lstm_model(input_dim , hidden_size , num_layers).to(device)
Loss function and training loop (more or less same as for validation):
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
def train(dataloader):
hn, cn = model.init()
model.train()
for batch , item in enumerate(dataloader):
x , y = item
x = x.to(device)
y = y.to(device)
out , hn , cn = model(x.reshape(100,batch_size,1),hn,cn)
loss = loss_fn(out.reshape(batch_size), y)
hn = hn.detach()
cn = cn.detach()
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch == len(dataloader)-1:
loss = loss.item
print(f"train loss: {loss:>7f} ")
Epochs and loss metrics:
epochs = 200 # Takes really long for me
for epoch in range(epochs):
print(f"epoch {epoch} ")
train(train_dataloader)
test(vali_dataloader)
Final metrics:
import math
from sklearn.metrics import mean_squared_error
import numpy as np
def calculate_metrics(data_loader):
pred_arr = []
y_arr = []
with torch.no_grad():
hn , cn = model.init()
for batch , item = in enumerate(data_loader):
x , y = item
x , y = x.to(device) , y.to(device)
x = x.view(100,64,1)
pred = model(x, hn, cn)[0]
pred = scalar.inverse_transform(pred.detach().cpu().numpy().reshape(-1))
y = scalar.inverse_transform(y.detach().cpu().numpy().reshape(1,-1)).reshape(-1)
pred_arr = pred_arr + list(pred)
y_arr = y_arr + list(y)
return math.sqrt(mean_squared_error(y_arr,pred_arr))
I used this code more as an example of how LSTM would work. Nevertheless, I don't know if this is the right track for me. Does someone know what I should do or a tutorial that does work for my example? Thanks in advance!

Pytorch Parameterized Layer not Updated

I have a problem here, so I want to make a layer where the weight value (and the bias) is based on the other frozen weight. So, let’s say I have a frozen weight (FW) as a base value, then my current model layer will have weight W = FW + D, where D is the trainable parameter. Later, when I train the model, I hope the only parameter that gets updated is D.
I made this simple code for illustration:
frozen = nn.Linear(100,10)
frozen.weight.requires_grad = False
frozen.bias.requires_grad = False
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc = nn.Linear(100,10)
self.dw = nn.Parameter(torch.tensor(1.0, requires_grad=True))
self.db = nn.Parameter(torch.tensor(1.0, requires_grad=True))
def forward(self, x):
# the weight (and the bias) of fc layer is from FW and D
self.fc.weight = nn.Parameter(torch.add(frozen.weight, self.dw))
self.fc.bias = nn.Parameter(torch.add(frozen.bias, self.db))
return torch.sigmoid(self.fc(x))
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
x = torch.rand(100)
y = torch.tensor([0]*9+[1], dtype=torch.float32)
for _ in range(10):
out = model(x)
loss = criterion(out, y)
print(loss)
optimizer.zero_grad()
loss.backward()
optimizer.step()
But when I run that code, the model doesn’t train, and the self.dw and self.db doesn’t change. I am not sure whether my concept is wrong, so it’s not possible to train D, or I made a mistake in the implementation.
I also tried to implement using nn.utils.parameterize, but it still doesn’t work (I am new to using this, so I am not sure I implemented it correctly)
frozen = nn.Linear(100,10)
frozen.weight.requires_grad = False
frozen.bias.requires_grad = False
class Adder(nn.Module):
def __init__(self, delta, frozen):
super().__init__()
self.delta = nn.Parameter(torch.tensor(delta, requires_grad=True))
self.frozen=frozen
def forward(self, x):
return torch.add(self.frozen, self.delta)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc = nn.Linear(100,10)
def forward(self, x):
nn.utils.parametrize.register_parametrization(self.fc, "weight", Adder(1.0, frozen.weight))
nn.utils.parametrize.register_parametrization(self.fc, "bias", Adder(1.0, frozen.bias))
return torch.sigmoid(self.fc(x))
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
x = torch.rand(100)
y = torch.tensor([0]*9+[1], dtype=torch.float32)
for _ in range(10):
out = model(x)
loss = criterion(out, y)
print(loss)
optimizer.zero_grad()
loss.backward()
optimizer.step()
Thank you for any responses.
Instead of recreating new weight and bias by
self.fc.weight = nn.Parameter(torch.add(frozen.weight, self.dw))
self.fc.bias = nn.Parameter(torch.add(frozen.bias, self.db))
You can utilize nn.functional.linear and intermediate variables
weight = self.weight + frozen.weight
bias = self.bias + frozen.bias
F.linear(x, weight, bias)
Complete version:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self, frozen):
super(Net, self).__init__()
self.weight = nn.Parameter(torch.ones(10, 100, dtype=torch.float32))
self.bias = nn.Parameter(torch.zeros(10, dtype=torch.float32))
self.frozen = frozen
#property
def weight_bias(self):
weight = self.weight + self.frozen.weight
bias = self.bias + self.frozen.bias
return weight, bias
def forward(self, x):
# the weight (and the bias) of fc layer is from FW and D
weight, bias = self.weight_bias
return F.linear(x, weight, bias) # this should return raw logits as required by nn.CrossEntropyLoss
frozen = nn.Linear(100, 10)
frozen.weight.requires_grad = False
frozen.bias.requires_grad = False
model = Net(frozen)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
x = torch.rand(100).unsqueeze(0)
y = torch.tensor([0]*9+[1], dtype=torch.float32).unsqueeze(0)
for _ in range(10):
out = model(x)
loss = criterion(out, y)
print(loss)
optimizer.zero_grad()
loss.backward()
optimizer.step()

Nested optimization in pytorch

I wrote a short snippet to train a classification model, and learn the learning rate of its optimization algorithm. In my example I tried to update weights of a network in an inner optimization loop and to learn the learning rate of the weight updates using an outer optimization loop (meta-optimization). I'm getting the error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [3, 10]], which is output 0 of AsStridedBackward0, is at version 12; expected version 2 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
My code snippet is as following (NOTE: I'm using _stateless, an experimental functional API for nn. You need to run with the nightly build of pytorch.)
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import _stateless
class MyDataset(Dataset):
def __init__(self, N):
self.N = N
self.x = torch.rand(self.N, 10)
self.y = torch.randint(0, 3, (self.N,))
def __len__(self):
return self.N
def __getitem__(self, idx):
return self.x[idx], self.y[idx]
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.fc1 = nn.Linear(10, 10)
self.fc2 = nn.Linear(10, 3)
self.relu = nn.ReLU()
self.alpha = nn.Parameter(torch.randn(1))
self.beta = nn.Parameter(torch.randn(1))
def forward(self, x):
y = self.relu(self.fc1(x))
return self.fc2(y)
epochs = 20
N = 100
dataset = DataLoader(dataset=MyDataset(N), batch_size=10)
model = MyModel()
loss_func = nn.CrossEntropyLoss()
optim = optim.Adam([model.alpha], lr=1e-3)
params = dict(model.named_parameters())
for i in range(epochs):
model.train()
train_loss = 0
for batch_idx, (x, y) in enumerate(dataset):
logits = _stateless.functional_call(model, params, x) # predict
loss_inner = loss_func(logits, y) # loss
optim.zero_grad() # reset grad
loss_inner.backward(create_graph=True, inputs=params.values()) # compute grad
train_loss += loss_inner.item() # store loss
for k, p in params.items():
if k is not 'alpha' and k is not 'beta':
p.update = - model.alpha * p.grad
params[k] = p + p.update # update weight
print('Train Epoch: {}\tLoss: {:.6f}'.format(i, train_loss / N))
logits = _stateless.functional_call(model, params, x) # predict
loss_meta = loss_func(logits, y)
loss_meta.backward()
loss_meta.step()
From the error message, I understand that the issue comes from weight update for the weights of the second layer of the network, which points to an error in my inner loop optimization. Any suggestions would be appreciated.
Check this link and save PARAMs per each epoch and use same inner batch:
https://discuss.pytorch.org/t/issue-using-parameters-internal-method/134549/11
for i in range(epochs):
model.train()
train_loss = 0
params = dict(model.named_parameters()) # add this
for batch_idx, (x, y) in enumerate(dataset):
params = {k: v.clone() for k,v in params.items()} # add this
logits = _stateless.functional_call(model, params, x) # predict
loss_inner = loss_func(logits, y)
..................
You should be updating params[k].data instead of params[k]
(Deleted the example to avoid distraction)
Let me enter in a kind of fundamental discussion (not an answer to your question).
If I undertand correctly you want to compute loss(f(w[i], x)) , and computing the w[i+1,j] = w[i,j] + g(v[j], w[i,j].grad(w.r.t loss)) . Then in the end you want to compute v[j+1] = v[j] + v[j].grad(w.r.t loss).
The gradient of v[j] is computed using the backward propagation, as a function of grad w[i,j]. So what you are trying to do is to choose v[j] that results in a good w[i,j]. I would ask: why would you bother about v[j] if you can control w[i,j] directly? And that's what the standard approach.

How to fix 'No gradients provided for any variable' error when using ctc_loss in Tensorflow

I am trying to make Baidu's Deep Speech 2 model in Tensorflow 2.0.0alpha0. I am having trouble optimizing the Tensorflow ctc_loss using a tf.GradientTape() object for calculating the gradients.
I am currently passing a tensor of shape (batch_size, max_step, feats) to my model and then passing the computed logits to the loss function. I have also tried passing a sparse tensor but this also does not work.
Here is the code for creating my model
import tensorflow as tf
class DeepSpeech2(tf.keras.Model):
def __init__(self, vocab_size, conv_filters=[11], conv_kernel_sizes=[1280], conv_strides=[2],
recur_sizes=[100], rnn_type='gru', bidirect_rnn=False, batch_norm=True,
learning_rate=1e-3, name='DeepSpeech2'):
super(DeepSpeech2, self).__init__()
self._vocab_size = vocab_size
self._conv_filters = conv_filters
self._conv_kernel_sizes = conv_kernel_sizes
self._conv_strides = conv_strides
self._recur_sizes = recur_sizes
self._rnn_type = rnn_type
self._bidirect_rnn = bidirect_rnn
self._batch_norm = batch_norm
self._learning_rate = learning_rate
self._name = name
self._conv_batch_norm = None
with tf.name_scope(self._name):
self._convolution = [tf.keras.layers.Conv1D(filters=conv_filters[i],
kernel_size=conv_kernel_sizes[i], strides=conv_strides[i],
padding='valid', activation='relu',
name='conv1d_{}'.format(i)) for i in range(len(self._conv_filters))]
if self._batch_norm:
self._conv_batch_norm = tf.keras.layers.BatchNormalization(name='bn_conv_1d')
if self._rnn_type == 'gru':
rnn_init = tf.keras.layers.GRU
elif self._rnn_type == 'lstm':
rnn_init = tf.keras.layers.LSTM
else:
raise Exception("Invalid rnn_type: '{}' (must be 'lstm' or 'gru')"
.format(self._rnn_type))
self._rnn = []
for i, r in enumerate(self._recur_sizes):
layer = rnn_init(r, activation='relu', return_sequences=True,
name='{}_{}'.format(self._rnn_type, i))
if self._bidirect_rnn:
layer = tf.keras.layers.Bidirectional(layer)
self._rnn.append(layer)
if self._batch_norm:
self._rnn.append(tf.keras.layers.BatchNormalization())
self._fc = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(
self._vocab_size, name='fc', activation='linear'))
self._optimizer = tf.keras.optimizers.Adam(lr=self._learning_rate)
def __call__(self, specs):
with tf.name_scope(self._name):
feats = specs
for layer in self._convolution:
feats = layer(feats)
if self._conv_batch_norm:
feats = self._conv_batch_norm(feats)
rnn_outputs = feats
for layer in self._rnn:
rnn_outputs = layer(rnn_outputs)
outputs = self._fc(rnn_outputs)
return tf.transpose(outputs, (1, 0, 2))
#tf.function
def train_step(self, specs, spec_lengths, labels, label_lengths):
with tf.GradientTape() as tape:
logits = self.__call__(specs)
loss = tf.nn.ctc_loss(labels=labels, logits=logits,
label_length=label_lengths, logit_length=spec_lengths)
cost = tf.reduce_sum(loss)
decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(logits, label_lengths)
gradients = tape.gradient(cost, self.trainable_variables)
self._optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return (decoded[0].indices, decoded[0].values, decoded[0].dense_shape), cost
I am currently getting the following error
ValueError: No gradients provided for any variable: ['DeepSpeech2/conv1d_0/kernel:0', 'DeepSpeech2/conv1d_0/bias:0', 'DeepSpeech2/bn_conv_1d/gamma:0', 'DeepSpeech2/bn_conv_1d/beta:0', 'DeepSpeech2/gru_0/kernel:0', 'DeepSpeech2/gru_0/recurrent_kernel:0', 'DeepSpeech2/gru_0/bias:0', 'DeepSpeech2/batch_normalization_v2/gamma:0', 'DeepSpeech2/batch_normalization_v2/beta:0', 'DeepSpeech2/time_distributed/kernel:0', 'DeepSpeech2/time_distributed/bias:0'].
The error occurs at the line where the gradients are applied to the optimizer. When I print out my gradients variable, it is just a list of None
From what I understand, this error is indicating that there is no path from the variables to the loss in the graph but I'm not sure why I am getting this. Any help would be greatly appreciated!

Resources