I use PyTorch for training neural networks. When saving a model via its state_dict, only the weights are saved; the activation functions are not captured. If I then reload the saved weights into a model whose activation functions have been changed, load_state_dict does not raise an error, and the network (obviously) produces incorrect outputs. Is there a way to save the structure of the neural network along with the weights? An MWE is presented below.
import torch
from torch import nn

class Test(nn.Module):
    def __init__(self):
        super(Test, self).__init__()
        self.fc1 = nn.Linear(10, 25)
        self.fc2 = nn.Linear(25, 10)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, inputs):
        return self.tanh(self.fc2(self.relu(self.fc1(inputs))))
To save:

test = Test().float()
torch.save(test.state_dict(), "test.pt")
To load:

import torch
from torch import nn

class Test1(nn.Module):
    def __init__(self):
        super(Test1, self).__init__()
        self.fc1 = nn.Linear(10, 25)
        self.fc2 = nn.Linear(25, 10)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, inputs):
        return self.relu(self.fc2(self.tanh(self.fc1(inputs))))

test1 = Test1().float()
test1.load_state_dict(torch.load("test.pt"))  # Loads without error, even though the activation functions tanh and relu are interchanged, so the network outputs incorrect values.
Is there a way to also capture the activation functions when saving? Thanks.
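For reference, two standard options, sketched minimally against the Test model above: torch.save can pickle the entire module (weights and structure, though the class must still be importable at load time), and TorchScript serializes the architecture itself, so loading needs no Python class definition at all.

import torch

# Option 1: pickle the whole module, not just the state_dict
torch.save(test, "test_full.pt")
test_loaded = torch.load("test_full.pt")  # recent PyTorch may require weights_only=False

# Option 2: TorchScript stores structure + weights in one archive
scripted = torch.jit.script(test)
scripted.save("test_scripted.pt")
test_scripted = torch.jit.load("test_scripted.pt")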
Here is a simple example that results in an in-place operation error.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim

torch.autograd.set_detect_anomaly(True)

class Loss(nn.Module):
    def __init__(self):
        super(Loss, self).__init__()

    def forward(self, x, target):
        return x[0, 0, 0, 0]

def block(in_channels, features, name):
    return nn.Conv2d(in_channels=in_channels,
                     out_channels=features,
                     kernel_size=3,
                     padding=1,
                     bias=False)
class SharedNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.shared_layer = block(in_channels=3, features=1, name="wow")

    def forward(self, x):
        x = self.shared_layer(x)
        return x

class Network1(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-1")

    def forward(self, x):
        return self.conv(x)

class Network2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-2")

    def forward(self, x):
        return torch.sigmoid(self.conv(x))
shared_net = SharedNetwork()
net_1 = Network1()
segmentor = Network2()
optimizer = optim.Adam(list(shared_net.parameters()) + list(segmentor.parameters()), lr=1e-6)
optimizer_conf = optim.Adam(list(shared_net.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
optimizer.zero_grad()
optimizer_conf.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
optimizer.step()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
Error message:
UserWarning: Error detected in ConvolutionBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3, 3, 3]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I was able to solve the problem by changing the order in which the optimizers' step functions are run.
optimizer_conf.zero_grad()
optimizer.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
optimizer.step()
The following questions, however, remain:
How does the step method cause an inplace operation in convolution?
Why does moving the steps to the end of the file resolve this error?
NOTE: The loss function is used for simplicity, using dice-loss also results in the same error!
Before answering the questions, I should mention that having multiple optimizers for one set of parameters seems to be an anti-pattern and is better avoided.
How does the step method cause an inplace operation in convolution?
A: The step method updates the weights in place using their gradients; conceptually it does something like
param -= lr * param.grad
which is an in-place operation on parameter tensors that the autograd graph may still need for a pending backward pass.
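To see the mechanism in isolation, here is a minimal sketch (independent of the example above): autograd keeps a version counter on every tensor it saves for backward, and an in-place update between the forward and backward passes invalidates the saved tensor.

import torch

w = torch.ones(3, requires_grad=True)
y = (w * w).sum()   # the backward of w * w needs the saved value of w
print(w._version)   # 0 (_version is an internal counter, shown for illustration)
with torch.no_grad():
    w += 1          # in-place update, just like optimizer.step()
print(w._version)   # 1: the version counter has advanced
y.backward()        # RuntimeError: ... modified by an inplace operation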
Why does moving the steps to the end of the file resolve this error?
A: By moving the step calls after the second backward call, the in-place parameter update no longer happens between the forward pass and that backward pass. As a result, no tensor needed for gradient computation has been modified in place, and no error is raised.
To sum up, it is best to have one optimizer per set of parameters; the previous example could be coded in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim

torch.autograd.set_detect_anomaly(True)

class Loss(nn.Module):
    def __init__(self):
        super(Loss, self).__init__()

    def forward(self, x, target):
        return x[0, 0, 0, 0]

def block(in_channels, features, name):
    return nn.Conv2d(in_channels=in_channels,
                     out_channels=features,
                     kernel_size=(3, 3),
                     padding=1,
                     bias=False)

class SharedNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.shared_layer = block(in_channels=3, features=1, name="wow")

    def forward(self, x):
        x = self.shared_layer(x)
        return x

class Network1(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-1")

    def forward(self, x):
        return self.conv(x)

class Network2(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-2")

    def forward(self, x):
        return torch.sigmoid(self.conv(x))

torch.manual_seed(0)
shared_net = SharedNetwork()
net_1 = Network1()
net_2 = Network2()

shared_optimizer = optim.Adam(list(shared_net.parameters()), lr=1e-6)
net_1_optimizer = optim.Adam(list(net_1.parameters()), lr=1e-6)
net_2_optimizer = optim.Adam(list(net_2.parameters()), lr=1e-6)

loss_fn = Loss()

# 2. Run a forward pass
fake_data = torch.randint(0, 255, (1, 3, 256, 256)) / 255
target_data_1 = torch.randint(0, 255, (1, 3, 256, 256)) / 255
target_data_2 = torch.randint(0, 255, (1, 3, 256, 256)) / 255

net_2_optimizer.zero_grad()
features = shared_net(fake_data)
net_2_out = net_2(features)
s_loss = loss_fn(net_2_out, target_data_2)
s_loss.backward(retain_graph=True)
net_2_optimizer.step()

net_1_optimizer.zero_grad()
shared_optimizer.zero_grad()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
net_1_optimizer.step()
shared_optimizer.step()
Note: If you want two different effective learning rates for different losses applied to one set of parameters, you can scale each loss by a weight reflecting its importance. For example, you can multiply loss_1 by 0.1 and loss_2 by 0.5. Alternatively, you can use backward hooks, as mentioned in this comment:
backward-hook
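Continuing with the variables from the corrected example above, here is a minimal sketch of the loss-weighting idea (the 0.1 and 0.5 weights are arbitrary, for illustration):

# One backward pass over a weighted sum of losses; the shared parameters
# accumulate the correspondingly weighted gradients.
features = shared_net(fake_data)
loss_1 = loss_fn(net_1(features), target_data_1)
loss_2 = loss_fn(net_2(features), target_data_2)
total_loss = 0.1 * loss_1 + 0.5 * loss_2
total_loss.backward()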
Say I import these libraries:

import torch.nn as nn
import torch.optim as optim          # optimizers (provides step)
import torch.nn.functional as F      # functional activations, e.g. ReLU
Suppose I need to construct a logistic regression model with more than one output unit (since I want to do multi-label classification).
I found two options (a pair of nn.Module and criterion) for constructing it, and I am confused about which set to choose:
# SET 1
class LogisticReg(nn.Module):
    def __init__(self, input_size, num_classes):  # sequence of layers (768, 50, 10)
        super(LogisticReg, self).__init__()
        self.fc1 = nn.Linear(input_size, num_classes)  # 768 * 1024

    def forward(self, x):
        x = self.fc1(x)
        x = F.sigmoid(x)
        return x

criterion = nn.BCELoss()
# SET 2
class LogisticReg(nn.Module):
    def __init__(self, input_size, num_classes):  # sequence of layers (768, 50, 10)
        super(LogisticReg, self).__init__()
        self.fc1 = nn.Linear(input_size, num_classes)  # 768 * 1024

    def forward(self, x):
        x = self.fc1(x)
        return x

criterion = nn.BCEWithLogitsLoss()
The only difference is that when the nn.Module for logistic regression does not apply a sigmoid activation in forward, it must be paired with nn.BCEWithLogitsLoss instead of nn.BCELoss, since its output has not been passed through a sigmoid yet.
Hence, I have two questions:
Are these two sets exactly equivalent?
If yes, what is the point of PyTorch having both the BCELoss and BCEWithLogitsLoss criteria?
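For what it's worth, a quick numerical check (a minimal sketch with made-up shapes): the two formulations compute the same loss value; nn.BCEWithLogitsLoss simply folds the sigmoid into the loss in a more numerically stable way.

import torch
from torch import nn

torch.manual_seed(0)
logits = torch.randn(4, 10)                     # raw outputs of nn.Linear
targets = torch.randint(0, 2, (4, 10)).float()  # multi-label targets

loss_set1 = nn.BCELoss()(torch.sigmoid(logits), targets)
loss_set2 = nn.BCEWithLogitsLoss()(logits, targets)
print(torch.allclose(loss_set1, loss_set2))     # True: same loss value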
I am getting the following error: RuntimeError: the derivative for 'target' is not implemented
I did have a look at similar posts; however, they are different from my problem. I'm trying to code an autoencoder from scratch. Here's my code:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch import nn
from torchviz import make_dot

trainset = torchvision.datasets.FashionMNIST(root='./data', train=True,
                                             download=True)
data = trainset.data.float()
# print(trainset.data.shape)
# plt.imshow(trainset.data[9])
# plt.show()
device = "cuda" if torch.cuda.is_available() else "cpu"
data = data.to(device)
print(f"Using device = {device}")

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.encode = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 30),
            nn.ReLU()
        )
        self.decode = nn.Sequential(
            nn.Linear(30, 512),
            nn.ReLU(),
            nn.Linear(512, 28*28),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        encoded = self.encode(x)
        decoded = self.decode(encoded)
        return decoded

model = NeuralNetwork().to(device)
# print(model)
lossFn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(data)
    loss = lossFn(outputs, outputs)
    loss.backward()
    optimizer.step()

model.forward(data)
You are getting this error because you are calling a function that expects its argument not to require gradient computation. More specifically, nn.BCELoss expects the target (2nd argument) not to require gradients.
You can fix this by detaching the argument from the graph:
lossFn(outputs, outputs.detach())
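A standalone sketch of the same fix, using toy tensors rather than the model above:

import torch
from torch import nn

pred = torch.rand(4, requires_grad=True)  # stands in for the model output
loss_fn = nn.BCELoss()

# loss_fn(pred, pred)                # raises: the derivative for 'target' is not implemented
loss = loss_fn(pred, pred.detach())  # the detached target no longer tracks gradients
loss.backward()                      # backpropagates through the prediction only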
I have two neural networks, Child1 and Child2. The output of Child1 is fed to Child2, and the output of Child2 is fed back into Child1. Each one has its respective cost function. I have created a Parent module to pack both modules into a single module, because I have a requirement that the backpropagation (derivative calculation) of each network's loss function be done with respect to the combined weights of both networks (Child1 + Child2). Could I do it this way? Could I backpropagate through my Parent module and get the gradient of each loss function with respect to the combined weights?
class Parent(nn.Module):
    def __init__(self, in_features, z_dim):
        super().__init__()
        self.my_child1 = Child1(z_dim)
        self.my_child2 = Child2(1, in_features)  # Child1 ends in nn.Linear(128, 1), so Child2's input size here is 1

    def forward(self, input):
        input = self.my_child1(input)
        input = self.my_child2(input)
        return input

class Child1(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.child1 = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.LeakyReLU(0.01),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.child1(x)

class Child2(nn.Module):
    def __init__(self, z_dim, img_dim):
        super().__init__()
        self.child2 = nn.Sequential(
            nn.Linear(z_dim, 256),
            nn.LeakyReLU(0.01),
            nn.Linear(256, img_dim),
            nn.Tanh(),
        )

    def forward(self, x):
        return self.child2(x)
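For what it's worth, yes: a single backward pass through Parent populates gradients on the weights of both children. A minimal sketch, continuing from the classes above (the sizes in_features=8 and z_dim=16 are arbitrary, for illustration):

import torch

torch.manual_seed(0)
parent = Parent(in_features=8, z_dim=16)
x = torch.randn(4, 16)   # a batch of inputs with z_dim features

out = parent(x)
out.sum().backward()     # a stand-in scalar loss on Parent's output

# Both children's parameters now carry gradients:
print(parent.my_child1.child1[0].weight.grad is not None)  # True
print(parent.my_child2.child2[0].weight.grad is not None)  # True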
In this notebook https://nbviewer.jupyter.org/github/krasserm/bayesian-machine-learning/blob/master/bayesian_neural_networks.ipynb, the author defines the function
def mixture_prior_params(sigma_1, sigma_2, pi):
    params = K.variable([sigma_1, sigma_2, pi], name='mixture_prior_params')
    sigma = np.sqrt(pi * sigma_1 ** 2 + (1 - pi) * sigma_2 ** 2)
    return params, sigma
which creates a variable and returns a tuple. This function is then called
prior_params, prior_sigma = mixture_prior_params(sigma_1=1.0, sigma_2=0.1, pi=0.2)
Then, in the class DenseVariational, which is a custom layer, in the method build, the prior_params global variable is added to the private list _trainable_weights
def build(self, input_shape):
    self._trainable_weights.append(prior_params)
    ...
Why would one need or want to do this? If I attempt to print the trainable parameters of either the custom layer or a model made of this custom layer, for example
# Create the model with DenseVariational layers
model = Model(x_in, x_out)
print("model.trainable_weights =", model.trainable_weights)
I can see that each DenseVariational layer contains a mixture_prior_params trainable parameter. Why should one declare mixture_prior_params, more specifically, sigma_1, sigma_2 and pi, outside of the layer, if they are trainable parameters of the layer?
After looking at the question Can I share weights between keras layers but have other parameters differ? and its answer (https://stackoverflow.com/a/45258859/3924118), and after printing the values of the model's trainable variables once the model has been trained, it seems like this is a way of sharing a variable across different layers: the value of that variable appears to be equal across layers after training.
I have created a simple example (with TensorFlow 2.0.0 and Keras 2.3.1) that shows this
import numpy as np
from keras import activations, initializers
from keras import backend as K
from keras import optimizers
from keras.layers import Input
from keras.layers import Layer
from keras.models import Model

shared_variable = K.variable([0.3], name='my_shared_variable')

class MyLayer(Layer):
    def __init__(self, output_dim, activation=None, **kwargs):
        self.output_dim = output_dim
        self.activation = activations.get(activation)
        super().__init__(**kwargs)

    def build(self, input_shape):
        self._trainable_weights.append(shared_variable)
        self.my_weight = self.add_weight(name='my_weight',
                                         shape=(input_shape[1], self.output_dim),
                                         initializer=initializers.normal(),
                                         trainable=True)
        super().build(input_shape)

    def call(self, x):
        return self.activation(K.dot(x, self.my_weight * shared_variable))

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.output_dim

if __name__ == "__main__":
    # Define the architecture of the model.
    x_in = Input(shape=(1,))
    h1 = MyLayer(20, activation='relu')(x_in)
    h2 = MyLayer(20, activation='relu')(h1)
    x_out = MyLayer(1)(h2)

    model = Model(x_in, x_out)

    print("h1.trainable_weights (before training) =", model.layers[1].trainable_weights[0])
    print("h2.trainable_weights (before training) =", model.layers[2].trainable_weights[0])

    # Prepare the model for training.
    model.compile(loss="mse", optimizer=optimizers.Adam(lr=0.03))

    # Generate dataset.
    X = np.linspace(-0.5, 0.5, 100).reshape(-1, 1)
    y = 10 * np.sin(2 * np.pi * X)

    # Train the model.
    model.fit(X, y, batch_size=1, epochs=100, verbose=0)

    print("h1.trainable_weights (after training) =", model.layers[1].trainable_weights[0])
    print("h2.trainable_weights (after training) =", model.layers[2].trainable_weights[0])
The output is
h1.trainable_weights (before training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.3], dtype=float32)>
h2.trainable_weights (before training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.3], dtype=float32)>
h1.trainable_weights (after training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.7049409], dtype=float32)>
h2.trainable_weights (after training) = <tf.Variable 'my_shared_variable:0' shape=(1,) dtype=float32, numpy=array([0.7049409], dtype=float32)>