I use PyTorch for training neural networks. When saving the model, only the weights of the network are saved; the activation functions are not captured. If I then reload the model from the saved weights with the activation functions changed, the load still does not throw an error, and the network (obviously) outputs incorrect values. Is there a way to save the structure of the neural network along with the weights? An MWE is presented below.
import torch
from torch import nn
class Test(nn.Module):
    def __init__(self):
        super(Test, self).__init__()
        self.fc1 = nn.Linear(10, 25)
        self.fc2 = nn.Linear(25, 10)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, inputs):
        return self.tanh(self.fc2(self.relu(self.fc1(inputs))))
To save:
test = Test().float()
torch.save(test.state_dict(), "test.pt")
To load:
import torch
from torch import nn
class Test1(nn.Module):
    def __init__(self):
        super(Test1, self).__init__()
        self.fc1 = nn.Linear(10, 25)
        self.fc2 = nn.Linear(25, 10)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, inputs):
        return self.relu(self.fc2(self.tanh(self.fc1(inputs))))
test1 = Test1().float()
test1.load_state_dict(torch.load("test.pt"))  # Loads without error. However, the activation functions tanh and relu are interchanged, so the network outputs incorrect values.
Is there a way to also capture the activation functions, while saving? Thanks.
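For reference, one common way to capture the structure as well (a sketch using TorchScript, which serializes the forward logic, including the activation functions, together with the weights):

import torch

# Script the model: this records layers, forward logic and activations,
# not just the parameter tensors.
scripted = torch.jit.script(Test().float())
scripted.save("test_scripted.pt")

# The model can later be restored without redefining the class at all,
# so a mismatched re-definition such as Test1 cannot silently load.
restored = torch.jit.load("test_scripted.pt")
out = restored(torch.randn(1, 10))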
I am getting the following error: RuntimeError: the derivative for 'target' is not implemented
I did have a look at similar posts; however, they are different from my problem. I'm trying to code an autoencoder from scratch. Here's my code:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch import nn
from torchviz import make_dot
trainset = torchvision.datasets.FashionMNIST(root='./data', train=True,
                                             download=True)
data = trainset.data.float()
# print(trainset.data.shape)
# plt.imshow(trainset.data[9])
# plt.show()
device = "cuda" if torch.cuda.is_available() else "cpu"
data = data.to(device)
print(f"Using device = {device}")
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.encode = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 30),
            nn.ReLU()
        )
        self.decode = nn.Sequential(
            nn.Linear(30, 512),
            nn.ReLU(),
            nn.Linear(512, 28*28),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        encoded = self.encode(x)
        decoded = self.decode(encoded)
        return decoded
model = NeuralNetwork().to(device)
# print(model)
lossFn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3)
for epoch in range(1000):
    optimizer.zero_grad()
    outputs = model(data)
    loss = lossFn(outputs, outputs)  # the target here is the model output itself
    loss.backward()
    optimizer.step()
model.forward(data)
You're getting this error because you are calling a function that expects its argument not to require gradient computation. More specifically, nn.BCELoss expects the target (second argument) not to require gradient.
You can fix this by detaching the argument from the graph:
lossFn(outputs, outputs.detach())
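As a quick illustration of what detach does (a minimal sketch, independent of the model above):

import torch

x = torch.randn(3, requires_grad=True)
y = x * 2
print(y.requires_grad)            # True: y is part of the autograd graph
print(y.detach().requires_grad)   # False: detached from the graph, valid as a BCELoss target

(Note that for an autoencoder the target would more typically be the input itself, e.g. the flattened image scaled to [0, 1], which does not require grad in the first place.)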
I am experimenting with different convolutional autoencoder architectures now, and I have decided to try a pretrained ResNet50 network as the encoder in my model. I tried two options: using the encoder with its weights frozen, and using the pretrained weights only as initialization. I got better reconstruction results when training the ResNet weights, but it still cannot outperform my basic CAE with 3 conv layers in the encoder and 3 upsample + conv2d layers in the decoder. Can you help me with an explanation of this effect? I am training my network on different domains of the OfficeHome dataset. Here is my architecture:
import torch
import torch.nn as nn
from torchvision import models

class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.relu = nn.LeakyReLU(inplace=True)
        self.norm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        x = self.norm(x)
        return x
class ResnetConvAutoencoder(nn.Module):
    def __init__(self):
        super(ResnetConvAutoencoder, self).__init__()
        resnet = models.resnet50(pretrained=True, progress=True)
        # for param in resnet.parameters():
        #     param.requires_grad = False
        self.encoder = torch.nn.Sequential(*(list(resnet.children())[:-2]))  # output is 2048 x 8 x 8
        del resnet
        self.decoder = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(2048, 512),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(512, 256),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(256, 128),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(128, 16),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(16, 3),
            nn.Sigmoid()
        )  # output is 3 x 256 x 256 (as input)
        self.encoder.cuda(2)  # training on several GPUs
        self.decoder.cuda(1)

    def forward(self, x):
        x = x.cuda(2)
        x = self.encoder(x).cuda(1)
        x = self.decoder(x).cuda(1)
        return x
And here are some example images from my simple CAE and the CAE with the ResNet encoder. The train and test loss of the CAE with ResNet-50 is about 0.03 and does not go lower, while the loss of my simple CAE is below 0.015 (MSE).
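As a sanity check on the encoder (a minimal sketch, assuming the 3 x 256 x 256 inputs described above), the truncated ResNet-50 can be verified to produce the 2048 x 8 x 8 feature map the decoder expects:

import torch
from torchvision import models

resnet = models.resnet50(pretrained=True)
encoder = torch.nn.Sequential(*list(resnet.children())[:-2])  # drop avgpool and fc

with torch.no_grad():
    feats = encoder(torch.randn(1, 3, 256, 256))
print(feats.shape)  # torch.Size([1, 2048, 8, 8]): ResNet-50 downsamples by 32, and 256 / 32 = 8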
I want to square the result of a maxpool layer.
I tried the following:
class CNNClassifier(Classifier):  # nn.Module
    def __init__(self, in_channels):
        super().__init__()
        self.save_hyperparameters('in_channels')
        self.cnn = nn.Sequential(
            # maxpool
            nn.MaxPool2d((1, 5), stride=(1, 5)),
            torch.square(),
            # layer1
            nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=5),
        )
Which, to an experienced PyTorch user, surely makes no sense.
Indeed, the error is quite clear:
TypeError: square() missing 1 required positional arguments: "input"
How can I feed in to square the tensor from the preceding layer?
You can't put a bare PyTorch function in an nn.Sequential pipeline; every element needs to be an nn.Module.
You could wrap it like this:
class Square(nn.Module):
    def forward(self, x):
        return torch.square(x)
Then use it inside your sequential layer like so:
class CNNClassifier(Classifier):  # nn.Module
    def __init__(self, in_channels):
        super().__init__()
        self.save_hyperparameters('in_channels')
        self.cnn = nn.Sequential(
            nn.MaxPool2d((1, 5), stride=(1, 5)),
            Square(),
            nn.Conv2d(in_channels=in_channels, out_channels=32, kernel_size=5))
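If this pattern comes up often, a small generic wrapper avoids writing one class per function (a sketch; Lambda here is a hypothetical helper, not part of torch.nn):

class Lambda(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        return self.fn(x)

# e.g. nn.Sequential(nn.MaxPool2d((1, 5), stride=(1, 5)), Lambda(torch.square), ...)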
Trying to translate a simple LSTM model from Keras to PyTorch. The Keras model converges after just 200 epochs, while the PyTorch model:
needs many more epochs to reach the same loss level (200 vs. ~8000)
seems to overfit the inputs because the predicted value is not near 100
This is the Keras code:
from numpy import array
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
X = array([10,20,30,20,30,40,30,40,50,40,50,60,50,60,70,60,70,80]).reshape((6,3,1))
y = array([40,50,60,70,80,90])
model = Sequential()
model.add(LSTM(50, activation='relu', recurrent_activation='sigmoid', input_shape=(3, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
model.fit(X, y, epochs=200, verbose=1)
x_input = array([70, 80, 90]).reshape((1, 3, 1))
yhat = model.predict(x_input, verbose=0)
print(yhat)
And this is the equivalent PyTorch code:
from numpy import array
import torch
import torch.nn as nn
import torch.nn.functional as F
X = torch.tensor([10,20,30,20,30,40,30,40,50,40,50,60,50,60,70,60,70,80]).float().reshape(6,3,1)
y = torch.tensor([40,50,60,70,80,90]).float().reshape(6,1)
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=50, num_layers=1, batch_first=True)
        self.fc = nn.Linear(50, 1)

    def forward(self, x):
        batches = x.size(0)
        h0 = torch.zeros([1, batches, 50])
        c0 = torch.zeros([1, batches, 50])
        (x, _) = self.lstm(x, (h0, c0))
        x = x[:,-1,:]  # Keep only the output of the last iteration. Before shape (6,3,50), after shape (6,50)
        x = F.relu(x)
        x = self.fc(x)
        return x
model = Model()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
n_epochs = 8000
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    y_ = model(X)
    loss = criterion(y_, y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{n_epochs}, loss = {loss.item()}")
model.eval()
x_input = torch.tensor([70, 80, 90]).float().reshape((1, 3, 1))
yhat = model(x_input)
print(yhat)
The only possible difference is in the initial weight and bias values, but I don't think that slightly different weights and biases could account for such a big difference in behavior.
What am I missing in the PyTorch code?
The behaviour difference is because of the activation function in the LSTM API. By changing the activation to tanh, I can reproduce the problem in Keras too.
model.add(LSTM(50, activation='tanh', recurrent_activation='sigmoid', input_shape=(3, 1)))
There is no option to change the activation function to 'relu' in the PyTorch LSTM API.
https://pytorch.org/docs/stable/nn.html#lstm
Taking the LSTM implementation from https://github.com/huggingface/torchMoji/blob/master/torchmoji/lstm.py and changing hardsigmoid/tanh to sigmoid/relu, the model converges in PyTorch as well.
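For reference, here is the core of that change as a minimal sketch (one hand-rolled LSTM step, not the torchMoji code itself): the standard cell applies tanh to the cell input and to the output, and those two calls are what get replaced by relu.

import torch
import torch.nn as nn

class ReLULSTMCell(nn.Module):
    """A single LSTM step with relu in place of tanh (sketch)."""
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.ih = nn.Linear(input_size, 4 * hidden_size)
        self.hh = nn.Linear(hidden_size, 4 * hidden_size)

    def forward(self, x, state):
        h, c = state
        i, f, g, o = (self.ih(x) + self.hh(h)).chunk(4, dim=1)
        i, f, o = torch.sigmoid(i), torch.sigmoid(f), torch.sigmoid(o)
        g = torch.relu(g)        # tanh -> relu on the cell input
        c = f * c + i * g
        h = o * torch.relu(c)    # tanh -> relu on the output
        return h, c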
I think the problem is that you are re-initializing h0 and c0 on every forward pass, although they are only required at the start. It is better to use the modified code below. You can go through this link for RNNs in PyTorch: https://pytorch.org/docs/stable/nn.html?highlight=rnn#torch.nn.RNN (note that nn.RNN, unlike nn.LSTM, accepts nonlinearity="relu").
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.rnn = nn.RNN(input_size=1, hidden_size=50, num_layers=1, nonlinearity="relu", batch_first=True)
        self.fc = nn.Linear(50, 1)

    def forward(self, x):
        # batches = x.size(0)
        # h0 = torch.zeros([1, batches, 50])
        # c0 = torch.zeros([1, batches, 50])
        # (x, _) = self.lstm(x, (h0, c0))
        (x, _) = self.rnn(x)
        x = x[:,-1,:]  # Keep only the output of the last iteration. Before shape (6,3,50), after shape (6,50)
        x = F.relu(x)
        x = self.fc(x)
        return x
This gives a good prediction result within 2500 epochs. I want to know why you have written the line of code below and what its purpose is, so that I can try to make it better.
x = x[:,-1,:] # Keep only the output of the last iteration. Before shape (6,3,50), after shape (6,50)