Activation gradient penalty - pytorch

Here's a simple neural network, where I’m trying to penalize the norm of activation gradients:
class Net(nn.Module):
    """Small CNN that exposes two intermediate activations.

    After every forward pass the activations are stored on the module:

    * ``self.relu1`` - output of the first conv -> pool -> ReLU stage.
    * ``self.relu2`` - flattened output of the second stage, i.e. exactly the
      tensor that is fed to the final linear layer.

    ``retain_grad()`` is requested on both so that a later ``backward()``
    call also fills in their ``.grad`` attributes.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        # 64 * 5 * 5 matches a 3x32x32 input after two (conv5 -> pool2) stages.
        self.linear = nn.Linear(64 * 5 * 5, 10)

    def forward(self, input):
        """Return class scores of shape (batch, 10) for ``input``."""
        conv1 = self.conv1(input)
        pool1 = self.pool(conv1)
        self.relu1 = self.relu(pool1)
        # retain_grad() raises on tensors that do not require grad
        # (e.g. inside torch.no_grad()), so only request it when possible.
        if self.relu1.requires_grad:
            self.relu1.retain_grad()
        # Must consume self.relu1 (not an undefined local) so the stored
        # activation is part of the graph that produces the output.
        conv2 = self.conv2(self.relu1)
        pool2 = self.pool(conv2)
        relu2 = self.relu(pool2)
        self.relu2 = relu2.view(relu2.size(0), -1)
        if self.relu2.requires_grad:
            self.relu2.retain_grad()
        # The linear layer needs the flattened activation; feeding the stored
        # tensor also guarantees that self.relu2 receives a gradient.
        return self.linear(self.relu2)
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
for i in range(1000):
    output = model(input)
    loss = nn.CrossEntropyLoss()(output, label)
    optimizer.zero_grad()
    # First backward pass: accumulates d(loss)/d(.) into the .grad of the
    # parameters and of the activations marked with retain_grad().
    loss.backward(retain_graph=True)
    # Differentiable gradients of the loss w.r.t. the two stored activations.
    grads = torch.autograd.grad(
        loss,
        [model.relu1, model.relu2],
        create_graph=True,
    )
    # Squared L2 norm of all activation gradients (starts from the int 0).
    grad_norm = sum(grad.pow(2).sum() for grad in grads)
    # Second backward pass: its gradients are added on top of the ones
    # already accumulated by loss.backward() above.
    grad_norm.backward()
    optimizer.step()
However, it does not produce the desired regularization effect. If I do the same thing for weights (instead of activations), it works well. Am I doing this right (in terms of pytorch machinery)? Specifically, what happens in grad_norm.backward() call? I just want to make sure the weight gradients are updated, and not activation gradients. Currently, when I print out gradients for weights and activations immediately before and after that line, both change - so I’m not sure what’s going on.

I think your code ends up computing some of the gradients twice in each step. I also suspect it actually never zeroes out the activation gradients, so they accumulate across steps.
In general:
x.backward() computes gradient of x wrt. computation graph leaves (e.g. weight tensors and other variables), as well as wrt. nodes explicitly marked with retain_grad(). It accumulates the computed gradient in tensors' .grad attributes.
autograd.grad(x, [y, z]) returns gradient of x wrt. y and z regardless of whether they would normally retain grad or not. By default, it will also accumulate gradient in all leaves' .grad attributes. You can prevent this by passing only_inputs=True.
I prefer to use backward() only for the optimization step, and autograd.grad() whenever my goal is to obtain "reified" gradients as intermediate values for another computation. This way, I can be sure that no unwanted gradients remain lying around in tensors' .grad attributes after I'm done with them.
import torch
from torch import nn
class Net(nn.Module):
    """Two-stage CNN that keeps its post-ReLU activations as attributes.

    ``forward`` stores the first-stage activation in ``self.relu1`` and the
    (unflattened) second-stage activation in ``self.relu2`` so that callers
    can differentiate the loss with respect to them.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(64 * 5 * 5, 10)

    def forward(self, input):
        """Return class scores of shape (batch, 10)."""
        # Stage 1: conv -> max-pool -> ReLU.
        self.relu1 = self.relu(self.pool(self.conv1(input)))
        # Stage 2: same pattern, fed by the stored stage-1 activation.
        self.relu2 = self.relu(self.pool(self.conv2(self.relu1)))
        # Flatten everything except the batch axis for the classifier.
        flattened = self.relu2.view(self.relu2.size(0), -1)
        return self.linear(flattened)
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
grad_penalty_weight = 10.
for i in range(1000000):
    # Random input and labels; we're not really learning anything
    input = torch.rand(1, 3, 32, 32)
    label = torch.randint(0, 10, (1,))
    output = model(input)
    loss = nn.CrossEntropyLoss()(output, label)
    # Differentiable activation gradients (create_graph=True lets the penalty
    # itself be back-propagated later).
    # only_inputs is optional here, since optimizer.zero_grad() is called
    # before the real backward pass; it just makes explicit that *only* the
    # activation gradients are wanted at this point.
    grads = torch.autograd.grad(
        loss,
        [model.relu1, model.relu2],
        create_graph=True,
        only_inputs=True,
    )
    # Squared L2 norm over both activation gradients.
    grad_norm = sum(grad.pow(2).sum() for grad in grads)
    optimizer.zero_grad()
    # Single backward pass over the task loss plus the weighted penalty.
    loss = loss + grad_norm * grad_penalty_weight
    loss.backward()
    optimizer.step()
This code appears to work, in that the activation gradients do get smaller.
I cannot comment on the viability of this technique as a regularization method.

Related

How to increase PyTorch's AI's accuracy in image classifier?

I am trying to build a powerful image classifier.
But I have an issue. I use the CIFAR-100 dataset, and I trained a model on it.
The issue is that only 15% of the classifications are correct.
I tried continuing the training process, but after 2-3 attempts the model has not changed.
Code that I used for training:
import torch
import sys,os
import torchvision
import torchvision.transforms as transforms
# Convert images to tensors, then normalise every RGB channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
batch_size = 4
trainset = torchvision.datasets.CIFAR100(
    root='./dataone', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR100(
    root='./dataone', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2)
# The network predicts the 100 fine-grained CIFAR-100 labels, so the label
# names must come from the dataset itself; a hand-written list of the 20
# superclasses would not line up with the label indices.
classes = tuple(trainset.classes)
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """LeNet-style CNN producing 100 class scores for 3x32x32 images."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 100)

    def forward(self, x):
        """Return raw (un-normalised) class scores of shape (batch, 100)."""
        # Two conv -> ReLU -> max-pool feature stages.
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Flatten all dimensions except batch.
        flat = torch.flatten(features, 1)
        # Three-layer classifier head; the last layer has no activation.
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
import torch.optim as optim
# Single checkpoint file used both for resuming and for saving.
PATH = "./model.pt"
net = Net()
# Backward-compatible alias: the checkpoint must be restored into the very
# network that is trained, otherwise resuming silently starts from scratch.
model = net
criterion = nn.CrossEntropyLoss()
# The optimizer is created exactly once, *before* its state is restored, so
# the restored momentum buffers are not thrown away afterwards.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
start_epoch = 0
LOSS = None
print(os.path.exists(PATH))
if os.path.exists(PATH):
    checkpoint = torch.load(PATH)
    net.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    LOSS = checkpoint['loss']
    print("using checkpoint")
net.train()
EPOCHS_PER_RUN = 2
# Continue the epoch count from the checkpoint so repeated runs keep training.
for epoch in range(start_epoch, start_epoch + EPOCHS_PER_RUN):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics every 2000 mini-batches instead of once per batch
        LOSS = loss.item()
        running_loss += LOSS
        if i % 2000 == 1999:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
print('Finished Training')
# Store the real progress rather than placeholder values.
EPOCH = start_epoch + EPOCHS_PER_RUN
torch.save({
    'epoch': EPOCH,
    'model_state_dict': net.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': LOSS,
}, PATH)
It's based on the PyTorch tutorial about image classifiers, which can be found [here](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).
I took code for resuming training from [here.](https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html)
Code that I used for testing model:
import torch
import torchvision
import torchvision.transforms as transforms
# Same preprocessing as during training: tensor conversion followed by
# per-channel normalisation with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
batch_size = 4
# train=True so that the objects named "train*" really hold the training
# split rather than a second copy of the test split.
trainset = torchvision.datasets.CIFAR100(
    root='./dataone', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2)
testset = torchvision.datasets.CIFAR100(
    root='./dataone', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2)
# Names of the 100 fine-grained labels the network predicts, taken from the
# dataset so that indices and names are guaranteed to line up.
classes = tuple(testset.classes)
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    """LeNet-style CNN producing 100 class scores for 3x32x32 images.

    Layer names are identical to the training-time definition so that a
    saved ``state_dict`` can be loaded into this class.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 100)

    def forward(self, x):
        """Return raw (un-normalised) class scores of shape (batch, 100)."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        # Flatten all dimensions except batch.
        flat = torch.flatten(stage2, 1)
        hidden1 = F.relu(self.fc1(flat))
        hidden2 = F.relu(self.fc2(hidden1))
        return self.fc3(hidden2)
net = Net()
# Load the checkpoint written by the training script: it is a dict saved at
# './model.pt' whose weights live under 'model_state_dict'. Loading another,
# stale file would evaluate a model that never received the new training.
PATH = './model.pt'
checkpoint = torch.load(PATH)
net.load_state_dict(checkpoint['model_state_dict'])
net.eval()
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for images, labels in testloader:
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest score is what we choose as prediction
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(correct)
print(total)
# Report the number of images actually evaluated instead of a fixed figure.
print(f'Accuracy of the network on the {total} test images: {100 * correct / total:.2f} %')
It's from the same image classifier tutorial by PyTorch. I added printing total and correct detected images for testing.
How can I increase accuracy, so it will be at least around 50-70%?
Or is this normal, and it means that these 15% are incorrect?
Please help.
Have you tried increasing the number of epochs? Training usually requires hundreds to thousands of iterations to obtain good results.
You could also improve the architecture by continuing the convolutional layers until you are left with a 1×1×N image where N is the number of filters in the final convolution. Then flatten and add linear layer(s). Batch Normalization and LeakyReLU activation before pooling layers may also help. Finally, you should use Softmax activation on the output since you are dealing with a classifier.
I highly recommend looking into popular classifiers such as VGG and ResNet. ResNet in particular has a feature called "residual/skip connections" that passes a copy of the output of a layer forward down the line to compensate for feature loss.
Could you provide accuracies and loss plots so we can understand better what is happening in the training (or maybe the list of accuracies and losses during training).
Also, it is a good practice to compute the validation accuracy and loss after every epoch to monitor the behaviour of the network on unseen data.
Although, as it has been said by Xynias, there are some improvements you could do on your architecture I believe the first step would be to investigate from the accuracies and losses.
Given that CIFAR-100 has 100 classes, this is to be expected. You'll need a reasonably complex network to perform well on this task. Definitely more feature maps, starting with 64 or more channels.
This Q&D architecture surpasses 50% overall accuracy after 10 epochs or so (using learning rate of 0.1 and batch size of 256, I also added RandomHorizontalFlip() transform):
class Net(nn.Module):
    """VGG-like CNN: two double-conv stages followed by a linear classifier.

    The final ``nn.Linear(16384, 100)`` corresponds to 256 channels on an
    8x8 map, i.e. a 32x32 input after two 2x2 average-pooling steps.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: two 3x3 convolutions at 128 channels, then downsample.
        stage1 = [
            nn.Conv2d(3, 128, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2, 2),
        ]
        # Stage 2: two 3x3 convolutions at 256 channels, then downsample.
        stage2 = [
            nn.Conv2d(128, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2, 2),
        ]
        # Classifier head with dropout regularisation.
        head = [
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(16384, 100),
        ]
        self.layers = nn.Sequential(*stage1, *stage2, *head)

    def forward(self, x):
        """Return class scores of shape (batch, 100)."""
        return self.layers(x)
For a better result you may try implementing something ResNet-like, or utilize a premade (and possibly pretrained) model, for example, using timm:
import timm
net = timm.create_model('resnet18d', pretrained=True, num_classes=100)
It achieves your target metrics pretty fast with the same parameters as above.

CNN-LSTM for image sequences classification | high loss

I'm working on a project where I need to classify image sequences of some plants (growing over time). I tried implementing a CNN-LSTM with a pretrained ResNet18 as a feature extractor and then feeding those feature sequences to the LSTM.
The issue is that I'm not used to training LSTMs, and I'm afraid I'm doing something wrong. I made a clear architecture and everything seems OK, but the loss is not decreasing.
here's the architecture:
class RecurrentCNN(nn.Module):
    """CNN-LSTM classifier for image sequences.

    Every frame is embedded by a ResNet-18 whose final layer is replaced by
    a linear projection to ``embed_dim``; the sequence of embeddings is fed
    to an LSTM and the output at the last time step is classified.

    Args:
        embed_dim: size of the per-frame embedding produced by the CNN.
        hidden_size: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output classes.
    """

    def __init__(self, embed_dim, hidden_size, num_layers, num_classes):
        super(RecurrentCNN, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.cnn = torchvision.models.resnet18(weights='DEFAULT')
        # Replace the ImageNet classifier by an embedding projection.
        self.cnn.fc = nn.Sequential(
            nn.Linear(in_features=512, out_features=self.embed_dim, bias=False),
            nn.BatchNorm1d(num_features=self.embed_dim)
        )
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=hidden_size),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        """Classify a batch of sequences.

        ``x`` is indexed as (batch, sequence, *image_dims); the result has
        shape (batch, num_classes).
        """
        batch_size, img_size = x.shape[0], x.shape[2:]
        # Merge batch and sequence axes so the CNN sees one frame per row.
        x = x.reshape(-1, *img_size)
        x = self.cnn(x)
        # Restore the (batch, sequence, embed_dim) layout for the LSTM.
        x = x.reshape(batch_size, -1, self.embed_dim)
        # nn.LSTM starts from all-zero hidden and cell states when none are
        # given, created on the input's device and dtype. This avoids both
        # the deprecated Variable wrapper and a dependency on a global
        # `device`.
        x, _ = self.lstm(x)
        # Keep only the output of the last time step.
        x = x[:, -1, :]
        return self.fc(x)
I have 40 classes to output. My sequences are of different lengths, so I was forced to pad with some black images sometimes! (mean seq length: 39, max: 55, min: 15)
I'm feeding the model with sequences of shape (batch_size, seq_len=55, 3, 112, 112).
It may be wrong but for now I just want to make sure that the model is at least working correctly, then I'll probably change the strategy of learning.
here's the training code:
EPOCHS = 10
BATCH_SIZE = 4
dataset = PlantDataset(data_path, max_sequence_len=55, transform=None)
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True
)
rcnn = RecurrentCNN(embed_dim=128, hidden_size=256, num_layers=2, num_classes=len(class_list)).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(rcnn.parameters(), lr=0.0001)
loss_am = list()  # per-batch loss history
rcnn.train()
for epoch in range(EPOCHS):
    # One progress-bar tick per batch, so the total is the number of batches.
    progress = tqdm(total=len(train_loader))
    for sequences, targets in train_loader:
        optimizer.zero_grad()
        sequences, targets = sequences.to(device, dtype=torch.float), torch.Tensor(targets).to(device)
        # CrossEntropyLoss applies log-softmax itself, so the raw scores are
        # passed straight through (an extra log_softmax would be redundant).
        output = rcnn(sequences)
        loss_value = criterion(output, targets)
        loss_value.backward()
        optimizer.step()
        loss_am.append(loss_value.item())
        # update() takes an increment, not an absolute position.
        progress.update(1)
        progress.set_description('Epoch: {}, Loss: {:.4f}'.format(epoch, loss_value.item()))
    progress.close()
The loss on each batch goes like
3.53 => 4.22 => 4.62 => 3.83 => 3.75 => 3.80 => 3.70, etc
Do you have any idea ?
I am facing the same issue. But I am able to find the problem. Since I am using the Image-sequences dataset, my model is not able to predict the tokens, instead, I ended up with a whole set of garbage tokens. I am still trying to figure out why this is happening.

Expected input batch_size (18) to match target batch_size (6)

Is RNN for image classification available only for gray image?
The following program works for gray image classification.
If RGB images are used, I have this error:
Expected input batch_size (18) to match target batch_size (6)
at this line loss = criterion(outputs, labels).
My data loading for train, valid and test are as follows.
input_size = 300
inputH = 300
inputW = 300

# Data transform (normalization & data augmentation).
# Per-channel (mean, std) pairs used by every pipeline below.
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_resize_tfms = tt.Compose([
    tt.Resize((inputH, inputW), interpolation=2),
    tt.ToTensor(),
    tt.Normalize(*stats),
])
# Only the training pipeline adds augmentation (random horizontal flip).
train_tfms = tt.Compose([
    tt.Resize((inputH, inputW), interpolation=2),
    tt.RandomHorizontalFlip(),
    tt.ToTensor(),
    tt.Normalize(*stats),
])
valid_tfms = tt.Compose([
    tt.Resize((inputH, inputW), interpolation=2),
    tt.ToTensor(),
    tt.Normalize(*stats),
])
test_tfms = tt.Compose([
    tt.Resize((inputH, inputW), interpolation=2),
    tt.ToTensor(),
    tt.Normalize(*stats),
])

# Create one dataset per split from its image folder.
train_ds = ImageFolder('./data/train', train_tfms)
valid_ds = ImageFolder('./data/valid', valid_tfms)
test_ds = ImageFolder('./data/test', test_tfms)
from torch.utils.data.dataloader import DataLoader
batch_size = 6
# Training data loader
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                      num_workers=8, pin_memory=True)
# Validation data loader
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True,
                      num_workers=8, pin_memory=True)
# Test data loader: one image at a time, in dataset order
test_dl = DataLoader(test_ds, batch_size=1, shuffle=False,
                     num_workers=1, pin_memory=True)
My model is as follows.
num_steps = 300
hidden_size = 256 #size of hidden layers
num_classes = 5
num_epochs = 20
learning_rate = 0.001
# Recurrent network configuration: number of stacked RNN layers
num_layers = 2 # 2 RNN layers are stacked
class RNN(nn.Module):
    """Stacked Elman RNN followed by a linear classifier.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        num_classes: number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # batch_first=True: inputs are (batch_size, seq, input_size).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        # The classifier consumes the RNN's last hidden output.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch_size, num_classes)."""
        # Initial hidden state: (num_layers, batch_size, hidden_size).
        # new_zeros() creates it on the same device and dtype as the input,
        # so the module does not depend on a global `device`.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        # out holds the last layer's output for every time step:
        # (batch_size, seq, hidden_size). The final hidden state is unused.
        out, _ = self.rnn(x, h0)
        # Decode only the last time step: (batch_size, hidden_size).
        out = out[:, -1, :]
        out = self.fc(out)
        return out
stacked_rnn_model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # cross entropy has softmax at output
# optimizer = torch.optim.Adam(stacked_rnn_model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(stacked_rnn_model.parameters(), lr=learning_rate)

# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
    t_losses = []
    for i, (images, labels) in enumerate(train_dl):
        # The first axis is inferred (-1) from num_steps and input_size.
        # NOTE: for the RGB batches printed below this yields
        # [18, 300, 300], i.e. three rows per image, while `labels` still
        # has one entry per image.
        images = images.reshape(-1, num_steps, input_size).to(device)
        print('images shape')
        print(images.shape)
        labels = labels.to(device)

        # Forward pass
        outputs = stacked_rnn_model(images)
        print('outputs shape')
        print(outputs.shape)
        loss = criterion(outputs, labels)
        t_losses.append(loss)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Printing images and outputs shapes are
images shape
torch.Size([18, 300, 300])
outputs shape
torch.Size([18, 5])
Where is the mistake?
Tl;dr: You are flattening the first two axes, namely batch and channels.
I am not sure you are taking the right approach, but I will write about that later.
In any case, let's look at the issue you are facing. You have a data loader that produces (6, 3, 300, 300), i.e. batches of 6 three-channel 300x300 images. By the look of it you are looking to reshape each batch element (3, 300, 300) into (step_size=300, -1).
However, instead of that you are affecting the first axis - which you shouldn't - with images.reshape(-1, num_steps, input_size). This will have the desired effect when working with single-channel images, since dim=1 wouldn't be the "channel axis". In your case you have 3 channels; therefore, the resulting shape is (6*3*300*300//300//300, 300, 300), which is (18, 300, 300) since num_steps=300 and input_size=300. As a result you are left with 18 batch elements instead of 6.
Instead, what you want is to reshape with (batch_size, num_steps, -1), leaving the last axis (the per-step feature size, i.e. input_size) to be inferred. This will result in a shape of (6, 300, 900).
Here is a corrected and reduced snippet:
batch_size = 6
channels = 3
inputH, inputW = 300, 300
train_ds = TensorDataset(torch.rand(100, 3, inputH, inputW), torch.rand(100, 5))
train_dl = DataLoader(train_ds, batch_size)
class RNN(nn.Module):
    """Stacked RNN whose last time-step output is mapped to class scores."""

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        # Input layout: (batch_size, seq, input_size).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # Maps (batch_size, hidden_size) to (batch_size, num_classes).
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch_size, num_classes)."""
        # No initial state is passed, so the RNN uses its default.
        per_step_output, _ = self.rnn(x)
        last_step = per_step_output[:, -1, :]
        return self.fc(last_step)
num_steps = 300
# Features per time step once a whole (channels, H, W) image is spread over
# num_steps steps: 3 * 300 * 300 // 300 = 900.
input_size = inputH*inputW*channels//num_steps
hidden_size = 256
num_classes = 5
num_layers = 2
rnn = RNN(input_size, hidden_size, num_layers, num_classes)
for x, y in train_dl:
    print(x.shape, y.shape)
    # Reshape the batch that was just loaded (`x`). Using x.size(0) instead
    # of the nominal batch_size keeps the batch axis intact even for a
    # smaller final batch.
    images = x.reshape(x.size(0), num_steps, -1)
    print(images.shape)
    outputs = rnn(images)
    print(outputs.shape)
    break
As I said in the beginning, I am a bit wary about this approach because you are essentially feeding your RNN an RGB 300x300 image in the form of a sequence of 300 flattened vectors... I can't say if that makes sense in terms of training, or if the model will be able to learn from that. I could be wrong!

Measuring uncertainty using MC Dropout on pytorch

I am trying to implement Bayesian CNN using Mc Dropout on Pytorch,
the main idea is that by applying dropout at test time and running over many forward passes , you get predictions from a variety of different models.
I’ve found an application of the Mc Dropout and I really did not get how they applied this method and how exactly they did choose the correct prediction from the list of predictions
here is the code
def mcdropout_test(model, loader=None, num_passes=100):
    """Evaluate ``model`` with Monte Carlo dropout.

    The model is put in training mode so dropout stays active, every batch
    is passed through it ``num_passes`` times, and the outputs are averaged.
    The prediction for a sample is the arg-max of that averaged output.

    Args:
        model: network to evaluate.
        loader: data loader yielding ``(data, target)`` batches and exposing
            ``.dataset``; defaults to the module-level ``test_loader``.
        num_passes: number of stochastic forward passes per batch.

    Returns:
        ``(average_loss, correct)`` - mean negative log-likelihood per
        sample and the number of correctly classified samples.
    """
    if loader is None:
        loader = test_loader
    # Train mode keeps dropout stochastic at test time.
    model.train()
    # Run on whatever device the model lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    test_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            # (num_passes, batch, classes) -> mean over the passes.
            output_mean = torch.stack(
                [model(data) for _ in range(num_passes)], dim=0).mean(0)
            # sum up batch loss
            test_loss += F.nll_loss(F.log_softmax(output_mean, dim=1), target,
                                    reduction='sum').item()
            # index of the highest averaged score
            pred = output_mean.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(loader.dataset)
    print('\nMC Dropout Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(loader.dataset),
        100. * correct / len(loader.dataset)))
    return test_loss, correct
train()
# mcdropout_test requires the network to evaluate as its argument.
# NOTE(review): assumes the trained network is bound to `model` - confirm.
mcdropout_test(model)
I have replaced
data, target = Variable(data, volatile=True), Variable(target)
by adding
with torch.no_grad(): at the beginning
And this is how I have defined my CNN
class Net(nn.Module):
    """CNN with dropout after every layer except the first dense one.

    All convolutional and linear layers use Xavier-uniform weights and
    zero biases.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 192, 5, padding=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(192, 192, 5, padding=2)
        self.fc1 = nn.Linear(192 * 8 * 8, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 10)
        self.dropout = nn.Dropout(p=0.3)
        # Same initialisation for every trainable layer, in definition order.
        for layer in (self.conv1, self.conv2, self.fc1, self.fc2, self.fc3):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, x):
        """Return class scores of shape (batch, 10)."""
        # Conv stages: conv -> dropout -> ReLU -> max-pool.
        stage1 = self.pool(F.relu(self.dropout(self.conv1(x))))
        stage2 = self.pool(F.relu(self.dropout(self.conv2(stage1))))
        flat = stage2.view(-1, 192 * 8 * 8)
        hidden1 = F.relu(self.fc1(flat))
        hidden2 = F.relu(self.fc2(self.dropout(hidden1)))
        # No activation function needed for the last layer.
        return self.fc3(self.dropout(hidden2))
Can anyone help me to get the right implementation of the Monte Carlo Dropout method on CNN?
Implementing MC Dropout in PyTorch is easy. All that needs to be done is to set the dropout layers of your model to train mode. This allows different dropout masks to be used during the various forward passes. Below is an implementation of MC Dropout in PyTorch illustrating how multiple predictions from the various forward passes are stacked together and used for computing different uncertainty metrics.
import sys
import numpy as np
import torch
import torch.nn as nn
def enable_dropout(model):
    """Put the dropout layers of ``model`` into train mode (test-time dropout).

    Only modules whose class name starts with ``'Dropout'`` are switched;
    every other module keeps its current mode.
    """
    for m in model.modules():
        if m.__class__.__name__.startswith('Dropout'):
            m.train()


def get_monte_carlo_predictions(data_loader,
                                forward_passes,
                                model,
                                n_classes,
                                n_samples):
    """Run Monte Carlo dropout and compute uncertainty estimates.

    Parameters
    ----------
    data_loader : iterable
        yields ``(image, label)`` batches; must return the samples in the
        same order on every pass (i.e. no shuffling)
    forward_passes : int
        number of monte-carlo samples/forward passes
    model : torch.nn.Module
        network whose dropout layers are sampled
    n_classes : int
        number of classes in the dataset
    n_samples : int
        number of samples in the test set

    Returns
    -------
    tuple of np.ndarray
        ``(mean, variance, entropy, mutual_info)`` with shapes
        ``(n_samples, n_classes)``, ``(n_samples, n_classes)``,
        ``(n_samples,)`` and ``(n_samples,)``.
    """
    softmax = nn.Softmax(dim=1)
    # Run on the device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Everything in eval mode except dropout; this does not change between
    # passes, so it is done once.
    model.eval()
    enable_dropout(model)

    per_pass = []
    with torch.no_grad():
        for _ in range(forward_passes):
            # float64 keeps the tiny epsilon used below representable.
            batches = [softmax(model(image.to(device))).double().cpu().numpy()
                       for image, _ in data_loader]
            if batches:
                per_pass.append(np.concatenate(batches, axis=0))
            else:
                per_pass.append(np.empty((0, n_classes)))

    # dropout predictions - shape (forward_passes, n_samples, n_classes)
    if per_pass:
        dropout_predictions = np.stack(per_pass, axis=0)
    else:
        dropout_predictions = np.empty((0, n_samples, n_classes))

    # Mean and variance across the MCD forward passes.
    mean = np.mean(dropout_predictions, axis=0)  # shape (n_samples, n_classes)
    variance = np.var(dropout_predictions, axis=0)  # shape (n_samples, n_classes)

    # Smallest positive float: avoids log(0) without changing the result.
    epsilon = sys.float_info.min
    # Entropy of the mean prediction.
    entropy = -np.sum(mean * np.log(mean + epsilon), axis=-1)  # shape (n_samples,)
    # Mutual information: entropy of the mean minus the mean per-pass entropy.
    per_pass_entropy = np.sum(
        -dropout_predictions * np.log(dropout_predictions + epsilon), axis=-1)
    mutual_info = entropy - np.mean(per_pass_entropy, axis=0)  # shape (n_samples,)

    return mean, variance, entropy, mutual_info
Moving on to the implementation which is posted in the question above, multiple predictions from T different forward passes are obtained by first setting the model to train mode (model.train()). Note that this is not desirable because unwanted stochasticity will be introduced in the predictions if there are layers other than dropout such as batch-norm in the model. Hence the best way is to just set the dropout layers to train mode as shown in the snippet above.

Why are Pytorch and Keras implementations giving vastly different results?

I am trying to train a 1-D ConvNet for time series classification as shown in this paper (refer to the FCN in Fig. 1b): https://arxiv.org/pdf/1611.06455.pdf
The Keras implementation is giving me vastly superior performance. Could someone explain why is that the case?
The code for Pytorch is as follow:
class Net(torch.nn.Module):
    """Fully convolutional 1-D network for time-series classification.

    Three Conv1d -> BatchNorm1d -> ReLU blocks, global average pooling over
    the length axis and a linear classifier.

    Args:
        in_channels: number of input channels; defaults to
            ``x_train.shape[1]`` from the enclosing script.
        num_classes: number of output classes; defaults to the script's
            ``nb_classes``.
    """

    def __init__(self, in_channels=None, num_classes=None):
        super(Net, self).__init__()
        # Fall back to the script-level globals to stay backward compatible.
        if in_channels is None:
            in_channels = x_train.shape[1]
        if num_classes is None:
            num_classes = nb_classes
        self.conv1 = nn.Conv1d(in_channels, 128, 8)
        self.bnorm1 = nn.BatchNorm1d(128)
        self.conv2 = nn.Conv1d(128, 256, 5)
        self.bnorm2 = nn.BatchNorm1d(256)
        self.conv3 = nn.Conv1d(256, 128, 3)
        self.bnorm3 = nn.BatchNorm1d(128)
        self.dense = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw class scores (logits) of shape (batch, num_classes).

        No softmax is applied here: ``nn.CrossEntropyLoss`` performs the
        log-softmax itself, so feeding it probabilities would apply softmax
        twice and flatten the gradients. Apply ``F.softmax(out, dim=1)``
        explicitly when probabilities are needed.
        """
        b1 = F.relu(self.bnorm1(self.conv1(x)))
        b2 = F.relu(self.bnorm2(self.conv2(b1)))
        b3 = F.relu(self.bnorm3(self.conv3(b2)))
        # Global average pooling over the length axis.
        output = torch.mean(b3, 2)
        return self.dense(output)
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.5, momentum=0.99)
losses = []
# Full-batch training: every step uses the whole training set.
for t in range(1000):
    # Forward pass on float inputs; the criterion needs integer class labels.
    y_pred_1 = model(x_train.float())
    loss_1 = criterion(y_pred_1, y_train.long())
    print(t, loss_1.item())
    # Clear old gradients, back-propagate, then update the weights.
    optimizer.zero_grad()
    loss_1.backward()
    optimizer.step()
For comparison, I use the following code for Keras:
# Input has the shape of one training sample (batch axis excluded).
x = keras.layers.Input(x_train.shape[1:])

# Block 1: 128 filters, kernel size 8.
conv1 = keras.layers.Conv1D(128, 8, padding='valid')(x)
conv1 = keras.layers.BatchNormalization()(conv1)
conv1 = keras.layers.Activation('relu')(conv1)

# Block 2: 256 filters, kernel size 5.
conv2 = keras.layers.Conv1D(256, 5, padding='valid')(conv1)
conv2 = keras.layers.BatchNormalization()(conv2)
conv2 = keras.layers.Activation('relu')(conv2)

# Block 3: 128 filters, kernel size 3.
conv3 = keras.layers.Conv1D(128, 3, padding='valid')(conv2)
conv3 = keras.layers.BatchNormalization()(conv3)
conv3 = keras.layers.Activation('relu')(conv3)

# Global average pooling followed by the softmax classifier.
full = keras.layers.GlobalAveragePooling1D()(conv3)
out = keras.layers.Dense(nb_classes, activation='softmax')(full)

model = keras.models.Model(inputs=x, outputs=out)
optimizer = keras.optimizers.SGD(lr=0.5, decay=0.0, momentum=0.99)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# Full-batch training: the batch size equals the number of training samples.
hist = model.fit(x_train, Y_train, batch_size=x_train.shape[0], nb_epoch=2000)
The only difference I see between the two is the initialization but however, the results are just vastly different. For reference, I use the same preprocessing as follows for both the datasets, with a subtle difference in input shapes, for Pytorch (Batch_Size, Channels, Length) and for Keras: (Batch_Size, Length, Channels).
The results differ because the layers and the optimizer have different default parameters in the two libraries. For example, in PyTorch the decay rate of batch norm is 0.9, whereas in Keras it is 0.99. There may be other differences in the default parameters as well.
If you use same parameters and fixed random seed for initialization, there won't be much difference in the result for both library.

Resources