PyTorch CNN: get the label for a single image

I'm stuck on a function that is supposed to predict the label of a single image. I need this to work on a single image because I want to build a web app where the user can upload an image and get its prediction.
My CNN uses the following base class:
class ImageClassificationBase(nn.Module):
    def training_step(self, batch):
        images, labels = batch
        out = self(images)                   # Generate predictions
        loss = F.cross_entropy(out, labels)  # Calculate loss
        return loss

    def validation_step(self, batch):
        images, labels = batch
        out = self(images)                   # Generate predictions
        loss = F.cross_entropy(out, labels)  # Calculate loss
        acc = accuracy(out, labels)          # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}

    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()  # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()     # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_acc']))
and the model itself:
class BrainTumorClassification(ImageClassificationBase):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Flatten(),
            nn.Linear(82944, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 6))

    def forward(self, xb):
        return self.network(xb)
The function I'm trying to implement for testing a single image is the following:
from torch.autograd import Variable

transformer = transforms.Compose([
    transforms.Resize((150, 150)), transforms.ToTensor()])

def classify(image_path, image_transforms, classes):
    image = Image.open(image_path)
    image_tensor = image_transforms(image).float()
    image_tensor = image_tensor.unsqueeze_(0)
    input = Variable(image_tensor)
    output = model(input)
    index = output.data.numpy().argmax()
    pred = classes[index]
    return pred
I'm getting an error:
`pred=classes[index]` index out of range
I should mention that classes has 4 elements: ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor'].

A few points to note:
- Don't forget to load your trained weights onto your initialized model.
- Variable has been deprecated; you should not use it. Gradients are tracked on tensors that have the requires_grad flag set. Here you are only running inference, so you can use the torch.no_grad context to avoid retaining parameter activations. This will increase inference speed.
- With torch.Tensor.unsqueeze_ you don't have to reassign the result, since the tensor itself is modified by the call. As a general note, all torch.Tensor functions with a _ suffix are in-place operators.
- Most of all, you mentioned only having 4 classes, yet your last fully connected layer outputs 6 logits. You need to change that layer to output 4.
Here is a possible modification:
transformer = transforms.Compose([transforms.Resize((150, 150)),
                                  transforms.ToTensor()])

@torch.no_grad()
def classify(image_path, image_transforms, classes):
    image = Image.open(image_path)
    image_tensor = image_transforms(image)
    image_tensor.unsqueeze_(0)
    output = model(image_tensor)
    index = output.data.numpy().argmax()
    pred = classes[index]
    return pred
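A minimal usage sketch of the corrected function (the checkpoint file name here is hypothetical; adjust the path, and make sure the model class now ends in nn.Linear(512, 4) to match your 4 classes):

```
classes = ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']

model = BrainTumorClassification()     # with the final layer changed to nn.Linear(512, 4)
model.load_state_dict(torch.load('brain_tumor_model.pth', map_location='cpu'))  # hypothetical file name
model.eval()                           # inference mode: freezes dropout/batch-norm behaviour

print(classify('uploaded_image.jpg', transformer, classes))
```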

Related

How to increase the accuracy of a PyTorch image classifier?

I am trying to build a powerful image classifier, but I have an issue. I use the CIFAR-100 dataset and trained a model on it, yet only about 15% of the classifications are correct.
I tried continuing the training process, but after 2-3 attempts the model has not improved.
Code that I used for training:
import torch
import sys, os
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

trainset = torchvision.datasets.CIFAR100(root='./dataone', train=True,
                                         download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR100(root='./dataone', train=False,
                                        download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('aquatic mammals', 'fish', 'flowers', 'food containers', 'fruit and vegetables',
           'household electrical devices', 'household furniture', 'insects', 'large carnivores',
           'large man-made outdoor things', 'large natural outdoor scenes',
           'large omnivores and herbivores', 'medium-sized mammals', 'non-insect invertebrates',
           'people', 'reptiles', 'small mammals', 'trees', 'vehicles 1', 'vehicles 2')

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 100)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

import torch.optim as optim

PATH = "./model.pt"
model = Net()
net = Net()
print(os.path.exists(PATH))
if os.path.exists(PATH):
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    checkpoint = torch.load(PATH)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    print("using checkpoint")
    #model.eval()
    # - or -
    model.train()

#criterion = nn.CrossEntropyLoss()
#optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

for epoch in range(2):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print("training..")

        # print statistics
        #running_loss += loss.item()
        #if i % 2000 == 1999:    # print every 2000 mini-batches
        #    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
        #    running_loss = 0.0

print('Finished Training')

#PATH = './cifar_net.pth'
#torch.save(net.state_dict(), PATH)
EPOCH = 5
LOSS = 0.4

torch.save({
    'epoch': EPOCH,
    'model_state_dict': net.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': LOSS,
}, PATH)
It's based on the PyTorch tutorial about image classifiers, which can be found [here](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).
I took the code for resuming training from [here](https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html).
Code that I used for testing model:
import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

trainset = torchvision.datasets.CIFAR100(root='./dataone', train=False,
                                         download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR100(root='./dataone', train=False,
                                        download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('aquatic mammals', 'fish', 'flowers', 'food containers', 'fruit and vegetables',
           'household electrical devices', 'household furniture', 'insects', 'large carnivores',
           'large man-made outdoor things', 'large natural outdoor scenes',
           'large omnivores and herbivores', 'medium-sized mammals', 'non-insect invertebrates',
           'people', 'reptiles', 'small mammals', 'trees', 'vehicles 1', 'vehicles 2')

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 100)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
PATH = './cifar_net.pth'
net.load_state_dict(torch.load(PATH))

correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct)
print(total)
print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
It's from the same PyTorch image classifier tutorial. I added printing of the total and of the correctly detected images for testing.
How can I increase the accuracy so that it is at least around 50-70%?
Or is this normal, and roughly 15% correct is all this model can achieve?
Please help.
Have you tried increasing the number of epochs? Training usually requires hundreds to thousands of iterations to obtain good results.
You could also improve the architecture by continuing the convolutional layers until you are left with a 1×1×N image where N is the number of filters in the final convolution. Then flatten and add linear layer(s). Batch Normalization and LeakyReLU activation before pooling layers may also help. Finally, you should use Softmax activation on the output since you are dealing with a classifier.
I highly recommend looking into popular classifiers such as VGG and ResNet. ResNet in particular has a feature called "residual/skip connections" that passes a copy of the output of a layer forward down the line to compensate for feature loss.
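As a rough, minimal sketch of that skip-connection idea (not the exact ResNet block, just an illustration that also uses the Batch Normalization and LeakyReLU suggestions above):

```
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Output = activation(conv path + input), so features can bypass the convolutions."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)
        self.act = nn.LeakyReLU(0.1)

    def forward(self, x):
        out = self.act(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.act(out + x)   # skip connection: add the input back in
```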
Could you provide accuracy and loss plots so we can better understand what is happening during training (or maybe the list of accuracies and losses recorded during training)?
Also, it is good practice to compute the validation accuracy and loss after every epoch to monitor the behaviour of the network on unseen data (see the sketch below).
Although, as Xynias said, there are improvements you could make to your architecture, I believe the first step would be to investigate the accuracies and losses.
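A minimal sketch of such a per-epoch validation loop, assuming the net, criterion, optimizer, trainloader, and testloader defined in the question:

```
num_epochs = 10  # for example

for epoch in range(num_epochs):
    net.train()
    for inputs, labels in trainloader:
        optimizer.zero_grad()
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

    # evaluate on unseen data after every epoch
    net.eval()
    correct, total, val_loss = 0, 0, 0.0
    with torch.no_grad():
        for images, labels in testloader:
            outputs = net(images)
            val_loss += criterion(outputs, labels).item() * labels.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)
    print(f"epoch {epoch}: val_loss={val_loss / total:.3f}, val_acc={correct / total:.2%}")
```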
Given that CIFAR-100 has 100 classes, this is to be expected. You'll need a reasonably complex network to perform well on this task: definitely more feature maps, starting with 64 or more channels.
This quick-and-dirty architecture surpasses 50% overall accuracy after 10 epochs or so (using a learning rate of 0.1 and a batch size of 256; I also added a RandomHorizontalFlip() transform):
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Conv2d(3, 128, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2, 2),
            nn.Conv2d(128, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.AvgPool2d(2, 2),
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(16384, 100),
        )

    def forward(self, x):
        return self.layers(x)
For a better result you may try implementing something ResNet-like, or utilize a premade (and possibly pretrained) model, for example, using timm:
import timm
net = timm.create_model('resnet18d', pretrained=True, num_classes=100)
It achieves your target metrics pretty fast with the same parameters as above.
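For concreteness, here is a sketch of that setup; the learning rate, batch size, and RandomHorizontalFlip follow the values quoted above, while the momentum value and the rest of the loop are my assumptions:

```
import timm
import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),                  # extra augmentation mentioned above
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
trainset = torchvision.datasets.CIFAR100(root='./dataone', train=True,
                                         download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=256,   # batch size 256
                                          shuffle=True, num_workers=2)

net = timm.create_model('resnet18d', pretrained=True, num_classes=100)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)   # lr 0.1; momentum assumed

# ...then train and validate with the usual loop from the question.
```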

KL Divergence goes NaN on Bayesian Convolutional Neural Network

I'm trying to implement a Bayesian Convolutional Neural Network using PyTorch on Python 3.7. I mainly orient myself on Shridhar's implementation. When running my CNN with normalized MNIST data, the KL divergence becomes NaN after a couple of iterations. I already implemented linear layers the same way, and they worked perfectly fine.
I normalized the data as follows:
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=BATCH_SIZE, shuffle=True, **LOADER_KWARGS)

eval_loader = torch.utils.data.DataLoader(
    datasets.MNIST('./mnist', train=False, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])),
    batch_size=EVAL_BATCH_SIZE, shuffle=False, **LOADER_KWARGS)
My implementation of the Conv-Layer looks as follows:
class BayesianConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, prior_sigma, kernel_size,
                 stride=1, padding=0, dilation=1, groups=1):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.normal = torch.distributions.Normal(0, 1)

        # conv-parameters
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups

        # Weight parameters
        self.weight_mu = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size).uniform_(0, 0.1))
        self.weight_rho = nn.Parameter(torch.Tensor(out_channels, in_channels, *self.kernel_size).uniform_(-3, 0.1))
        self.weight_sigma = 0
        self.weight = 0

        # Bias parameters
        self.bias_mu = nn.Parameter(torch.Tensor(out_channels).uniform_(0, 0.1))
        self.bias_rho = nn.Parameter(torch.Tensor(out_channels).uniform_(-3, 0.1))
        self.bias_sigma = 0
        self.bias = 0

        # prior
        self.prior_sigma = prior_sigma

    def forward(self, input, sample=False, calculate_log_probs=False):
        # compute sigma out of rho: sigma = log(1 + e^rho)
        self.weight_sigma = torch.log1p(torch.exp(self.weight_rho))
        self.bias_sigma = torch.log1p(torch.exp(self.bias_rho))

        # sampling process -> use local reparameterization trick
        activations_mu = F.conv2d(input.to(DEVICE), self.weight_mu, self.bias_mu,
                                  self.stride, self.padding, self.dilation, self.groups)
        activations_sigma = torch.sqrt(1e-16 + F.conv2d((input ** 2).to(DEVICE), self.weight_sigma ** 2,
                                                        self.bias_sigma ** 2, self.stride, self.padding,
                                                        self.dilation, self.groups))
        activation_epsilon = Variable(self.weight_mu.data.new(activations_sigma.size()).normal_(mean=0, std=1))

        outputs = activations_mu + activations_sigma * activation_epsilon

        if self.training or calculate_log_probs:
            self.kl_div = 0.5 * ((2 * torch.log(self.prior_sigma / self.weight_sigma) - 1
                                  + (self.weight_sigma / self.prior_sigma).pow(2)
                                  + ((0 - self.weight_mu) / self.prior_sigma).pow(2)).sum()
                                 + (2 * torch.log(0.1 / self.bias_sigma) - 1
                                    + (self.bias_sigma / 0.1).pow(2)
                                    + ((0 - self.bias_mu) / 0.1).pow(2)).sum())

        return outputs
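For reference, each element's contribution to kl_div above is the closed-form KL divergence between the variational posterior N(mu, sigma^2) and the zero-mean Gaussian prior N(0, sigma_p^2), with sigma_p = prior_sigma for the weights and 0.1 for the biases, summed over all parameters:

$$
\mathrm{KL}\big(\mathcal{N}(\mu,\sigma^2)\,\|\,\mathcal{N}(0,\sigma_p^2)\big)
= \log\frac{\sigma_p}{\sigma} + \frac{\sigma^2 + \mu^2}{2\sigma_p^2} - \frac{1}{2}
$$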
The implementation of the corresponding Conv-Net looks as follows:
class BayesianConvNetwork(nn.Module):
    # Set up the network by defining its layers
    def __init__(self):
        super().__init__()
        self.conv1 = layers.BayesianConv2d(1, 24, prior_sigma=0.1, kernel_size=(5, 5), padding=2)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.conv2 = layers.BayesianConv2d(24, 48, prior_sigma=0.1, kernel_size=(5, 5), padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.conv3 = layers.BayesianConv2d(48, 64, prior_sigma=0.1, kernel_size=(5, 5), padding=2)
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.fcl1 = layers.BayesianLinearWithLocalReparamTrick(4 * 4 * 64, 256, prior_sigma=0.1)
        self.fcl2 = layers.BayesianLinearWithLocalReparamTrick(256, 10, prior_sigma=0.1)

    # define the forward function by assigning the corresponding activation functions to the layers
    def forward(self, x, sample=False):
        x = F.relu(self.conv1(x, sample))
        x = self.pool1(x)
        x = F.relu(self.conv2(x, sample))
        x = self.pool2(x)
        x = F.relu(self.conv3(x, sample))
        x = self.pool3(x)
        x = x.view(-1, 4 * 4 * 64)
        x = F.relu(self.fcl1(x, sample))
        x = F.log_softmax(self.fcl2(x, sample), dim=1)
        return x

    # sum up the KL divergences to obtain the overall KL divergence value
    def total_kl_div(self):
        return (self.conv1.kl_div + self.conv2.kl_div + self.conv3.kl_div
                + self.fcl1.kl_div + self.fcl2.kl_div)

    # sampling prediction: perform a prediction for each of the "different networks"
    # that result from the weight distributions
    def sample_elbo(self, input, target, batch_idx, nmbr_batches, samples=SAMPLES):
        outputs = torch.zeros(samples, target.shape[0], CLASSES).to(DEVICE)
        kl_divs = torch.zeros(samples).to(DEVICE)
        for i in range(samples):                    # sample through networks
            outputs[i] = self(input, sample=True)   # perform prediction
            kl_divs[i] = self.total_kl_div()        # calculate total kl_div of the network
        kl_div = kl_divs.mean()                     # compute mean kl_div from all samples
        negative_log_likelihood = F.nll_loss(outputs.mean(0), target, size_average=False)
        loss = kl_weighting * kl_div + negative_log_likelihood
        return loss
Has anyone faced the same issue or knows how to solve it?
Many thanks in advance!
I figured out that it appears to be an issue with the SGD optimizer. Using Adam as the optimizer solved the problem, though I don't know the reason for that. If anyone has an answer as to why it works with Adam but not with SGD, feel free to comment.
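In other words, a swap along these lines (the learning rate here is only a placeholder, and model stands for the BayesianConvNetwork instance):

```
import torch.optim as optim

# optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)   # KL went NaN with SGD
optimizer = optim.Adam(model.parameters(), lr=1e-3)                  # trained without NaN
```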

Expected input batch_size (18) to match target batch_size (6)

Is an RNN for image classification usable only with grayscale images?
The following program works for grayscale image classification.
If RGB images are used, I get this error:
Expected input batch_size (18) to match target batch_size (6)
at the line loss = criterion(outputs, labels).
My data loading for the train, valid, and test sets is as follows.
input_size = 300
inputH = 300
inputW = 300

# Data transforms (normalization & data augmentation)
stats = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_resize_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
                                tt.ToTensor(),
                                tt.Normalize(*stats)])
train_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
                         tt.RandomHorizontalFlip(),
                         tt.ToTensor(),
                         tt.Normalize(*stats)])
valid_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
                         tt.ToTensor(),
                         tt.Normalize(*stats)])
test_tfms = tt.Compose([tt.Resize((inputH, inputW), interpolation=2),
                        tt.ToTensor(),
                        tt.Normalize(*stats)])

# Create datasets
train_ds = ImageFolder('./data/train', train_tfms)
valid_ds = ImageFolder('./data/valid', valid_tfms)
test_ds = ImageFolder('./data/test', test_tfms)

from torch.utils.data.dataloader import DataLoader
batch_size = 6

# Training data loader
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=8, pin_memory=True)
# Validation data loader
valid_dl = DataLoader(valid_ds, batch_size, shuffle=True, num_workers=8, pin_memory=True)
# Test data loader
test_dl = DataLoader(test_ds, 1, shuffle=False, num_workers=1, pin_memory=True)
My model is as follows.
num_steps = 300
hidden_size = 256  # size of hidden layers
num_classes = 5
num_epochs = 20
learning_rate = 0.001

# Fully connected neural network with one hidden layer
num_layers = 2  # 2 RNN layers are stacked

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # batch must be the first dimension
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        # our input needs to have shape
        # x -> (batch_size, seq, input_size)
        self.fc = nn.Linear(hidden_size, num_classes)  # this fc comes after the RNN, so it needs the RNN's hidden size

    def forward(self, x):
        # according to the documentation of RNN in pytorch,
        # rnn needs input and h_0 (the initial hidden state)
        # the following is the initial hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)  # first dim is number of layers, second is batch size
        # rnn returns two outputs: the first tensor contains the output features of the last hidden layer for all time steps,
        # the second one is the hidden state
        out, _ = self.rnn(x, h0)
        # out has shape (batch_size, num_steps, hidden_size)
        # we only need to decode the hidden state of the last time step
        # out (N, 30, 128)
        # since we need only the last time step
        # out (N, 128)
        out = out[:, -1, :]  # -1 for the last time step; take all for N and 128
        out = self.fc(out)
        return out

stacked_rnn_model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()  # cross entropy has softmax at the output
#optimizer = torch.optim.Adam(stacked_rnn_model.parameters(), lr=learning_rate)  # gradient optimization using Adam
optimizer = torch.optim.SGD(stacked_rnn_model.parameters(), lr=learning_rate)

# Train the model
n_total_steps = len(train_dl)
for epoch in range(num_epochs):
    t_losses = []
    for i, (images, labels) in enumerate(train_dl):
        # origin shape: [6, 3, 300, 300]
        # resized: [6, 300, 300]
        images = images.reshape(-1, num_steps, input_size).to(device)
        print('images shape')
        print(images.shape)
        labels = labels.to(device)

        # Forward pass
        outputs = stacked_rnn_model(images)
        print('outputs shape')
        print(outputs.shape)
        loss = criterion(outputs, labels)
        t_losses.append(loss)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
The printed images and outputs shapes are:

    images shape
    torch.Size([18, 300, 300])
    outputs shape
    torch.Size([18, 5])
Where is the mistake?
Tl;dr: You are flattening the first two axes, namely batch and channels.
I am not sure you are taking the right approach, but I will come back to that later.
In any case, let's look at the issue you are facing. You have a data loader that produces (6, 3, 300, 300), i.e. batches of 6 three-channel 300x300 images. By the look of it you are looking to reshape each batch element (3, 300, 300) into (step_size=300, -1).
However, instead of that, you are affecting the first axis (which you shouldn't) with images.reshape(-1, num_steps, input_size). This would have the desired effect when working with single-channel images, since dim=1 wouldn't be the "channel axis". In your case you have 3 channels, so the resulting shape is (6*3*300*300//300//300, 300, 300), which is (18, 300, 300) since num_steps=300 and input_size=300. As a result you are left with 18 batch elements instead of 6.
Instead, what you want is to reshape with (batch_size, num_steps, -1), leaving the last axis (the per-step feature size) of variable length. This results in a shape of (6, 300, 900).
Here is a corrected and reduced snippet:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

batch_size = 6
channels = 3
inputH, inputW = 300, 300

train_ds = TensorDataset(torch.rand(100, 3, inputH, inputW), torch.rand(100, 5))
train_dl = DataLoader(train_ds, batch_size)

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        # (batch_size, seq, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        # (batch_size, hidden_size)
        self.fc = nn.Linear(hidden_size, num_classes)
        # (batch_size, num_classes)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out

num_steps = 300
input_size = inputH * inputW * channels // num_steps
hidden_size = 256
num_classes = 5
num_layers = 2

rnn = RNN(input_size, hidden_size, num_layers, num_classes)

for x, y in train_dl:
    print(x.shape, y.shape)
    images = x.reshape(batch_size, num_steps, -1)
    print(images.shape)
    outputs = rnn(images)
    print(outputs.shape)
    break
As I said at the beginning, I am a bit wary about this approach, because you are essentially feeding your RNN an RGB 300x300 image in the form of a sequence of 300 flattened vectors... I can't say whether that makes sense in terms of training, or whether the model will be able to learn from it. I could be wrong!

Why are Pytorch and Keras implementations giving vastly different results?

I am trying to train a 1-D ConvNet for time series classification as shown in this paper (refer to FCN in Fig. 1b): https://arxiv.org/pdf/1611.06455.pdf
The Keras implementation is giving me vastly superior performance. Could someone explain why that is the case?
The PyTorch code is as follows:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(x_train.shape[1], 128, 8)
        self.bnorm1 = nn.BatchNorm1d(128)
        self.conv2 = nn.Conv1d(128, 256, 5)
        self.bnorm2 = nn.BatchNorm1d(256)
        self.conv3 = nn.Conv1d(256, 128, 3)
        self.bnorm3 = nn.BatchNorm1d(128)
        self.dense = nn.Linear(128, nb_classes)

    def forward(self, x):
        c1 = self.conv1(x)
        b1 = F.relu(self.bnorm1(c1))
        c2 = self.conv2(b1)
        b2 = F.relu(self.bnorm2(c2))
        c3 = self.conv3(b2)
        b3 = F.relu(self.bnorm3(c3))
        output = torch.mean(b3, 2)
        dense1 = self.dense(output)
        return F.softmax(dense1)

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.5, momentum=0.99)

losses = []
for t in range(1000):
    y_pred_1 = model(x_train.float())
    loss_1 = criterion(y_pred_1, y_train.long())
    print(t, loss_1.item())

    optimizer.zero_grad()
    loss_1.backward()
    optimizer.step()
For comparison, I use the following code for Keras:
x = keras.layers.Input(x_train.shape[1:])
conv1 = keras.layers.Conv1D(128, 8, padding='valid')(x)
conv1 = keras.layers.BatchNormalization()(conv1)
conv1 = keras.layers.Activation('relu')(conv1)
conv2 = keras.layers.Conv1D(256, 5, padding='valid')(conv1)
conv2 = keras.layers.BatchNormalization()(conv2)
conv2 = keras.layers.Activation('relu')(conv2)
conv3 = keras.layers.Conv1D(128, 3, padding='valid')(conv2)
conv3 = keras.layers.BatchNormalization()(conv3)
conv3 = keras.layers.Activation('relu')(conv3)
full = keras.layers.GlobalAveragePooling1D()(conv3)
out = keras.layers.Dense(nb_classes, activation='softmax')(full)
model = keras.models.Model(inputs=x, outputs=out)
optimizer = keras.optimizers.SGD(lr=0.5, decay=0.0, momentum=0.99)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
hist = model.fit(x_train, Y_train, batch_size=x_train.shape[0], nb_epoch=2000)
The only difference I see between the two is the initialization, yet the results are vastly different. For reference, I use the same preprocessing for both datasets, with a subtle difference in input shapes: for PyTorch (Batch_Size, Channels, Length) and for Keras (Batch_Size, Length, Channels).
The reason for the different results is the different default parameters of the layers and the optimizer. For example, in PyTorch the effective decay rate of batch norm is 0.9, whereas in Keras it is 0.99. There may be other variations in default parameters as well.
If you use the same parameters and a fixed random seed for initialization, there won't be much difference in the results between the two libraries.
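As a sketch of what aligning one of those defaults could look like in PyTorch (the quoted defaults reflect my understanding of the two libraries, so verify them against the documentation): PyTorch's BatchNorm1d uses momentum=0.1 (a decay of 0.9) and eps=1e-5, while Keras's BatchNormalization uses momentum=0.99 and epsilon=1e-3, so to mimic Keras you could write:

```
import torch.nn as nn

# In PyTorch: running_stat = (1 - momentum) * running_stat + momentum * batch_stat,
# so momentum=0.01 corresponds to Keras's momentum (decay) of 0.99.
bnorm1 = nn.BatchNorm1d(128, momentum=0.01, eps=1e-3)
```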

Activation gradient penalty

Here's a simple neural network, where I’m trying to penalize the norm of activation gradients:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(64 * 5 * 5, 10)

    def forward(self, input):
        conv1 = self.conv1(input)
        pool1 = self.pool(conv1)
        self.relu1 = self.relu(pool1)
        self.relu1.retain_grad()

        conv2 = self.conv2(self.relu1)
        pool2 = self.pool(conv2)
        relu2 = self.relu(pool2)
        self.relu2 = relu2.view(relu2.size(0), -1)
        self.relu2.retain_grad()

        return self.linear(self.relu2)
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

for i in range(1000):
    output = model(input)
    loss = nn.CrossEntropyLoss()(output, label)
    optimizer.zero_grad()
    loss.backward(retain_graph=True)

    grads = torch.autograd.grad(loss, [model.relu1, model.relu2], create_graph=True)
    grad_norm = 0
    for grad in grads:
        grad_norm += grad.pow(2).sum()
    grad_norm.backward()

    optimizer.step()
However, it does not produce the desired regularization effect. If I do the same thing for weights (instead of activations), it works well. Am I doing this right (in terms of the PyTorch machinery)? Specifically, what happens in the grad_norm.backward() call? I just want to make sure the weight gradients are updated, and not the activation gradients. Currently, when I print out the gradients for weights and activations immediately before and after that line, both change, so I'm not sure what's going on.
I think your code ends up computing some of the gradients twice in each step. I also suspect it actually never zeroes out the activation gradients, so they accumulate across steps.
In general:
- x.backward() computes the gradient of x wrt. the computation graph leaves (e.g. weight tensors and other variables), as well as wrt. nodes explicitly marked with retain_grad(). It accumulates the computed gradients in the tensors' .grad attributes.
- autograd.grad(x, [y, z]) returns the gradient of x wrt. y and z regardless of whether they would normally retain grad or not. By default, it will also accumulate gradients in all leaves' .grad attributes. You can prevent this by passing only_inputs=True.
I prefer to use backward() only for the optimization step, and autograd.grad() whenever my goal is to obtain "reified" gradients as intermediate values for another computation. This way, I can be sure that no unwanted gradients remain lying around in tensors' .grad attributes after I'm done with them.
import torch
from torch import nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(64 * 5 * 5, 10)

    def forward(self, input):
        conv1 = self.conv1(input)
        pool1 = self.pool(conv1)
        self.relu1 = self.relu(pool1)
        conv2 = self.conv2(self.relu1)
        pool2 = self.pool(conv2)
        self.relu2 = self.relu(pool2)
        relu2 = self.relu2.view(self.relu2.size(0), -1)
        return self.linear(relu2)

model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
grad_penalty_weight = 10.

for i in range(1000000):
    # Random input and labels; we're not really learning anything
    input = torch.rand(1, 3, 32, 32)
    label = torch.randint(0, 10, (1,))

    output = model(input)
    loss = nn.CrossEntropyLoss()(output, label)

    # This is where the activation gradients are computed
    # only_inputs is optional here, since we're going to call optimizer.zero_grad() later
    # But it makes clear that we're *only* interested in the activation gradients at this point
    grads = torch.autograd.grad(loss, [model.relu1, model.relu2], create_graph=True, only_inputs=True)
    grad_norm = 0
    for grad in grads:
        grad_norm += grad.pow(2).sum()

    optimizer.zero_grad()
    loss = loss + grad_norm * grad_penalty_weight
    loss.backward()
    optimizer.step()
This code appears to work, in that the activation gradients do get smaller.
I cannot comment on the viability of this technique as a regularization method.
