Implementing a simple ResNet block with PyTorch - python-3.x

I'm trying to implement the following ResNet block. ResNet consists of blocks with two convolutional layers and a skip connection. For some reason my implementation doesn't add the output of the skip connection (if one is applied), or the input, to the output of the convolutional layers.
The ResNet block has:
Two convolutional layers with:
3x3 kernel
no bias terms
padding with one pixel on both sides
2d batch normalization after each convolutional layer
The skip connection:
simply copies the input if the resolution and the number of channels do not change.
if either the resolution or the number of channels change, the skip connection should have one convolutional layer with:
1x1 convolution without bias
change of the resolution with stride (optional)
different number of input channels and output channels (optional)
the 1x1 convolutional layer is followed by 2d batch normalization.
The ReLU nonlinearity is applied after the first convolutional layer and at the end of the block.
My code:
class Block(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers and a skip connection."""

    def __init__(self, in_channels, out_channels, stride=1):
        """
        Args:
          in_channels (int):  Number of input channels.
          out_channels (int): Number of output channels.
          stride (int):       Controls the stride.
        """
        super(Block, self).__init__()
        # The shortcut needs a 1x1 projection only when the resolution or the
        # channel count changes; otherwise the input is passed through as-is.
        if stride != 1 or in_channels != out_channels:
            self.skip = nn.Sequential(
                nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels))
        else:
            self.skip = None

        self.block = nn.Sequential(
            # The first conv carries the stride so the main path ends up with
            # the same spatial size as the (strided) shortcut.
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=3, padding=1, stride=stride, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            # The second conv consumes the first conv's output, so it takes
            # out_channels (not in_channels) as input.
            nn.Conv2d(in_channels=out_channels, out_channels=out_channels,
                      kernel_size=3, padding=1, stride=1, bias=False),
            nn.BatchNorm2d(out_channels))

    def forward(self, x):
        # NOTE: forward is left exactly as posted, because it is the behaviour
        # the question asks about: both branches below overwrite `out`, so the
        # result of self.block(x) never reaches the sum.
        out = self.block(x)
        if self.skip is not None:
            out = self.skip(x)
        else:
            out = x
        out += x
        out = F.relu(out)
        return out

The problem is in the reuse of the out variable. Normally, you'd implement like this:
def forward(self, x):
    # Keep the shortcut in its own variable so the result of self.block(x)
    # held in `out` is never overwritten.
    identity = x
    out = self.block(x)
    if self.skip is not None:
        # A projection exists: use it instead of the raw input.
        identity = self.skip(x)
    out += identity
    out = F.relu(out)
    return out
If you like "one-liners":
def forward(self, x):
    out = self.block(x)
    # Add the raw input when there is no projection, else the projected input.
    out += (x if self.skip is None else self.skip(x))
    out = F.relu(out)
    return out
If you really like one-liners (please, that is too much, do not choose this option :))
def forward(self, x):
    # Same computation in one expression: relu(block(x) + shortcut(x)).
    return F.relu(self.block(x) + (x if self.skip is None else self.skip(x)))

Related

Pretrained ResNet-50 on ImageNet as CAE encoder performs very poor

I am experimenting with different convolutional autoencoder (CAE) architectures and have decided to try a pretrained ResNet-50 network as the encoder in my model. I tried two options: using the encoder with frozen weights, and using the pretrained weights only as initialization. I got better results when the ResNet weights were also trained, but the model still cannot outperform my basic CAE with 3 conv layers in the encoder and 3 upsample + conv2d layers in the decoder. Can you help me explain this effect? I am training my network on different domains of the OfficeHome dataset. Here is my architecture:
class ConvBlock(nn.Module):
    """1x1 convolution followed by LeakyReLU and 2d batch normalization."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.relu = nn.LeakyReLU(inplace=True)
        self.norm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        # The activation is applied before the normalization, in that order.
        return self.norm(self.relu(self.conv(x)))
class ResnetConvAutoencoder(nn.Module):
    """Autoencoder with a pretrained ResNet-50 encoder and an upsampling decoder.

    The encoder lives on CUDA device 2 and the decoder on CUDA device 1.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(pretrained = True, progress = True)
        # Uncomment to freeze the pretrained weights:
        # for param in backbone.parameters():
        #     param.requires_grad = False

        # Drop the last two children of the ResNet and keep the rest as encoder.
        encoder_layers = list(backbone.children())[:-2]
        self.encoder = torch.nn.Sequential(*encoder_layers)  # output is 2048 x 8 x 8
        del backbone

        # Five (upsample x2 -> ConvBlock) stages, then a sigmoid.
        channel_plan = [(2048, 512), (512, 256), (256, 128), (128, 16), (16, 3)]
        decoder_layers = []
        for c_in, c_out in channel_plan:
            decoder_layers.append(nn.Upsample(scale_factor=2, mode='bilinear'))
            decoder_layers.append(ConvBlock(c_in, c_out))
        decoder_layers.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_layers)  # output is 3 x 256 x 256 (as input)

        # Training on several GPUs: one device per half of the model.
        self.encoder.cuda(2)
        self.decoder.cuda(1)

    def forward(self, x):
        # Move the data to whichever device holds the next stage.
        encoded = self.encoder(x.cuda(2)).cuda(1)
        return self.decoder(encoded).cuda(1)
And some images examples of my simple CAE and CAE with ResNet encoder
The loss on train and test for the CAE with ResNet-50 is about 0.03 and does not go lower. The loss of my simple CAE is lower than 0.015 (MSE).

pytorch CNN get label for a single image

I'm getting stuck on a function that is supposed to predict the label of a single image. I need to do this on a single image because I want to build a web app, where the user can upload an image and can get its prediction.
My CNN is the following with the base for the model :
class ImageClassificationBase(nn.Module):
    """Training/validation helpers shared by image classification models."""

    def training_step(self, batch):
        """Return the cross-entropy loss of the model on one (images, labels) batch."""
        images, labels = batch
        return F.cross_entropy(self(images), labels)

    def validation_step(self, batch):
        """Return the detached loss and the accuracy for one validation batch."""
        images, labels = batch
        logits = self(images)
        return {
            'val_loss': F.cross_entropy(logits, labels).detach(),
            'val_acc': accuracy(logits, labels),
        }

    def validation_epoch_end(self, outputs):
        """Average the per-batch losses and accuracies into plain Python floats."""
        def _mean_of(key):
            return torch.stack([entry[key] for entry in outputs]).mean().item()

        return {'val_loss': _mean_of('val_loss'), 'val_acc': _mean_of('val_acc')}

    def epoch_end(self, epoch, result):
        """Print a one-line summary of the epoch's metrics."""
        summary = "Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch, result['train_loss'], result['val_loss'], result['val_acc'])
        print(summary)
and the model itself:
class BrainTumorClassification(ImageClassificationBase):
    """CNN classifier: three (conv, conv, max-pool) stages and a 3-layer MLP head."""

    def __init__(self):
        super().__init__()

        def conv3x3(c_in, c_out):
            # 3x3 convolution that keeps the spatial size, followed by ReLU.
            return [nn.Conv2d(c_in, c_out, kernel_size = 3, stride = 1, padding = 1), nn.ReLU()]

        layers = []
        for c_in, c_mid, c_out in ((3, 32, 64), (64, 128, 128), (128, 256, 256)):
            layers += conv3x3(c_in, c_mid) + conv3x3(c_mid, c_out) + [nn.MaxPool2d(2,2)]

        layers += [
            nn.Flatten(),
            # 82944 = 256 * 18 * 18, i.e. what three 2x2 poolings leave of a
            # 150x150 input -- assumes inputs are resized to 150x150.
            nn.Linear(82944,1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            # The head emits 6 logits, as in the posted model.
            nn.Linear(512,6),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, xb):
        return self.network(xb)
The function I'm trying to implement for testing a single image is the following:
from torch.autograd import Variable
# Preprocessing: resize to 150x150 and convert the image to a tensor.
transformer = transforms.Compose([
    transforms.Resize((150,150)), transforms.ToTensor()])
def classify(image_path,image_transforms, classes):
    """Return the entry of `classes` predicted by the global `model` for one image file."""
    image = Image.open(image_path)
    image_tensor = image_transforms(image).float()
    # Add a leading batch dimension (in-place).
    image_tensor = image_tensor.unsqueeze_(0)
    input = Variable(image_tensor)
    output = model(input)
    # argmax over the flattened output: the index ranges over all logits the
    # model emits, which may be more than len(classes).
    index = output.data.numpy().argmax()
    pred = classes[index]
    return pred
I'm getting an error:
`pred=classes[index]` index out of range
I should mention that classes has 4 elements : ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor'].
A few points to note:
Don't forget to load your trained network on your initialized model.
Variable has been deprecated, you should not use it. Gradients are tracked on tensors that have the requires_grad flag on. Here you are only inferring so you can actually use the torch.no_grad context to avoid retaining parameter activations. This will increase inference speed.
With torch.Tensor.unsqueeze_ you don't have to reassign the result, because the input tensor itself is modified by the function. As a general note, all torch.Tensor functions with a _ suffix are in-place operators.
Most of all, you mentioned only having 4 classes, yet your last fully connected layer outputs 6 logits. In this case, you need to change this to 4.
Here is a possible modification:
transformer = transforms.Compose([transforms.Resize((150, 150)),
                                  transforms.ToTensor()])


@torch.no_grad()  # inference only: do not build the autograd graph
def classify(image_path, image_transforms, classes):
    """Predict the class label of a single image file.

    Args:
        image_path: Path of the image to classify.
        image_transforms: Callable turning a PIL image into a (C, H, W) tensor.
        classes: Sequence of class names indexed by the model's output logits.

    Returns:
        The entry of `classes` with the highest logit.

    Uses the module-level `model`, which must already hold the trained weights.
    """
    # Force 3 channels so grayscale/RGBA files match the model's 3-channel input.
    image = Image.open(image_path).convert('RGB')
    image_tensor = image_transforms(image)
    image_tensor.unsqueeze_(0)  # in-place: adds the batch dimension
    output = model(image_tensor)
    # argmax over the class dimension; .item() also works for tensors on GPU.
    index = output.argmax(dim=1).item()
    return classes[index]

RuntimeError: Expected hidden[0] size (1, 1, 512), got (1, 128, 512) for LSTM pytorch

I trained the LSTM with a batch size of 128, and during testing my batch size is 1. Why do I get this error? Am I supposed to re-initialize the hidden state when testing?
Here is the code that i'm using, I initialize the hidden state init_hidden function as (number_of_layers, batch_size, hidden_size) since batch_first=True
class ImageLSTM(nn.Module):
    """LSTM over a sequence, with a Linear + Sigmoid head on the last time step."""

    def __init__(self, n_inputs:int=49,
                 n_outputs:int=4096,
                 n_hidden:int=256,
                 n_layers:int=1,
                 bidirectional:bool=False):
        """
        Args:
            n_inputs: Feature size of each time step (LSTM input_size).
            n_outputs: Size of the output vector.
            n_hidden: LSTM hidden size.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
        """
        super().__init__()
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        # Dropout between LSTM layers only applies when there is more than one layer.
        self.lstm = nn.LSTM(input_size=self.n_inputs,
                            hidden_size=self.n_hidden,
                            num_layers=self.n_layers,
                            dropout=0.5 if self.n_layers > 1 else 0,
                            bidirectional=self.bidirectional,
                            batch_first=True)

        # A bidirectional LSTM emits 2 * n_hidden features per step, and only
        # that variant has a Dropout in its head.
        head_in = self.n_hidden * 2 if self.bidirectional else self.n_hidden
        head = [nn.Linear(head_in, self.n_outputs)]
        if self.bidirectional:
            head.append(nn.Dropout(p=0.5))
        head.append(nn.Sigmoid())
        self.FC = nn.Sequential(*head)

    def init_hidden(self, batch_size, device=None):
        """Store zero hidden and cell states for `batch_size` in self.hidden.

        Each state has shape (num_directions * n_layers, batch_size, n_hidden).
        """
        num_directions = 2 if self.bidirectional else 1
        state_shape = (num_directions * self.n_layers, batch_size, self.n_hidden)
        h0 = torch.zeros(state_shape)
        c0 = torch.zeros(state_shape)
        if device is not None:
            h0, c0 = h0.to(device), c0.to(device)
        self.hidden = (h0, c0)

    def forward(self, X):
        """Run the LSTM from self.hidden and decode the last time step.

        X is batch-first, (batch_size, seq_length, n_inputs); init_hidden must
        have been called with the same batch size beforehand.
        """
        # self.hidden is replaced by the final state returned by the LSTM.
        lstm_out, self.hidden = self.lstm(X, self.hidden)
        last_step = lstm_out[:, -1, :]
        return self.FC(last_step)
Please edit your post and add code. How did you initialize the hidden state? What does your model look like?
hidden[0] is not your hidden-size, its the hidden-state of the lstm. The shape of the hidden-state has to be initialized like this:
# Both the hidden state and the cell state have shape
# (layers, batch_size, hidden_size); batch_first=True does not change that.
hidden = (torch.zeros((layers, batch_size, hidden_size)),
          torch.zeros((layers, batch_size, hidden_size)))
You seem to have done this correctly. But the error tells you that you gave a batch of size 1 (because as you said you want to test with only one sample) but the hidden-state is initialized with batch-size=128.
So I guess (please add code) that you hard-coded the batch size to 128. Don't do that. Since you have to reinitialize the hidden state on every forward pass, you can do this:
...
def forward(self, x):
    """Run the LSTM with a zero state sized for the current batch.

    Assumes x is batch-first, so x.shape[0] is the batch size, and that the
    module stores its LSTM as self.lstm.
    """
    batch_size = x.shape[0]
    state_shape = (self.layers, batch_size, self.hidden_size)
    # Create the states on the input's device/dtype instead of a global `device`.
    hidden = (torch.zeros(state_shape, device=x.device, dtype=x.dtype),
              torch.zeros(state_shape, device=x.device, dtype=x.dtype))
    output, hidden = self.lstm(x, hidden)
    # then do whatever you want with the output
    return output
I guess that this is what causes this error but please post your code, too!

Tensor size mismatch autoencoder pytorch

I'm using stacked Autoencoder, which is a bunch of Conv layers.
However, I'm having a tensor mismatch error, and I'm not sure about the reason. Everything done in the Encoder is reversed in the Decoder!
This is for time-series data. The input shape is (batch_size, 1, 3000).
Here's the code
class CDAutoEncoder(nn.Module):
    """One autoencoder stage: Conv1d + PReLU to encode, ConvTranspose1d + PReLU to decode."""

    def __init__(self, input_size, output_size, kernel, stride):
        super().__init__()
        encode_conv = nn.Conv1d(input_size, output_size,
                                kernel_size=kernel, stride=stride, padding=0)
        self.forward_pass = nn.Sequential(encode_conv, nn.PReLU())

        # Mirror of the encoder: channel counts swapped, same kernel and stride.
        decode_conv = nn.ConvTranspose1d(output_size, input_size,
                                         kernel_size=kernel, stride=stride, padding=0)
        self.backward_pass = nn.Sequential(decode_conv, nn.PReLU())

    def forward(self, x):
        """Encode x."""
        return self.forward_pass(x)

    def reconstruct(self, x):
        """Decode an encoded tensor back towards the input space."""
        return self.backward_pass(x)
class StackedAutoEncoder(nn.Module):
    """Three CDAutoEncoder stages applied in order, then undone in reverse order."""

    def __init__(self):
        super().__init__()
        self.ae1 = CDAutoEncoder(1, 32, 50, 10)
        self.ae2 = CDAutoEncoder(32, 64, 10, 3)
        self.ae3 = CDAutoEncoder(64, 64, 5, 1)

    def forward(self, x):
        """Encode x through all stages and return its reconstruction."""
        code = x
        for stage in (self.ae1, self.ae2, self.ae3):
            code = stage(code)
        return self.reconstruct(code)

    def reconstruct(self, x):
        """Decode through the stages in reverse order (ae3, ae2, ae1).

        With padding=0 the strided convolutions floor their output length, so
        for a length-3000 input the lengths go 3000 -> 296 -> 96 -> 92 and
        back 92 -> 96 -> 295 -> 2990: the result is shorter than the input.
        """
        for stage in (self.ae3, self.ae2, self.ae1):
            x = stage.reconstruct(x)
        return x
The error:
RuntimeError: The size of tensor a (2990) must match the size of tensor b (3000) at non-singleton dimension 2
I've tried adding padding and it worked, but when I changed the kernel size I get different tensor-size-mismatch-error.
Apparently, there's nothing like 'same' padding, so is there automated solution for this?

Activation gradient penalty

Here's a simple neural network, where I’m trying to penalize the norm of activation gradients:
class Net(nn.Module):
    """Small CNN that exposes two activations (relu1, relu2) for gradient penalties."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(64 * 5 * 5, 10)

    def forward(self, input):
        """Return class logits; stores the activations as self.relu1 / self.relu2.

        Both stored activations are marked with retain_grad() so that a later
        backward() keeps their .grad.
        """
        conv1 = self.conv1(input)
        pool1 = self.pool(conv1)
        self.relu1 = self.relu(pool1)
        self.relu1.retain_grad()
        # Use the stored activation: there is no local named `relu1`.
        conv2 = self.conv2(self.relu1)
        pool2 = self.pool(conv2)
        relu2 = self.relu(pool2)
        # Flatten to (batch, features) for the linear layer.
        self.relu2 = relu2.view(relu2.size(0), -1)
        self.relu2.retain_grad()
        # Feed the flattened tensor, so self.relu2 is part of the graph that
        # produces the output (and the Linear layer gets 2-D input).
        return self.linear(self.relu2)
# Training loop as posted in the question. `input` and `label` are not defined
# in this snippet -- presumably a fixed batch prepared elsewhere.
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
for i in range(1000):
    output = model(input)
    loss = nn.CrossEntropyLoss()(output, label)
    optimizer.zero_grad()
    # First backward pass: accumulates d(loss) into the .grad of the weights
    # and of the activations marked with retain_grad().
    loss.backward(retain_graph=True)
    # Activation gradients again, this time with a graph so they can be differentiated.
    grads = torch.autograd.grad(loss, [model.relu1, model.relu2], create_graph=True)
    grad_norm = 0
    for grad in grads:
        grad_norm += grad.pow(2).sum()
    # Second backward pass: there is no zero_grad() in between, so its
    # gradients are added on top of the ones accumulated above.
    grad_norm.backward()
    optimizer.step()
However, it does not produce the desired regularization effect. If I do the same thing for weights (instead of activations), it works well. Am I doing this right (in terms of pytorch machinery)? Specifically, what happens in grad_norm.backward() call? I just want to make sure the weight gradients are updated, and not activation gradients. Currently, when I print out gradients for weights and activations immediately before and after that line, both change - so I’m not sure what’s going on.
I think your code ends up computing some of the gradients twice in each step. I also suspect it actually never zeroes out the activation gradients, so they accumulate across steps.
In general:
x.backward() computes gradient of x wrt. computation graph leaves (e.g. weight tensors and other variables), as well as wrt. nodes explicitly marked with retain_grad(). It accumulates the computed gradient in tensors' .grad attributes.
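autograd.grad(x, [y, z]) returns gradient of x wrt. y and z regardless of whether they would normally retain grad or not. In current PyTorch releases it does not accumulate anything into tensors' .grad attributes: only_inputs=True is the default (only_inputs=False is deprecated), so passing it explicitly only documents the intent. Use backward() when you do want gradients accumulated into .grad.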
I prefer to use backward() only for the optimization step, and autograd.grad() whenever my goal is to obtain "reified" gradients as intermediate values for another computation. This way, I can be sure that no unwanted gradients remain lying around in tensors' .grad attributes after I'm done with them.
import torch
from torch import nn
class Net(nn.Module):
    """Small CNN that keeps its two ReLU activations as attributes."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(64 * 5 * 5, 10)

    def forward(self, input):
        """Return class logits; stores the activations as self.relu1 / self.relu2."""
        # Each stage is conv -> max-pool -> ReLU; the stage outputs are stored
        # so callers can differentiate the loss with respect to them.
        self.relu1 = self.relu(self.pool(self.conv1(input)))
        self.relu2 = self.relu(self.pool(self.conv2(self.relu1)))
        # Only the local copy is flattened; self.relu2 keeps its 4-D shape.
        flat = self.relu2.view(self.relu2.size(0), -1)
        return self.linear(flat)
model = Net()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
grad_penalty_weight = 10.
criterion = nn.CrossEntropyLoss()

for step in range(1000000):
    # Random input and labels; we're not really learning anything
    input = torch.rand(1, 3, 32, 32)
    label = torch.randint(0, 10, (1,))
    loss = criterion(model(input), label)

    # This is where the activation gradients are computed.
    # only_inputs is optional here, since optimizer.zero_grad() is called below,
    # but it makes clear that *only* the activation gradients are wanted at this point.
    grads = torch.autograd.grad(
        loss, [model.relu1, model.relu2], create_graph=True, only_inputs=True)
    # Squared L2 norm of all activation gradients.
    grad_norm = sum(g.pow(2).sum() for g in grads)

    optimizer.zero_grad()
    # A single backward pass over task loss + weighted gradient penalty.
    loss = loss + grad_norm * grad_penalty_weight
    loss.backward()
    optimizer.step()
This code appears to work, in that the activation gradients do get smaller.
I cannot comment on the viability of this technique as a regularization method.

Resources