Implementing 1D self attention in PyTorch - pytorch

I'm trying to implement the 1D self-attention block below using PyTorch:
proposed in the following paper. Below you can find my (provisional) attempt:
import torch.nn as nn
import torch
#INPUT shape ((B), CH, H, W)
class Self_Attention1D(nn.Module):
    """Point-wise gated block: ``conv1 -> gamma(phi - psi) * x -> conv2``.

    The input is expected to be 4-D, ``(B, CH, H, W)``, as stated in the
    comment above the class and as required by the ``transpose(1, 3)``
    calls in ``forward``.

    Args:
        in_channels: Number of channels of the input (and of the output).
        out_channels: Number of channels used inside the block.
    """

    def __init__(self, in_channels=1, out_channels=3):
        super().__init__()
        # 1x1 convolutions.  ``nn.Conv2d`` is used because the input is 4-D;
        # ``nn.Conv1d`` only accepts 2-D/3-D input and a single kernel size.
        self.pointwise_conv1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)
        self.pointwise_conv2 = nn.Conv2d(in_channels=out_channels, out_channels=in_channels, kernel_size=1)
        self.phi = MLP(in_size=out_channels, out_size=32)
        self.psi = MLP(in_size=out_channels, out_size=32)
        self.gamma = MLP(in_size=32, out_size=out_channels)

    def forward(self, x):
        """Return a tensor with the same shape as ``x``."""
        x = self.pointwise_conv1(x)
        # Move channels to the last axis so the MLPs act on the channel dimension.
        channels_last = x.transpose(1, 3)
        delta = self.phi(channels_last) - self.psi(channels_last)
        # Back to channels-first so the gate lines up with ``x`` element-wise.
        gamma = self.gamma(delta).transpose(3, 1)
        return self.pointwise_conv2(torch.mul(gamma, x))
class MLP(nn.Module):
    """Four-layer perceptron ``in_size -> 64 -> 128 -> 64 -> out_size`` with ReLU.

    Args:
        in_size: Size of the last dimension of the input.
        out_size: Size of the last dimension of the output.
    """

    def __init__(self, in_size, out_size):
        super().__init__()
        self.in_size = in_size
        self.out_size = out_size
        self.layers = nn.Sequential(
            nn.Linear(in_size, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, out_size),
        )

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.layers(x)
I'm not sure at all that this is correct, as the operations in my implementation are happening globally while as displayed in the image we should compute some operation between each entry and its neighbours one at a time. I was initially tempted to instantiate a for loop to iteratively compute the neural networks delta,phi,psi for each entry, but I felt that it wasn't the right way to do that.
Apologies if this is trivial but I still don't have a huge experience in PyTorch.

Related

How does `optimizer.step()` perform an in-place operation?

Here is a simple example that results in an in-place operation error.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
# Turn on autograd anomaly detection before anything else runs.
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
    """Toy loss used to keep the example minimal."""

    def __init__(self):
        super().__init__()

    def forward(self, x, target):
        """Return the scalar ``x[0, 0, 0, 0]``; ``target`` is accepted but unused."""
        return x[0, 0, 0, 0]
def block(in_channels, features, name):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1.

    Args:
        in_channels: Number of input channels.
        features: Number of output channels.
        name: Label supplied by callers; not used by the function.

    Returns:
        The configured ``nn.Conv2d`` layer.
    """
    return nn.Conv2d(
        in_channels=in_channels,
        out_channels=features,
        kernel_size=3,
        padding=1,
        bias=False,
    )
class SharedNetwork(nn.Module):
    """Feature extractor whose output feeds both task heads (3 -> 1 channels)."""

    def __init__(self):
        super().__init__()
        self.shared_layer = block(in_channels=3, features=1, name="wow")

    def forward(self, x):
        """Return the output of the shared convolution applied to ``x``."""
        return self.shared_layer(x)
class Network1(nn.Module):
    """First head: a single convolution (1 -> 1 channels) with no activation."""

    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-1")

    def forward(self, x):
        """Return the raw convolution output for ``x``."""
        return self.conv(x)
class Network2(nn.Module):
    """Second head: a single convolution (1 -> 1 channels) followed by a sigmoid."""

    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-2")

    def forward(self, x):
        """Return ``sigmoid(conv(x))``."""
        return torch.sigmoid(self.conv(x))
# Reproduction script: the statement order below is what triggers the error
# quoted after this snippet, so it is left exactly as posted.
shared_net = SharedNetwork()
net_1 = Network1()
segmentor = Network2()
# Both optimizers own shared_net's parameters; `optimizer` also owns the segmentor's.
optimizer = optim.Adam(list(shared_net.parameters()) + list(segmentor.parameters()), lr=1e-6)
optimizer_conf = optim.Adam(list(shared_net.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
# Random integer images divided by 255; the targets are passed to loss_fn, which ignores them.
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
optimizer.zero_grad()
optimizer_conf.zero_grad()
# `features` is computed once and reused by both heads.
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
# The graph is retained because `features` is used again for the second loss.
s_loss.backward(retain_graph=True)
# step() runs between the two backward calls ...
optimizer.step()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
# ... and this second backward is where the in-place modification error is reported.
loss.backward(retain_graph=False)
optimizer_conf.step()
Error message:
UserWarning: Error detected in ConvolutionBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3, 3, 3]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I was able to solve the problem by changing the order of running the step function of optimizers.
# Reordered variant: both backward calls finish before either optimizer steps.
optimizer_conf.zero_grad()
optimizer.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
# The graph is retained because `features` is used again for the second loss.
s_loss.backward(retain_graph=True)
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
# Both step() calls are issued only after the two backward passes above.
optimizer_conf.step()
optimizer.step()
The following questions, however, remain:
How does the step method cause an inplace operation in convolution?
Why does moving the steps to the end of the file resolve this error?
NOTE: The loss function is used for simplicity, using dice-loss also results in the same error!
Before answering the question, I have to mention that having multiple optimizers for one set of parameters seems to be an anti-pattern and is better avoided.
How does the step method cause an inplace operation in convolution?
A: The step method updates the weights in place using their gradients, so it does something like the following:
# Schematic pseudo-code only: it illustrates that the update mutates the weight tensor in place.
param.weight += param.grad
which can be interpreted as an in place operation
Why does moving the steps to the end of the file resolve this error?
A: By moving the step calls after the second backward call, the in-place weight update no longer runs between the forward pass and a backward pass that still depends on the old weights. As a result, every tensor saved for gradient computation is unmodified when backward runs, and no error is raised.
To sum up, it's best to have only one optimizer for one set of parameters; the previous example could be coded in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
# Keep autograd anomaly detection enabled while running the example.
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
    """Placeholder loss: picks one element of the prediction as the loss value."""

    def __init__(self):
        super().__init__()

    def forward(self, x, target):
        """Return ``x[0, 0, 0, 0]`` as a 0-dim tensor; ``target`` is not used."""
        return x[0, 0, 0, 0]
def block(in_channels, features, name):
    """Create a 3x3 convolution without bias that preserves spatial size.

    Args:
        in_channels: Channel count of the input.
        features: Channel count of the output.
        name: Accepted for the callers' benefit; the function does not read it.

    Returns:
        An ``nn.Conv2d`` with ``kernel_size=(3, 3)``, ``padding=1`` and no bias.
    """
    return nn.Conv2d(
        in_channels=in_channels,
        out_channels=features,
        kernel_size=(3, 3),
        padding=1,
        bias=False,
    )
class SharedNetwork(nn.Module):
    """Common trunk: one convolution mapping 3 input channels to 1 feature channel."""

    def __init__(self):
        super().__init__()
        self.shared_layer = block(in_channels=3, features=1, name="wow")

    def forward(self, x):
        """Run ``x`` through the shared convolution and return the features."""
        return self.shared_layer(x)
class Network1(nn.Module):
    """Head 1: one convolution on the single-channel features, no activation."""

    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-1")

    def forward(self, x):
        """Return ``conv(x)`` unchanged."""
        return self.conv(x)
class Network2(nn.Module):
    """Head 2: one convolution on the single-channel features, then a sigmoid."""

    def __init__(self):
        super().__init__()
        self.conv = block(in_channels=1, features=1, name="wow-2")

    def forward(self, x):
        """Return the sigmoid of the convolution output."""
        return torch.sigmoid(self.conv(x))
torch.manual_seed(0)
shared_net = SharedNetwork()
net_1 = Network1()
net_2 = Network2()
# One optimizer per parameter set, so no parameter is owned by two optimizers.
shared_optimizer = optim.Adam(list(shared_net.parameters()), lr=1e-6)
net_1_optimizer = optim.Adam(list(net_1.parameters()), lr=1e-6)
# Must reference net_2: no `segmentor` is defined in this version of the script.
net_2_optimizer = optim.Adam(list(net_2.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0, 255, (1, 3, 256, 256)) / 255
target_data_1 = torch.randint(0, 255, (1, 3, 256, 256)) / 255
target_data_2 = torch.randint(0, 255, (1, 3, 256, 256)) / 255

# --- first loss: updates net_2 only ---
net_2_optimizer.zero_grad()
features = shared_net(fake_data)
net_2_out = net_2(features)
s_loss = loss_fn(net_2_out, target_data_2)
# The graph is retained because `features` is reused for the second loss.
s_loss.backward(retain_graph=True)
# Only net_2's weights change here; the second loss does not pass through net_2.
net_2_optimizer.step()

# --- second loss: updates net_1 and shared_net ---
net_1_optimizer.zero_grad()
# This also clears the gradients s_loss left on shared_net, so shared_net
# is updated from `loss` alone.
shared_optimizer.zero_grad()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
net_1_optimizer.step()
shared_optimizer.step()
Note: If you want to have two different learning rates for different losses applied to one set of parameters, you can multiply each loss by a weight that reflects its importance. For example, you can multiply loss_1 by 0.1 and loss_2 by 0.5. Or, you can use backward hooks as mentioned in this comment:
backward-hook

How to rewrite the torch.nn.LayerNorm function?

I need to rewrite the layer normalization with torch without parameters to adjust different data size.
I have checked the API document of nn.LayerNorm
and made some implementations with torch and numpy. In my test results, the torch version differs slightly from nn.LayerNorm, while the numpy version is exactly equal. Compared with nn.LayerNorm with elementwise_affine=True, the torch implementation doesn't perform as well, and the numpy implementation performs very poorly. Is there any problem? Can't I use them directly like nn.LayerNorm?
With torch
class Layer_norm(nn.Module):
    """Parameter-free layer normalisation over all non-batch dimensions.

    Each sample is normalised with its own mean and biased variance (the
    estimator ``nn.LayerNorm`` uses), so the output matches
    ``F.layer_norm(x, x.shape[1:], eps=eps)`` without affine parameters.

    Args:
        eps: Constant added to the variance for numerical stability.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` (batch first, at least 2-D); the shape is preserved."""
        # Reduce over everything except the batch axis, whatever the rank is.
        dims = tuple(range(1, x.dim()))
        mean = torch.mean(x, dim=dims, keepdim=True)
        # unbiased=False: divide by N, not N - 1, as nn.LayerNorm does.
        var = torch.var(x, dim=dims, unbiased=False, keepdim=True)
        std = torch.sqrt(var + self.eps)
        # keepdim=True already gives broadcastable statistics; indexing them
        # with extra ``None`` axes would inflate the result to 7 dimensions.
        return (x - mean) / std
With numpy
class Layer_norm(nn.Module):
    """Parameter-free layer normalisation computed with NumPy.

    The computation leaves the autograd graph: the input is detached and
    the returned tensor carries no gradient history, so layers placed
    before this module receive no gradient through it.

    Args:
        eps: Constant added to the variance for numerical stability.
    """

    def __init__(self, eps=1e-5):
        super().__init__()
        self.eps = eps

    def forward(self, x):
        """Return ``x`` normalised per sample as float32 on ``x``'s device."""
        # detach() so tensors that require grad can be converted to NumPy.
        arr = x.detach().cpu().numpy()
        axes = tuple(range(1, arr.ndim))
        mean = np.mean(arr, axis=axes, keepdims=True)
        var = np.var(arr, axis=axes, keepdims=True)
        # Use the configured eps rather than a second hard-coded constant.
        out = (arr - mean) / np.sqrt(var + self.eps)
        # float() must be called; the result goes back to the input's device
        # instead of assuming CUDA is available.
        return torch.from_numpy(out).float().to(x.device)

How can I train the last few layers of RegNet-800MF backbone using PyTorch Lightning

I am trying to get better results by allowing a few final layers of a previously frozen backbone (RegNet-800MF) to be trained. How can I implement this in PyTorch Lightning? I am very new to ML so please excuse me if I have left any important information out.
My model (MechClassifier) calls another class (ParametersClassifier) which includes the pre-trained RegNet as its frozen backbone. During training the forward function passes inputs only through the backbone of the ParametersClassifier and not the Classifying layers. I will include the init functions of both below.
My MechClassifier model:
class MechClassifier(pl.LightningModule):
    """Two-headed classifier on top of a frozen ``ParametersClassifier`` backbone.

    Args:
        num_classes: Number of output classes of each head.
        lr: Stored as ``self.lr``.
        weight_decay: Stored as ``self.weight_decay``.
        gpus: Stored as ``self.gpus``.
        max_epochs: Stored as ``self.max_epochs``.
    """

    def __init__(
        self,
        num_classes,
        lr=4e-3,
        weight_decay=1e-8,
        gpus=1,
        max_epochs=30,
    ):
        super().__init__()
        # Explicit attributes instead of ``self.__dict__.update(locals())``,
        # which also stored ``self`` inside its own ``__dict__``.
        self.num_classes = num_classes
        self.lr = lr
        self.weight_decay = weight_decay
        self.gpus = gpus
        self.max_epochs = max_epochs

        self.backbone = ParametersClassifier.load_from_checkpoint(
            checkpoint_path="checkpoints/param_classifier/last.ckpt",
            num_classes=3,
            gpus=1,
        )
        self.backbone.freeze()
        self.backbone.eval()

        self.mf_classifier = nn.Sequential(
            nn.Linear(self.backbone.num_ftrs, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes),
        )
        self.wd_classifier = nn.Sequential(
            nn.Linear(self.backbone.num_ftrs, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes),
        )

    def forward(self, x):
        """Return ``(out1, out2)``, the outputs of the two classifier heads."""
        self.backbone.eval()
        # Only ``backbone.model`` is used, and no gradients are recorded for it.
        with torch.no_grad():
            x = self.backbone.model(x)
        out1 = self.mf_classifier(x)
        out2 = self.wd_classifier(x)
        return (out1, out2)
ParametersClassifier (loaded from checkpoint):
class ParametersClassifier(pl.LightningModule):
    """RegNet-Y-800MF feature extractor followed by four parallel linear heads.

    Args:
        num_classes: Number of output classes of each head.
        lr: Stored as ``self.lr``.
        weight_decay: Stored as ``self.weight_decay``.
        gpus: Stored as ``self.gpus``.
        max_epochs: Stored as ``self.max_epochs``.
    """

    def __init__(
        self,
        num_classes,
        lr=4e-3,
        weight_decay=0.05,
        gpus=1,
        max_epochs=30,
    ):
        super().__init__()
        # Explicit attributes instead of ``self.__dict__.update(locals())``,
        # which also stored ``self`` inside its own ``__dict__``.
        self.num_classes = num_classes
        self.lr = lr
        self.weight_decay = weight_decay
        self.gpus = gpus
        self.max_epochs = max_epochs

        self.model = models.regnet_y_800mf(pretrained=True)
        # Remember the feature width, then replace the original classification
        # layer with an identity so ``self.model`` returns the features.
        self.num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Identity()

        self.fc1 = nn.Linear(self.num_ftrs, num_classes)
        self.fc2 = nn.Linear(self.num_ftrs, num_classes)
        self.fc3 = nn.Linear(self.num_ftrs, num_classes)
        self.fc4 = nn.Linear(self.num_ftrs, num_classes)

    def forward(self, x):
        """Return the four head outputs ``(out1, out2, out3, out4)``."""
        x = self.model(x)
        out1 = self.fc1(x)
        out2 = self.fc2(x)
        out3 = self.fc3(x)
        out4 = self.fc4(x)
        return (out1, out2, out3, out4)
You can look at the implementation for the Regnet model you are using here. Its forward function:
def forward(self, x: Tensor) -> Tensor:
    """Run ``x`` through stem, trunk, average pool and ``fc``, in that order.

    The pooled activations are flattened from dimension 1 onward before
    being passed to ``self.fc``.
    """
    x = self.stem(x)
    x = self.trunk_output(x)
    x = self.avgpool(x)
    x = x.flatten(start_dim=1)
    x = self.fc(x)
    return x
Instead of using a torch.no_grad context manager as you did, you should rather switch on/off the requires_grad as necessary. By default module parameters have their requires_grad flag set to True which means they are able to perform gradient computation. If this flag is set to False, you can consider those components as frozen.
Depending on which layers you want to freeze and which you want to finetune, you can do that manually. For example, if you want to freeze most of the backbone and finetune only the last part of the RegNet, replace the following lines in MechClassifier's __init__:
# Lines from MechClassifier.__init__ that are to be replaced.
self.backbone.freeze()
self.backbone.eval()
With the following lines:
## freeze all
# Turn off requires_grad for every parameter of the wrapped RegNet.
self.backbone.model.requires_grad_(False)
## unfreeze last section of 4th block of backbone
# The sub-module is named 'block4-1'; the hyphen rules out plain attribute access, hence getattr.
block4_section1 = getattr(self.backbone.model.trunk_output.block4, 'block4-1')
block4_section1.requires_grad_(True)
And perform inference on MechClassifier with a forward function like so:
def forward(self, x):
    """Return ``(out1, out2)`` from the two classifier heads.

    The backbone is called without ``torch.no_grad()``, so gradients can
    reach whichever backbone parameters have ``requires_grad`` enabled.
    """
    # The backbone is switched to eval mode on every call.
    self.backbone.eval()
    x = self.backbone.model(x)
    out1 = self.mf_classifier(x)
    out2 = self.wd_classifier(x)
    return (out1, out2)

How to drop running stats to default value for Norm layer in pyTorch?

I trained model on some images. Now to fit similar dataset but with another colors I want to load this model but also i want to drop all running stats from Batchnorm layers (set them to default value, like totally untrained). What parameters should i reset? Simple model looks like this
import torch
import torch.nn as nn
class Net(nn.Module):
    """Minimal example model: conv -> batch norm -> conv, 3 channels throughout."""

    def __init__(self):
        super().__init__()
        self.conv0 = nn.Conv2d(3, 3, 3, padding=1)
        self.norm = nn.BatchNorm2d(3)
        self.conv = nn.Conv2d(3, 3, 3, padding=1)

    def forward(self, x):
        """Apply the three layers in order and return the result."""
        x = self.conv0(x)
        x = self.norm(x)
        return self.conv(x)
# Module-level model instance that drop_to_default() iterates over.
net = Net()
##or for pretrained it will be
##net = torch.load('net.pth')
# Question stub: visits every sub-module of the global `net` and selects the
# BatchNorm2d layers; what to reset on them is the open question (placeholder).
def drop_to_default():
for m in net.modules():
if type(m) == nn.BatchNorm2d:
####???####
drop_to_default()
Simplest way to do that is to run reset_running_stats() method on BatchNorm objects:
def drop_to_default(model=None):
    """Reset the running statistics of every batch-norm layer in a model.

    Args:
        model: Module to process.  When omitted, the module-level ``net``
            is used, which keeps the original zero-argument call working.
    """
    target = net if model is None else model
    for m in target.modules():
        # isinstance over the shared base class covers BatchNorm1d/2d/3d
        # and their subclasses, not only exact BatchNorm2d instances.
        if isinstance(m, nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
Below is this method's source code:
def reset_running_stats(self) -> None:
    """Restore the running statistics to their neutral starting values.

    Does nothing when ``self.track_running_stats`` is false.
    """
    if self.track_running_stats:
        # running_mean/running_var/num_batches... are registered at runtime depending
        # if self.track_running_stats is on
        self.running_mean.zero_()  # Zero (neutral) mean
        self.running_var.fill_(1)  # One (neutral) variance
        self.num_batches_tracked.zero_()  # Number of batches tracked
You can see the source code here, _NormBase class.

Tensor size mismatch autoencoder pytorch

I'm using stacked Autoencoder, which is a bunch of Conv layers.
However, I'm having a tensor mismatch error, and I'm not sure about the reason. Everything done in the Encoder is reversed in the Decoder!
This is for time-series data. Input shape is (batch_size, 1, 3000)
Here's the code
class CDAutoEncoder(nn.Module):
    """One encode/decode stage: strided ``Conv1d`` and its ``ConvTranspose1d``.

    Args:
        input_size: Number of channels on the data side.
        output_size: Number of channels on the encoded side.
        kernel: Kernel size shared by both convolutions.
        stride: Stride shared by both convolutions.
    """

    def __init__(self, input_size, output_size, kernel, stride):
        super().__init__()
        self.forward_pass = nn.Sequential(
            nn.Conv1d(input_size, output_size, kernel_size=kernel, stride=stride, padding=0),
            nn.PReLU(),
        )
        self.backward_pass = nn.Sequential(
            nn.ConvTranspose1d(output_size, input_size, kernel_size=kernel, stride=stride, padding=0),
            nn.PReLU(),
        )

    def forward(self, x):
        """Encode ``x`` and return the encoded tensor."""
        return self.forward_pass(x)

    def reconstruct(self, x, output_length=None):
        """Decode ``x`` back to the data side.

        Args:
            x: Encoded tensor.
            output_length: Desired length of the last dimension.  A strided
                convolution maps up to ``stride`` different input lengths
                to the same output length, so the transposed convolution
                cannot recover the original length by itself.  When given,
                the length is forwarded as ``output_size``; when omitted,
                the shortest candidate length is produced.
        """
        if output_length is None:
            return self.backward_pass(x)
        deconv, activation = self.backward_pass[0], self.backward_pass[1]
        return activation(deconv(x, output_size=[output_length]))


class StackedAutoEncoder(nn.Module):
    """Three stacked ``CDAutoEncoder`` stages whose output matches the input shape."""

    def __init__(self):
        super().__init__()
        self.ae1 = CDAutoEncoder(1, 32, 50, 10)
        self.ae2 = CDAutoEncoder(32, 64, 10, 3)
        self.ae3 = CDAutoEncoder(64, 64, 5, 1)

    def forward(self, x):
        """Encode ``x`` through all stages and decode it back to ``x``'s shape."""
        a1 = self.ae1(x)
        a2 = self.ae2(a1)
        a3 = self.ae3(a2)
        # Pass the length each decoder must restore, innermost stage first.
        lengths = (a2.shape[-1], a1.shape[-1], x.shape[-1])
        return self.reconstruct(a3, output_lengths=lengths)

    def reconstruct(self, x, output_lengths=None):
        """Decode ``x`` through the stages in reverse order.

        Args:
            x: Output of the innermost encoder.
            output_lengths: Optional ``(len_after_ae3, len_after_ae2,
                len_after_ae1)`` target lengths for the three decoders.
                When omitted, each decoder yields its shortest candidate.
        """
        if output_lengths is None:
            output_lengths = (None, None, None)
        len_a2, len_a1, len_x = output_lengths
        a2_reconstruct = self.ae3.reconstruct(x, output_length=len_a2)
        a1_reconstruct = self.ae2.reconstruct(a2_reconstruct, output_length=len_a1)
        x_reconstruct = self.ae1.reconstruct(a1_reconstruct, output_length=len_x)
        return x_reconstruct
The error:
RuntimeError: The size of tensor a (2990) must match the size of tensor b (3000) at non-singleton dimension 2
I've tried adding padding and it worked, but when I changed the kernel size I get different tensor-size-mismatch-error.
Apparently, there's nothing like 'same' padding, so is there automated solution for this?

Resources