How does `optimizer.step()` perform an in-place operation? - pytorch

Here is a simple example that results in an in-place operation error.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
def __init__(self):
super(Loss, self).__init__()
def forward(self, x, target):
return x[0,0,0,0]
def block(in_channels, features, name):
return nn.Conv2d(in_channels=in_channels,
out_channels=features,
kernel_size=3,
padding=1,
bias=False)
class SharedNetwork(nn.Module):
def __init__(self):
super().__init__()
self.shared_layer = block(in_channels=3, features=1, name="wow")
def forward(self, x):
x = self.shared_layer(x)
return x
class Network1(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-1")
def forward(self, x):
return self.conv(x)
class Network2(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-2")
def forward(self, x):
return torch.sigmoid(self.conv(x))
shared_net = SharedNetwork()
net_1 = Network1()
segmentor = Network2()
optimizer = optim.Adam(list(shared_net.parameters()) + list(segmentor.parameters()), lr=1e-6)
optimizer_conf = optim.Adam(list(shared_net.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
optimizer.zero_grad()
optimizer_conf.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
optimizer.step()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
Error message:
UserWarning: Error detected in ConvolutionBackward0. No forward pass information available. Enable detect anomaly during forward pass for more information. (Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:97.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1, 3, 3, 3]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I was able to solve the problem by changing the order of running the step function of optimizers.
optimizer_conf.zero_grad()
optimizer.zero_grad()
features = shared_net(fake_data)
segmented = segmentor(features)
s_loss = loss_fn(segmented, target_data_2)
s_loss.backward(retain_graph=True)
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
optimizer_conf.step()
optimizer.step()
The following questions, however, remain:
How does the step method cause an inplace operation in convolution?
Why does moving the steps to the end of the file resolve this error?
NOTE: The loss function is used for simplicity, using dice-loss also results in the same error!

Before answering the question, I have to mention that it seems having multiple optimizers for one set of parameters is anti-pattern and it's better to be avoided.
How does the step method cause an inplace operation in convolution?
A: step method adds the gradients to the weights, so it does something like the following:
param.weight += param.grad
which can be interpreted as an in place operation
Why does moving the steps to the end of the file resolve this error?
A: Obviously, by moving the step method after the second backward method, the above-mentioned operation is not executed. As a result, there are no in-place operations and no errors raised due to their existence.
To sum up, it's best to have only one optimizer for one set of parameters, the previous example could coded in the following way:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch import optim
torch.autograd.set_detect_anomaly(True)
class Loss(nn.Module):
def __init__(self):
super(Loss, self).__init__()
def forward(self, x, target):
return x[0,0,0,0]
def block(in_channels, features, name):
return nn.Conv2d(in_channels=in_channels,
out_channels=features,
kernel_size=(3,3),
padding=1,
bias=False)
class SharedNetwork(nn.Module):
def __init__(self):
super().__init__()
self.shared_layer = block(in_channels=3, features=1, name="wow")
def forward(self, x):
x = self.shared_layer(x)
return x
class Network1(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-1")
def forward(self, x):
return self.conv(x)
class Network2(nn.Module):
def __init__(self):
super().__init__()
self.conv = block(in_channels=1, features=1, name="wow-2")
def forward(self, x):
return torch.sigmoid(self.conv(x))
torch.manual_seed(0)
shared_net = SharedNetwork()
net_1 = Network1()
net_2 = Network2()
shared_optimizer = optim.Adam(list(shared_net.parameters()), lr=1e-6)
net_1_optimizer = optim.Adam(list(net_1.parameters()), lr=1e-6)
net_2_optimizer = optim.Adam(list(segmentor.parameters()), lr=1e-6)
loss_fn = Loss()
# 2. Run a forward pass
fake_data = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_1 = torch.randint(0,255,(1, 3, 256, 256))/255
target_data_2 = torch.randint(0,255,(1, 3, 256, 256))/255
net_2_optimizer.zero_grad()
features = shared_net(fake_data)
net_2_out = net_2(features)
s_loss = loss_fn(net_2_out, target_data_2)
s_loss.backward(retain_graph=True)
net_2_optimizer.step()
net_1_optimizer.zero_grad()
shared_optimizer.zero_grad()
out_1 = net_1(features)
loss = loss_fn(out_1, target_data_1)
loss.backward(retain_graph=False)
net_1_optimizer.step()
shared_optimizer.step()
Note: If you want to have two different learning rates for different losses applied to one set of parameters, you can multiply the losses based on their importance by a value. For example, you can multiply loss_1 by 0.1 and loss_1 by 0.5. Or, you can use backward hooks as mentioned in this comment:
backward-hook

Related

How to rewrite the torch.nn.LayerNorm function?

I need to rewrite the layer normalization with torch without parameters to adjust different data size.
I have checked the API document of nn.LayerNorm
and made some implementations with torch and numpy. In my test results, there is a few difference with torch and totally equal with numpy. Comparing with nn.LayerNorm with elementwise_affine =True, the torch implementation doesn't perform so well, and the numpy implementation perform very poor. Is there any problem? Can't I directly use them like nn.LayerNorm ?
With torch
class Layer_norm(nn.Module):
def __init__(self, eps=1e-6):
super(Layer_norm, self).__init__()
self.eps = eps
def forward(self, x):
mean = torch.mean(x, dim=(1, 2, 3), keepdim=True)
var = torch.var(x, dim=(1, 2, 3), keepdim=True)
std = torch.sqrt(var + self.eps)
return (x - mean[:,None,None,None]) / std[:,None,None,None]
With numpy
class Layer_norm(nn.Module):
def __init__(self, eps=1e-5):
super(Layer_norm, self).__init__()
self.eps = eps
def forward(self, x):
mean = np.mean(x.cpu().numpy(), axis=(1,2,3))
var = np.var(x.cpu().numpy(), axis=(1,2,3))
div = np.sqrt(var+1e-5)
out = (x.cpu().numpy()-mean[:,None,None,None])/div[:,None,None,None]
return torch.from_numpy(out).float.cuda()

Implementing 1D self attention in PyTorch

I'm trying to implement the 1D self-attention block below using PyTorch:
proposed in the following paper. Below you can find my (provisional) attempt:
import torch.nn as nn
import torch
#INPUT shape ((B), CH, H, W)
class Self_Attention1D(nn.Module):
def __init__(self, in_channels=1, out_channels=3):
super().__init__()
self.pointwise_conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=(1,1))
self.pointwise_conv2 = nn.Conv1d(in_channels=out_channels, out_channels=in_channels, kernel_size=(1,1))
self.phi = MLP(in_size = out_channels, out_size=32)
self.psi = MLP(in_size = out_channels, out_size=32)
self.gamma = MLP(in_size=32, out_size=out_channels)
def forward(self, x):
x = self.pointwise_conv1(x)
phi = self.phi(x.transpose(1,3))
psi = self.psi(x.transpose(1,3))
delta = phi-psi
gamma = self.gamma(delta).transpose(3,1)
out = self.pointwise_conv2(torch.mul(gamma,x))
return out
class MLP(nn.Module):
def __init__(self, in_size, out_size):
super().__init__()
self.in_size = in_size
self.out_size = out_size
self.layers = nn.Sequential(
nn.Linear(in_size, 64),
nn.ReLU(),
nn.Linear(64,128),
nn.ReLU(),
nn.Linear(128,64),
nn.ReLU(),
nn.Linear(64,out_size))
def forward(self, x):
out = self.layers(x)
return out
I'm not sure at all that this is correct, as the operations in my implementation are happening globally while as displayed in the image we should compute some operation between each entry and its neighbours one at a time. I was initially tempted to instantiate a for loop to iteratively compute the neural networks delta,phi,psi for each entry, but I felt that it wasn't the right way to do that.
Apologies if this is trivial but I still don't have a huge experience in PyTorch.

How to drop running stats to default value for Norm layer in pyTorch?

I trained model on some images. Now to fit similar dataset but with another colors I want to load this model but also i want to drop all running stats from Batchnorm layers (set them to default value, like totally untrained). What parameters should i reset? Simple model looks like this
import torch
import torch.nn as nn
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv0 = nn.Conv2d(3, 3, 3, padding = 1)
self.norm = nn.BatchNorm2d(3)
self.conv = nn.Conv2d(3, 3, 3, padding = 1)
def forward(self, x):
x = self.conv0(x)
x = self.norm(x)
return self.conv(x)
net = Net()
##or for pretrained it will be
##net = torch.load('net.pth')
def drop_to_default():
for m in net.modules():
if type(m) == nn.BatchNorm2d:
####???####
drop_to_default()
Simplest way to do that is to run reset_running_stats() method on BatchNorm objects:
def drop_to_default():
for m in net.modules():
if type(m) == nn.BatchNorm2d:
m.reset_running_stats()
Below is this method's source code:
def reset_running_stats(self) -> None:
if self.track_running_stats:
# running_mean/running_var/num_batches... are registered at runtime depending
# if self.track_running_stats is on
self.running_mean.zero_() # Zero (neutral) mean
self.running_var.fill_(1) # One (neutral) variance
self.num_batches_tracked.zero_() # Number of batches tracked
You can see the source code here, _NormBase class.

Pytorch DataParallel doesn't work when the model contain tensor operation

If my model contains only nn.Module layers such as nn.Linear, nn.DataParallel works fine.
x = torch.randn(100,10)
class normal_model(torch.nn.Module):
def __init__(self):
super(normal_model, self).__init__()
self.layer = torch.nn.Linear(10,1)
def forward(self, x):
return self.layer(x)
model = normal_model()
model = nn.DataParallel(model.to('cuda:0'))
model(x)
However, when my model contains a tensor operation such as the following
class custom_model(torch.nn.Module):
def __init__(self):
super(custom_model, self).__init__()
self.layer = torch.nn.Linear(10,5)
self.weight = torch.ones(5,1, device='cuda:0')
def forward(self, x):
return self.layer(x) # self.weight
model = custom_model()
model = torch.nn.DataParallel(model.to('cuda:0'))
model(x)
It gives me the following error
RuntimeError: Caught RuntimeError in replica 1 on device 1. Original
Traceback (most recent call last): File
"/opt/conda/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py",
line 60, in _worker
output = module(*input, **kwargs) File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py",
line 541, in call
result = self.forward(*input, **kwargs) File "", line 7, in forward
return self.layer(x) # self.weight RuntimeError: arguments are located on different GPUs at
/pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:277
How to avoid this error when we have some tensor operations in our model?
I have no experience with DataParallel, but I think it might be because your tensor is not part of the model parameters. You can do this by writing:
torch.nn.Parameter(torch.ones(5,1))
Note that you don't have to move it to the gpu when initializing, because now when you call model.to('cuda:0') this is done automatically.
I can imagine that DataParallel uses the model parameters to move them to the appropriate gpu.
See this answer for more on the difference between a torch tensor and torch.nn.Parameter.
If you don't want the tensor values to be updated by backpropagation during training, you can add requires_grad=False.
Another way that might work is to override the to method, and initialize the tensor in the forward pass:
class custom_model(torch.nn.Module):
def __init__(self):
super(custom_model, self).__init__()
self.layer = torch.nn.Linear(10,5)
def forward(self, x):
return self.layer(x) # torch.ones(5,1, device=self.device)
def to(self, device: str):
new_self = super(custom_model, self).to(device)
new_self.device = device
return new_self
or something like this:
class custom_model(torch.nn.Module):
def __init__(self, device:str):
super(custom_model, self).__init__()
self.layer = torch.nn.Linear(10,5)
self.weight = torch.ones(5,1, device=device)
def forward(self, x):
return self.layer(x) # self.weight
def to(self, device: str):
new_self = super(custom_model, self).to(device)
new_self.device = device
new_self.weight = torch.ones(5,1, device=device)
return new_self
Adding to the answer from #Elgar de Groot since OP also wanted to freeze that layer. To do so you can still use torch.nn.Parameter but then you explicitly set requires_grad to false like this:
self.layer = torch.nn.Parameter(torch.ones(5,1))
self.layer.requires_grad = False

Batchsize in input shape of chainer CNN

I have a training set of 9957 images. The training set has shape (9957, 3, 60, 80).
Is batchsize required when putting training set to model?
If required can the original shape be considered correct for fitting to conv2D layer or do I need to add batchsize to input_shape?
X_train.shape
(9957, 60,80,3)
from chainer.datasets import split_dataset_random
from chainer.dataset import DatasetMixin
import numpy as np
class MyDataset(DatasetMixin):
def __init__(self, X, labels):
super(MyDataset, self).__init__()
self.X_ = X
self.labels_ = labels
self.size_ = X.shape[0]
def __len__(self):
return self.size_
def get_example(self, i):
return np.transpose(self.X_[i, ...], (2, 0, 1)), self.labels_[i]
batch_size = 3
label_train = y_trainHot1
dataset = MyDataset(X_train1, label_train)
dataset_train, valid = split_dataset_random(dataset, 8000, seed=0)
train_iter = iterators.SerialIterator(dataset_train, batch_size)
valid_iter = iterators.SerialIterator(valid, batch_size, repeat=False,
shuffle=False)
The code below tells you that you do not have to care the batch-size by yourself. You just use DatsetMixin and SerialIterator as is instructed in the tutorial of chainer.
from chainer.dataset import DatasetMixin
from chainer.iterators import SerialIterator
import numpy as np
NUM_IMAGES = 9957
NUM_CHANNELS = 3 # RGB
IMAGE_WIDTH = 60
IMAGE_HEIGHT = 80
NUM_CLASSES = 10
BATCH_SIZE = 32
TRAIN_SIZE = min(8000, int(NUM_IMAGES * 0.9))
images = np.random.rand(NUM_IMAGES, NUM_CHANNELS, IMAGE_WIDTH, IMAGE_HEIGHT)
labels = np.random.randint(0, NUM_CLASSES, (NUM_IMAGES,))
class MyDataset(DatasetMixin):
def __init__(self, images_, labels_):
# note: input arg.'s tailing underscore is just to avoid shadowing
super(MyDataset, self).__init__()
self.images_ = images_
self.labels_ = labels_
self.size_ = len(labels_)
def __len__(self):
return self.size_
def get_example(self, i):
return self.images_[i, ...], self.labels_[i]
dataset_train = MyDataset(images[:TRAIN_SIZE, ...], labels[:TRAIN_SIZE])
dataset_valid = MyDataset(images[TRAIN_SIZE:, ...], labels[TRAIN_SIZE:])
train_iter = SerialIterator(dataset_train, BATCH_SIZE)
valid_iter = SerialIterator(dataset_valid, BATCH_SIZE, repeat=False, shuffle=False)
###############################################################################
"""This block is just for the confirmation.
.. note: NOT recommended to call :func:`concat_examples` in your code.
Use :class:`chainer.updaters.StandardUpdater` instead.
"""
from chainer.dataset import concat_examples
batch_image, batch_label = concat_examples(next(train_iter))
print("batch_image.shape\n{}".format(batch_image.shape))
print("batch_label.shape\n{}".format(batch_label.shape))
Output
batch_image.shape
(32, 3, 60, 80)
batch_label.shape
(32,)
It should be noted that chainer.dataset.concat_example is a little bit tricky part. Usually, the users do not pay attention to this function, if you use StandardUpdater which conceals the native function chainer.dataset.concat_example.
Since chainer is designed on the scheme of Trainer, (Standard)Updater, some Optimizer, (Serial)Iterator and Dataset(Mixin), if you do not follow this scheme, you have to dive into the sea of chainer source code.

Resources