Error while running Convolutional Autoencoder RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn - pytorch

I am new to PyTorch and am creating a model for the first time. I am trying to create a convolutional autoencoder and am getting an error while running the model. The code I am using is:
class MyDataset(Dataset):
    def __init__(self, image_paths, target_paths, train=True):
        self.image_paths = image_paths
        self.target_paths = target_paths

    def transform(self, image, target):
        # Transform to tensor
        resize = transforms.Resize(size=(2350, 1650))
        image = resize(image)
        target = resize(target)
        grayscale = transforms.Grayscale(1)
        image = grayscale(image)
        target = grayscale(target)
        image = TF.to_tensor(image)
        target = TF.to_tensor(target)
        return image, target

    def __getitem__(self, index):
        image = Image.open(self.image_paths[index])
        target = Image.open(self.target_paths[index])
        x, y = self.transform(image, target)
        return x, y

    def __len__(self):
        return len(self.image_paths)

traindata = MyDataset(image_paths=train_data, target_paths=target_data, train=True)
testdata = MyDataset(image_paths=test_data, target_paths=None, train=False)

train_loader = DataLoader(traindata, batch_size=1, shuffle=True, num_workers=4)
test_loader = DataLoader(testdata, batch_size=1, shuffle=False, num_workers=4)
class ConvolutionalAutoEncoder(nn.Module):
    def __init__(self):
        super(ConvolutionalAutoEncoder, self).__init__()
        self.encoder_block1 = nn.Sequential(
            nn.Conv2d(1, 64, 3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(64, 64, 3, stride=1, padding=1),
            nn.ReLU(True)
        )
        self.decoder_block1 = nn.Sequential(
            nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
            nn.ReLU(True)
        )
        self.decoder_block0 = nn.Sequential(
            nn.ConvTranspose2d(64, 1, 3, stride=1, padding=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x1 = self.encoder_block1(x)
        y1 = self.decoder_block1(x1)
        y0 = self.decoder_block0(y1)
        return x
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print(device)
model = ConvolutionalAutoEncoder().to(device)

# Loss and optimizer
learning_rate = 0.001
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

params = list(model.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

num_epochs = 30
total_step = len(train_loader)
for epoch in range(num_epochs):
    for batch_idx, data in enumerate(train_loader):
        inp, targ = data
        inp = inp.to(device)
        targ = targ.to(device)

        output = model(inp)
        loss = criterion(output, targ)

        model.zero_grad()
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (batch_idx+1) % 10 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch+1, num_epochs, i+1, total_step, loss.item()))
The full error I am getting is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-17-28fa0c94d845> in <module>
13
14 model.zero_grad()
---> 15 loss.backward()
16 optimizer.step()
17
~/anaconda3/envs/gautam_new/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
91 products. Defaults to ``False``.
92 """
---> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
94
95 def register_hook(self, hook):
~/anaconda3/envs/gautam_new/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
88 Variable._execution_engine.run_backward(
89 tensors, grad_tensors, retain_graph, create_graph,
---> 90 allow_unreachable=True) # allow_unreachable flag
91
92
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
Please help. Also, if possible, advise on how I can make my model deeper; I keep getting a CUDA out of memory error.
Thanks.

I cannot test your model, but considering the error message it makes sense that the cause of your problem lies in the return value of your forward.
Currently you are returning x, which is your actual input, not the output:
def forward(self, x):
    x1 = self.encoder_block1(x)
    y1 = self.decoder_block1(x1)
    y0 = self.decoder_block0(y1)
    return x
So to return the output you might want to change the return value from x to y0:
def forward(self, x):
    x1 = self.encoder_block1(x)
    y1 = self.decoder_block1(x1)
    y0 = self.decoder_block0(y1)
    return y0
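The reason this fixes the error: the raw input x comes straight from the DataLoader and does not require gradients, so a loss computed from it has no grad_fn to backpropagate through, while y0 is produced by the network's layers and therefore carries the autograd graph. A minimal sanity check (with hypothetical random tensors, just to illustrate the difference) could look like:

# Minimal sketch: check that the loss is now connected to the autograd graph.
inp = torch.rand(1, 1, 64, 64).to(device)    # hypothetical grayscale input batch
targ = torch.rand(1, 1, 64, 64).to(device)   # hypothetical target of the same shape
out = model(inp)                              # with `return y0`, out carries a grad_fn
loss = criterion(out, targ)
print(loss.grad_fn)                           # an MSE backward node instead of None
loss.backward()                               # no longer raises the RuntimeError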
About the memory: please don't put too many issues in one question. Imagine you have three completely different issues in one question, and there are three people out there who could each solve one of them; you might end up with no answer at all, because none of them is able to give you a complete answer addressing all of these issues. But if you split your issues into three questions, you might get three answers solving all of them. In many cases it also improves the question, because you can be more specific about each problem without writing an entire novel.

Of course, if your issues are very related you can put them into one question, but that doesn't seem to be the case here. I guess there is still a slight chance your forward function had some side effects leading to the memory problem (wild guess - not sure at all about this). So if you're lucky this might solve your memory problem too, but if not you should definitely open a new question about it.

Related

CNN-LSTM for image sequences classification | high loss

I'm working on a project where I need to classify image sequences of some plants (growing over time). I tried implementing a CNN-LSTM with a pretrained ResNet18 as a feature extractor, then feeding those feature sequences to the LSTM.
The issue is that I'm not used to training LSTMs, and I'm afraid I'm doing something wrong. I made a clear architecture and everything seems OK, but the loss is not decreasing.
Here's the architecture:
class RecurrentCNN(nn.Module):
    def __init__(self, embed_dim, hidden_size, num_layers, num_classes):
        super(RecurrentCNN, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes

        self.cnn = torchvision.models.resnet18(weights='DEFAULT')
        self.cnn.fc = nn.Sequential(
            nn.Linear(in_features=512, out_features=self.embed_dim, bias=False),
            nn.BatchNorm1d(num_features=self.embed_dim)
        )

        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.BatchNorm1d(num_features=hidden_size),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        batch_size, img_size = x.shape[0], x.shape[2:]
        x = x.reshape(-1, *img_size)  # merge batch_size and num_seq in order to feed everything to the cnn
        x = self.cnn(x)
        x = x.reshape(batch_size, -1, self.embed_dim)  # then go back to the original shape
        # lstm part
        h_0 = torch.autograd.Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        c_0 = torch.autograd.Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).to(device)
        x, (hn, cn) = self.lstm(x, (h_0, c_0))
        x = x[:, -1, :]
        x = self.fc(x)
        return x
I have 40 classes to output. My sequences have different lengths, so I sometimes had to pad them with black images (mean seq length: 39, max: 55, min: 15).
I'm feeding the model sequences of shape (batch_size, seq_len=55, 3, 112, 112).
That may be wrong, but for now I just want to make sure the model is at least working correctly; then I'll probably change the learning strategy.
Here's the training code:
EPOCHS = 10
BATCH_SIZE = 4

dataset = PlantDataset(data_path, max_sequence_len=55, transform=None)
train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True
)

rcnn = RecurrentCNN(embed_dim=128, hidden_size=256, num_layers=2, num_classes=len(class_list)).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(rcnn.parameters(), lr=0.0001)
loss_am = list()  # AverageMeter()

rcnn.train()
for epoch in range(EPOCHS):
    progress = tqdm(range(dataset.__len__() * BATCH_SIZE))
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        sequences, targets = data
        sequences, targets = sequences.to(device, dtype=torch.float), torch.Tensor(targets).to(device)
        output = torch.nn.functional.log_softmax(rcnn(sequences), dim=1)
        loss_value = criterion(output, targets)
        loss_value.backward()
        optimizer.step()

        with torch.no_grad():
            loss_am.append(loss_value.item())
            progress.update(i)
            progress.set_description('Epoch: {}, Loss: {:.4f}'.format(epoch, loss_value.item()))
    progress.close()
The loss on each batch goes like:
3.53 => 4.22 => 4.62 => 3.83 => 3.75 => 3.80 => 3.70, etc.
Do you have any idea?
I am facing the same issue but have not been able to find the problem. Since I am using an image-sequence dataset, my model is not able to predict the tokens; instead, I end up with a whole set of garbage tokens. I am still trying to figure out why this is happening.

RuntimeError: Expected hidden[0] size (1, 1, 512), got (1, 128, 512) for LSTM pytorch

I trained the LSTM with a batch size of 128, and during testing my batch size is 1. Why do I get this error? Am I supposed to re-initialize the hidden state when testing?
Here is the code that I'm using. I initialize the hidden state in the init_hidden function as (number_of_layers, batch_size, hidden_size), since batch_first=True.
class ImageLSTM(nn.Module):
    def __init__(self, n_inputs: int = 49,
                 n_outputs: int = 4096,
                 n_hidden: int = 256,
                 n_layers: int = 1,
                 bidirectional: bool = False):
        """
        Takes a 1D flatten images.
        """
        super(ImageLSTM, self).__init__()
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_outputs = n_outputs
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        self.lstm = nn.LSTM(input_size=self.n_inputs,
                            hidden_size=self.n_hidden,
                            num_layers=self.n_layers,
                            dropout=0.5 if self.n_layers > 1 else 0,
                            bidirectional=self.bidirectional,
                            batch_first=True)

        if (self.bidirectional):
            self.FC = nn.Sequential(
                nn.Linear(self.n_hidden*2, self.n_outputs),
                nn.Dropout(p=0.5),
                nn.Sigmoid()
            )
        else:
            self.FC = nn.Sequential(
                nn.Linear(self.n_hidden, self.n_outputs),
                # nn.Dropout(p=0.5),
                nn.Sigmoid()
            )

    def init_hidden(self, batch_size, device=None):  # input 4D tensor: (batch size, channels, width, height)
        # initialize the hidden and cell state to zero
        # vectors: (number of layers, batch size, number of hidden nodes)
        if (self.bidirectional):
            h0 = torch.zeros(2*self.n_layers, batch_size, self.n_hidden)
            c0 = torch.zeros(2*self.n_layers, batch_size, self.n_hidden)
        else:
            h0 = torch.zeros(self.n_layers, batch_size, self.n_hidden)
            c0 = torch.zeros(self.n_layers, batch_size, self.n_hidden)

        if device is not None:
            h0 = h0.to(device)
            c0 = c0.to(device)
        self.hidden = (h0, c0)

    def forward(self, X):  # X: tensor of shape (batch_size, channels, width, height)
        # forward propagate LSTM
        lstm_out, self.hidden = self.lstm(X, self.hidden)  # lstm_out: tensor of shape (batch_size, seq_length, hidden_size)
        # Decode the hidden state of the last time step
        out = self.FC(lstm_out[:, -1, :])
        return out
Please edit your post and add code. How did you initialize the hidden state? What does your model look like?
hidden[0] is not your hidden size, it's the hidden state of the LSTM. The hidden state has to be initialized with this shape:
hidden = (torch.zeros((layers, batch_size, hidden_size)), torch.zeros((layers, batch_size, hidden_size)))
You seem to have done this correctly. But the error tells you that you gave a batch of size 1 (because, as you said, you want to test with only one sample) while the hidden state was initialized with batch_size=128.
So I guess (please add code) that you hard-coded batch_size = 128. Don't do that. Since you have to reinitialize the hidden state on every forward pass, you can do this:
...
def forward(self, x):
    batch_size = x.shape[0]
    hidden = (torch.zeros(self.layers, batch_size, self.hidden_size).to(device=device),
              torch.zeros(self.layers, batch_size, self.hidden_size).to(device=device))
    output, hidden = self.lstm(x, hidden)
    # then do whatever you want with the output
I guess that this is what causes this error but please post your code, too!
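As a follow-up, with the ImageLSTM class from the question you can get the same effect without changing forward by rebuilding the hidden state for the current batch size before each test pass. A minimal sketch (test_loader and device are assumed to exist and yield batches of size 1):

model = ImageLSTM()
model.eval()
with torch.no_grad():
    for X in test_loader:                             # test batches of size 1
        model.init_hidden(X.shape[0], device=device)  # hidden becomes (n_layers, 1, n_hidden)
        out = model(X)                                # no more size-mismatch error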

Multi class classification - RuntimeError: 1D target tensor expected, multi-target not supported

My goal is to build a multi-class image classifier using PyTorch, based on the EMNIST dataset (black and white pictures of letters).
The shape of my training data X_train is (124800, 28, 28).
The shape of the original target variables y_train is (124800, 1), however I created a one-hot encoding so that now the shape is (124800, 26).
The model that I am building should have 26 output variables, each representing the probability of one letter.
I read in my data as follows:
import scipy.io

emnist = scipy.io.loadmat(DATA_DIR + '/emnist-letters.mat')
data = emnist['dataset']

X_train = data['train'][0, 0]['images'][0, 0]
X_train = X_train.reshape((-1, 28, 28), order='F')

y_train = data['train'][0, 0]['labels'][0, 0]
Then, I created a one-hot-encoding as follows:
y_train_one_hot = np.zeros([len(y_train), 27])

for i in range(0, len(y_train)):
    y_train_one_hot[i, y_train[i][0]] = 1

y_train_one_hot = np.delete(y_train_one_hot, 0, 1)
I create the dataset with:
train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train_one_hot))

batch_size = 128
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
And then I build my model as follows:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()

        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=0)
        self.relu1 = nn.ReLU()

        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(2, 2)

        # Convolution 2
        self.cnn2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=0)
        self.relu2 = nn.ReLU()

        # Max pool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)

        # Fully connected 1 (readout)
        self.fc1 = nn.Linear(32 * 4 * 4, 26)

    def forward(self, x):
        # Convolution 1
        out = self.cnn1(x.float())
        out = self.relu1(out)

        # Max pool 1
        out = self.maxpool1(out)

        # Convolution 2
        out = self.cnn2(out)
        out = self.relu2(out)

        # Max pool 2
        out = self.maxpool2(out)

        # Resize
        # Original size: (100, 32, 7, 7)
        # out.size(0): 100
        # New out size: (100, 32*7*7)
        out = out.view(out.size(0), -1)

        # Linear function (readout)
        out = self.fc1(out)

        return out

model = CNNModel()

criterion = nn.CrossEntropyLoss()

learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
And then I train the model as follows:
iter = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Add a single channel dimension
        # From: [batch_size, height, width]
        # To: [batch_size, 1, height, width]
        images = images.unsqueeze(1)

        # Forward pass to get output/logits
        outputs = model(images)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 500 == 0:
            # Calculate Accuracy
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in test_loader:
                images = images.unsqueeze(1)

                # Forward pass only to get logits/output
                outputs = model(images)

                # Get predictions from the maximum value
                _, predicted = torch.max(outputs.data, 1)

                # Total number of labels
                total += labels.size(0)
                correct += (predicted == labels).sum()

            accuracy = 100 * correct / total

            # Print Loss
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iter, loss.data[0], accuracy))
However, when I run this, I get the following error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-11-c26c43bbc32e> in <module>()
21
22 # Calculate Loss: softmax --> cross entropy loss
---> 23 loss = criterion(outputs, labels)
24
25 # Getting gradients w.r.t. parameters
3 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
930 def forward(self, input, target):
931 return F.cross_entropy(input, target, weight=self.weight,
--> 932 ignore_index=self.ignore_index, reduction=self.reduction)
933
934
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2315 if size_average is not None or reduce is not None:
2316 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2317 return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
2318
2319
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
2113 .format(input.size(0), target.size(0)))
2114 if dim == 2:
-> 2115 ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2116 elif dim == 4:
2117 ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: 1D target tensor expected, multi-target not supported
I expect that I am doing something wrong when I initialize/use my loss function. What can I do so that I can start training my model?
If you are using cross-entropy loss, you shouldn't one-hot encode your target variable y.
PyTorch's CrossEntropyLoss expects just the class indices as the target, not their one-hot encoded version.
To cite the doc https://pytorch.org/docs/master/generated/torch.nn.CrossEntropyLoss.html :
This criterion expects a class index in the range [0, C-1] as the target for each value of a 1D tensor of size minibatch;
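A minimal sketch of that fix, reusing the names from the question (and assuming the original EMNIST letter labels run from 1 to 26, as implied by the one-hot code above), is to build the dataset from integer class indices instead of the one-hot matrix:

# Shift labels 1..26 down to 0..25 and keep them as int64 class indices
y_train_indices = (y_train.reshape(-1) - 1).astype(np.int64)

train_dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(X_train),
    torch.from_numpy(y_train_indices)      # shape (N,), dtype torch.long
)

# later, in the training loop, labels already have shape (batch_size,):
# loss = criterion(outputs, labels)        # works directly with nn.CrossEntropyLoss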

Maxpool2d error is showing while there is no Maxpool2d

I run the following code to train a neural network that contains a CNN with max pooling and two fully-connected layers:
class Net(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        torch.manual_seed(0)
        super(Net, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.conv1 = nn.Conv1d(embedding_size, 64, 3)
        self.drop1 = nn.Dropout(0.5)
        self.max_pool1 = nn.MaxPool1d(2)
        self.flat1 = nn.Flatten()
        self.fc1 = nn.Linear(64*99, 100)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, sentence):
        embedding = self.word_embeddings(sentence).permute(0, 2, 1)
        conv1 = F.relu(self.conv1(embedding))
        drop1 = self.drop1(conv1)
        max_pool1 = self.max_pool1(drop1)
        flat1 = self.flat1(max_pool1)
        fc1 = F.relu(self.fc1(flat1))
        fc2 = torch.sigmoid(self.fc2(fc1))
        return fc2
net = Net(vocab_size, EMBEDDING_SIZE)
EPOCHS = 10
net.cuda()
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)
loader = DataLoader(train, batch_size=32)

net.train()
for epoch in range(EPOCHS):
    progress = tqdm_notebook(loader, leave=False)
    for inputs, target in progress:
        net.zero_grad()
        output = net(inputs.to(device))
        loss = criterion(output, target.to(device))
        loss.backward()
        optimizer.step()
    print(loss)
and I get the following error (the error trace has been updated and it includes the complete trace):
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py:498: UserWarning: Using a target size (torch.Size([32])) that is different to the input size (torch.Size([32, 1])) is deprecated. Please ensure they have the same size.
return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-2-3c8a885417ba> in <module>()
33 for inputs, target in progress:
34 net.zero_grad()
---> 35 output = net(inputs.to(device))
36 loss = criterion(output, target.to(device))
37 loss.backward()
5 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in _max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices)
455 stride = torch.jit.annotate(List[int], [])
456 return torch.max_pool1d(
--> 457 input, kernel_size, stride, padding, dilation, ceil_mode)
458
459 max_pool1d = boolean_dispatch(
RuntimeError: max_pool2d_with_indices_out_cuda_frame failed with error code 0
I do not have any Maxpool2ds in my code! Could anybody help me with this problem?

PyTorch CNN never converges (implementation issue suspected)

I am having trouble getting this network to work as desired. I have tried so many iterations of this model and yet cannot get a reasonable error (it never fits, can’t even get it to overfit).
Where have I gone wrong? Any help would be greatly appreciated
For reference, there are 12 input ‘images’ (they’re actually water surface elevation at 9 stations in an estuary) of shape 49,9 and 12 labels of shape 1,9.
Full examples with data can be found at https://gitlab.com/jb4earth/effonn/
net = []
class Net(torch.nn.Module):
    def __init__(self, kernel_size):
        super(Net, self).__init__()
        mid_size = (49*49*9)
        self.predict = torch.nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=mid_size,
                kernel_size=kernel_size,
                stride=1,
                padding=(0, 0)
            ),
            nn.ReLU(),
            nn.MaxPool2d(1),
            nn.ReLU(),
            nn.Conv2d(
                in_channels=mid_size,
                out_channels=1,
                kernel_size=kernel_size,
                stride=1,
                padding=(0, 0)
            ),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.predict(x)
        return x

def train_network(x, y, optimizer, loss_func):
    prediction = net(x)
    loss = loss_func(prediction, y.squeeze())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return prediction, loss


net = Net((1,1))
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
loss_func = torch.nn.MSELoss()

cnt = 0
t = True
while t == True:
    # get_xy in place of DataLoader
    (x, y) = get_xy(input_data, output_data, cnt)
    # x.shape is 1,1,49,9
    # y.shape is 1,1,1,9

    # train and predict
    (prediction, loss) = train_network(x, y, optimizer, loss_func)

    # prediction shape different than desired so averaging all results
    prediction_ = torch.mean(prediction)

    # only 12 IO's so loop through
    cnt += 1
    if cnt > 11:
        cnt = 0
Take a look here, this looks suspicious: you are calculating the loss and then zeroing the gradients. zero_grad should be called before calculating the loss. So you need to move optimizer.zero_grad() to the top, and I assume it will work. I couldn't reproduce your example, which is why I'm guessing this is your error.
loss = loss_func(prediction, y.squeeze())
optimizer.zero_grad() # switch this to the top
loss.backward()
optimizer.step()
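For reference, a minimal sketch of the suggested ordering, applied to the train_network function from the question, could look like this:

def train_network(x, y, optimizer, loss_func):
    optimizer.zero_grad()                      # clear old gradients first
    prediction = net(x)
    loss = loss_func(prediction, y.squeeze())
    loss.backward()                            # accumulate fresh gradients
    optimizer.step()                           # update the weights
    return prediction, loss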
