Pytorch Convolutional Autoencoders - pytorch

How does one construct the decoder part of a convolutional autoencoder? Suppose I have this
(input -> conv2d -> maxpool2d -> maxunpool2d -> convTranspose2d -> output):
# CIFAR images shape = 3 x 32 x 32
class ConvDAE(nn.Module):
    """Small convolutional autoencoder for 3 x 32 x 32 (CIFAR-sized) images.

    The encoder halves the spatial size with max pooling; the decoder brings
    it back to 32 x 32 with a stride-2 transposed convolution.
    """

    def __init__(self):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()

        # input: batch x 3 x 32 x 32 -> output: batch x 16 x 16 x 16
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, 3, stride=1, padding=1),  # batch x 16 x 32 x 32
            nn.MaxPool2d(2, stride=2),  # batch x 16 x 16 x 16
        )

        # input: batch x 16 x 16 x 16 -> output: batch x 3 x 32 x 32
        self.decoder = nn.Sequential(
            # this line does not work
            # nn.MaxUnpool2d(2, stride=2, padding=0), # batch x 16 x 32 x 32
            nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1, output_padding=1),  # batch x 16 x 32 x 32
            nn.ConvTranspose2d(16, 3, 3, stride=1, padding=1, output_padding=0),  # batch x 3 x 32 x 32
        )

    def forward(self, x):
        """Encode and then decode ``x``; the output has the same shape as the input."""
        out = self.encoder(x)
        out = self.decoder(out)
        return out
PyTorch-specific question: why can't I use MaxUnpool2d in the decoder part? It gives me the following error:
TypeError: forward() missing 1 required positional argument: 'indices'
And the conceptual question: shouldn't the decoder apply the inverse of whatever the encoder did? I have seen some implementations, and they seem to care only about the dimensions of the decoder's input and output. Here and here are some examples.

For the torch part of the question: unpool modules take, as a required positional argument, the indices produced by the corresponding pooling module, which the pooling module returns when it is created with return_indices=True. So you could do
class ConvDAE(nn.Module):
    """Convolutional autoencoder that undoes max pooling with ``MaxUnpool2d``.

    ``MaxUnpool2d`` needs the indices chosen by the pooling layer, so the
    pooling layer is created with ``return_indices=True`` and the unpool layer
    is kept outside ``nn.Sequential`` (which only forwards a single value).
    """

    def __init__(self):
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()

        # input: batch x 3 x 32 x 32 -> output: batch x 16 x 16 x 16 (+ indices)
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 16, 3, stride=1, padding=1),  # batch x 16 x 32 x 32
            # Last layer of the Sequential, so its (output, indices) tuple is
            # returned to the caller unchanged.
            nn.MaxPool2d(2, stride=2, return_indices=True),
        )

        # batch x 16 x 16 x 16 -> batch x 16 x 32 x 32
        self.unpool = nn.MaxUnpool2d(2, stride=2, padding=0)

        # The unpool step has already restored 32 x 32, so both transposed
        # convolutions keep the spatial size (stride 1). A stride of 2 here
        # would double it again and yield a 64 x 64 reconstruction.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(16, 16, 3, stride=1, padding=1, output_padding=0),  # batch x 16 x 32 x 32
            nn.ConvTranspose2d(16, 3, 3, stride=1, padding=1, output_padding=0),  # batch x 3 x 32 x 32
        )

    def forward(self, x):
        """Return a reconstruction with the same shape as ``x``."""
        out, indices = self.encoder(x)
        out = self.unpool(out, indices)
        out = self.decoder(out)
        return out
As for the general part of the question, I don't think the state of the art is to use a symmetric decoder, as it has been shown that deconvolution/transposed convolution produces checkerboard artifacts, and many approaches tend to use upsampling modules instead. You will find more information faster through the PyTorch channels.


Convolutional Autoencoder outputs black images

I need to train an autoencoder on Adaptiope dataset. I am using a ResNet18 backbone for my encoder part.
The issue I encounter is that even after many epochs, the reconstructed image is always completely black.
On the other hand, when I use a simpler Autoencoder without the resnet18 backbone, reconstructed images turn out close to what I need them to be.
I am trying to understand why is this the case. I am a novice in the field and still cannot grasp the problem. It looks like an architectural problem but I cannot wrap my head around it.
This is my "vanilla" Encoder, with no resnet18 backbone:
class Encoder(nn.Module):
    """Plain convolutional encoder producing a latent code and a softmax head."""

    def __init__(self,
                 num_input_channels: int,
                 base_channel_size: int,
                 latent_dim: int):
        """
        Inputs:
            - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z

        NOTE(review): the flattened feature size below is hard-coded to
        351232 (= 2 * 224 * 28 * 28), which presumably corresponds to
        base_channel_size=224 with 224x224 inputs - confirm against the caller.
        NOTE(review): no activation layers are visible between the
        convolutions in this snippet; they may have been lost - confirm.
        """
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        c_hid = base_channel_size
        # Three stride-2 convolutions: each one halves the spatial size.
        self.layer1 = nn.Sequential(
            nn.Conv2d(num_input_channels, c_hid, kernel_size=3, padding=1, stride=2),  # H x W => H/2 x W/2
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            nn.Conv2d(c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2),  # => H/4 x W/4
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2),  # => H/8 x W/8
            nn.Flatten(),  # Image grid to single feature vector
            nn.Linear(351232, latent_dim),
        )
        self.linear2 = nn.Linear(latent_dim, 20 * 8)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return ``(enc, p)``: the latent code and a softmax over 160 outputs."""
        enc = self.layer1(x)
        lin_p = self.linear2(enc)
        p = self.softmax(lin_p)
        return enc, p
This is the Encoder with Resnet18 backbone:
class Encoder(nn.Module):
    """Encoder that uses a frozen, pretrained ResNet18 as feature extractor.

    The ResNet (without its final fc layer) feeds a trainable
    Linear -> BatchNorm1d -> ReLU -> Linear head that produces the latent
    code; a further linear layer plus softmax yields a 160-way distribution.
    """

    def __init__(self,
                 num_input_channels: int,
                 base_channel_size: int,
                 latent_dim: int):
        """
        Inputs:
            - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z

        Note: ``num_input_channels`` and ``base_channel_size`` are accepted
        for signature parity with the plain encoder but are not used here.
        """
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        self.fc_hidden1, self.fc_hidden2, self.CNN_embed_dim = 224, 768, 224

        # CNN architectures (these attributes are set but not read in this class)
        self.ch1, self.ch2, self.ch3, self.ch4 = 16, 32, 64, 128
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)  # 2d kernel size
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)  # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding

        # encoding components
        model = models.resnet18(pretrained=True)
        # Freeze the backbone: only the layers defined below receive gradients.
        for param in model.parameters():
            param.requires_grad = False
        modules = list(model.children())[:-1]  # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)

        self.fc1 = nn.Linear(model.fc.in_features, self.fc_hidden1)
        self.bn1 = nn.BatchNorm1d(self.fc_hidden1, momentum=0.01)
        self.relu = nn.ReLU(inplace=True)
        self.layer = nn.Sequential(
            nn.Flatten(),  # Image grid to single feature vector
            nn.Linear(self.fc_hidden1, latent_dim),
        )
        self.linear2 = nn.Linear(latent_dim, 20 * 8)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return ``(enc, p)``: the latent code and a softmax over 160 outputs."""
        x = self.resnet(x)
        # Collapse the pooled feature map to (batch, features) for fc1.
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        enc = self.layer(x)
        lin_p = self.linear2(enc)
        p = self.softmax(lin_p)
        return enc, p
The decoder is the same for both.
class Decoder_N(nn.Module):
    """Decoder that maps a latent vector to a 3-channel image in [-1, 1]."""

    def __init__(self,
                 num_input_channels: int,
                 base_channel_size: int,
                 latent_dim: int,
                 act_fn: object = nn.GELU):
        """
        Inputs:
            - num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the decoder network

        Note: in this implementation the hidden width is fixed to 224 and the
        output to 3 channels, so ``num_input_channels``, ``base_channel_size``
        and ``act_fn`` are accepted but not used.
        NOTE(review): activation layers between the convolutions may have been
        lost from this snippet - confirm.
        """
        # nn.Module must be initialised before submodules can be assigned.
        super().__init__()
        c_hid = 224
        # 351232 == 2 * c_hid * 28 * 28: forward() reshapes to (N, 448, 28, 28).
        self.linear = nn.Sequential(
            nn.Linear(latent_dim, 351232),
        )
        # Three stride-2 transposed convolutions, each doubling the spatial size.
        self.net = nn.Sequential(
            nn.ConvTranspose2d(2 * c_hid, 2 * c_hid, kernel_size=3, output_padding=1, padding=1, stride=2),  # 28x28 => 56x56
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1),
            nn.ConvTranspose2d(2 * c_hid, c_hid, kernel_size=3, output_padding=1, padding=1, stride=2),  # 56x56 => 112x112
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            nn.ConvTranspose2d(c_hid, 3, kernel_size=3, output_padding=1, padding=1, stride=2),  # 112x112 => 224x224
            nn.Tanh(),  # The input images is scaled between -1 and 1, hence the output has to be bounded as well
        )

    def forward(self, x):
        """Project ``x`` to a 28x28 feature grid and upsample it to an image."""
        x = self.linear(x)
        x = x.reshape(x.shape[0], -1, 28, 28)
        x = self.net(x)
        return x
num_input_channels : 224,
base_channel_size : 3
latent_dim : 64
I expected the "advanced" autoencoder to extract my features better, but apparently this is not the case.
I solved the issue: there were issues with the normalization of images and the BatchNorm layer. I accidentally used mean and std of ImageNet for the dataset instead of the correct ones. Additionally, during training I forgot to add regularizers for the different components of my loss, leading my model to learn literally nothing.

torch.nn.functional.binary_cross_entropy and torch.nn.BCEloss() difference

I am trying to train a GAN model on anime face Dataset to generate anime faces. Here's my code-
import os

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision.transforms as T
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.utils import make_grid
%matplotlib inline
def denorm(img_tensors):
    """Undo normalisation using the first entries of the module-level ``stats``.

    ``stats[0][0]`` is added back after scaling by ``stats[1][0]``.
    """
    offset = stats[0][0]
    scale = stats[1][0]
    return img_tensors * scale + offset
def show_images(images, nmax=64):
    """Plot up to ``nmax`` denormalised images as a grid with 8 per row."""
    _, axes = plt.subplots(figsize=(8, 8))
    axes.set_xticks([])
    axes.set_yticks([])
    subset = images.detach()[:nmax]
    grid = make_grid(denorm(subset), nrow=8)
    # make_grid returns C x H x W; imshow wants H x W x C.
    axes.imshow(grid.permute(1, 2, 0))
def show_batch(dl, nmax=64):
    """Display the first batch yielded by the data loader ``dl``.

    Args:
        dl: iterable yielding ``(images, labels)`` pairs.
        nmax: maximum number of images to draw.
    """
    for images, _ in dl:
        show_images(images, nmax)
        # Only one batch is wanted; without this every batch in the loader
        # would be plotted in its own figure.
        break
def get_default_device():
    """Pick GPU if available, else CPU"""
    name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(name)
def to_device(data, device):
    """Move tensor(s) to chosen device.

    Args:
        data: an object with a ``.to(device, non_blocking=...)`` method (a
            tensor or module), or a (possibly nested) list/tuple of such objects.
        device: the target ``torch.device``.

    Returns:
        The moved object; lists and tuples come back as lists with every
        element moved recursively.
    """
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)
class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        # Generator function: the wrapped loader is not touched until the
        # first batch is requested.
        yield from (to_device(batch, self.device) for batch in self.dl)

    def __len__(self):
        """Number of batches"""
        wrapped = self.dl
        return len(wrapped)
# Model setup: pick the device, wrap the loader, and build both GAN networks.
device = get_default_device()
train_dl = DeviceDataLoader(train_dl, device)

discriminator = nn.Sequential(
    # in: 3 x 64 x 64
    nn.Conv2d(3, 64, kernel_size=4, stride=2, padding=1, bias=False),
    nn.LeakyReLU(0.2, inplace=True),
    # out: 64 x 32 x 32
    nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1, bias=False),
    nn.LeakyReLU(0.2, inplace=True),
    # out: 128 x 16 x 16
    nn.Conv2d(128, 256, kernel_size=4, stride=2, padding=1, bias=False),
    nn.LeakyReLU(0.2, inplace=True),
    # out: 256 x 8 x 8
    nn.Conv2d(256, 512, kernel_size=4, stride=2, padding=1, bias=False),
    nn.LeakyReLU(0.2, inplace=True),
    # out: 512 x 4 x 4
    nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0, bias=False),
    # out: 1 x 1 x 1
    # The training functions compare predictions with (N, 1) targets using
    # binary cross entropy, which needs probabilities of that same shape.
    nn.Flatten(),
    nn.Sigmoid(),
)
discriminator = to_device(discriminator, device)

latent_size = 128

# NOTE(review): the nonlinearities below (ReLU between layers, Tanh at the
# end) follow the usual DCGAN layout; the snippet showed only the transposed
# convolutions, which on their own form a purely linear map. Normalisation
# layers may also have been present originally - confirm.
generator = nn.Sequential(
    # in: latent_size x 1 x 1
    nn.ConvTranspose2d(latent_size, 512, kernel_size=4, stride=1, padding=0, bias=False),
    nn.ReLU(True),
    # out: 512 x 4 x 4
    nn.ConvTranspose2d(512, 256, kernel_size=4, stride=2, padding=1, bias=False),
    nn.ReLU(True),
    # out: 256 x 8 x 8
    nn.ConvTranspose2d(256, 128, kernel_size=4, stride=2, padding=1, bias=False),
    nn.ReLU(True),
    # out: 128 x 16 x 16
    nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1, bias=False),
    nn.ReLU(True),
    # out: 64 x 32 x 32
    nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1, bias=False),
    nn.Tanh(),
    # out: 3 x 64 x 64
)

# Smoke test on the CPU before the generator is moved to the device.
xb = torch.randn(batch_size, latent_size, 1, 1)  # random latent tensors
fake_images = generator(xb)

generator = to_device(generator, device)
def train_discriminator(real_images, opt_d):
    """Run one discriminator update on a real batch and a generated batch.

    Args:
        real_images: batch of real images, already on ``device``.
        opt_d: optimizer over the discriminator's parameters.

    Returns:
        ``(loss, real_score, fake_score)`` as Python floats: the summed BCE
        loss and the mean discriminator output on real and on fake images.
    """
    # Clear discriminator gradients
    opt_d.zero_grad()

    # Pass real images through discriminator
    real_preds = discriminator(real_images)
    real_targets = torch.ones(real_images.size(0), 1, device=device)
    real_loss = F.binary_cross_entropy(real_preds, real_targets)  # here nn.BCELoss() not working
    real_score = torch.mean(real_preds).item()

    # Generate fake images
    latent = torch.randn(batch_size, latent_size, 1, 1, device=device)
    fake_images = generator(latent)

    # Pass fake images through discriminator
    fake_targets = torch.zeros(fake_images.size(0), 1, device=device)
    fake_preds = discriminator(fake_images)
    fake_loss = F.binary_cross_entropy(fake_preds, fake_targets)  # here nn.BCELoss() not working
    fake_score = torch.mean(fake_preds).item()

    # Update discriminator weights
    loss = real_loss + fake_loss
    loss.backward()
    opt_d.step()
    return loss.item(), real_score, fake_score
def train_generator(opt_g):
    """Run one generator update that tries to make fakes score as real.

    Args:
        opt_g: optimizer over the generator's parameters.

    Returns:
        The generator's BCE loss for this step as a Python float.
    """
    # Clear generator gradients
    opt_g.zero_grad()

    # Generate fake images
    latent = torch.randn(batch_size, latent_size, 1, 1, device=device)
    fake_images = generator(latent)

    # Try to fool the discriminator
    preds = discriminator(fake_images)
    targets = torch.ones(batch_size, 1, device=device)
    loss = F.binary_cross_entropy(preds, targets)  # here nn.BCELoss() not working

    # Update generator weights
    loss.backward()
    opt_g.step()
    return loss.item()
from torchvision.utils import save_image
sample_dir = 'generated'
os.makedirs(sample_dir, exist_ok=True)
def save_samples(index, latent_tensors, show=True):
    """Generate images from ``latent_tensors`` and save them as one PNG grid.

    The file is written into ``sample_dir`` under a zero-padded, index-based
    name; when ``show`` is true the grid is also plotted.
    """
    fake_images = generator(latent_tensors)
    fake_fname = 'generated-images-{0:0=4d}.png'.format(index)
    target_path = os.path.join(sample_dir, fake_fname)
    save_image(denorm(fake_images), target_path, nrow=8)
    print('Saving', fake_fname)
    if not show:
        return
    _, axes = plt.subplots(figsize=(8, 8))
    axes.set_xticks([])
    axes.set_yticks([])
    grid = make_grid(fake_images.cpu().detach(), nrow=8)
    axes.imshow(grid.permute(1, 2, 0))
fixed_latent = torch.randn(64, latent_size, 1, 1, device=device)
save_samples(0, fixed_latent)
from tqdm.notebook import tqdm
import torch.nn.functional as F
def fit(epochs, lr, start_idx=1):
    """Train the GAN and return the per-epoch training history.

    Args:
        epochs: number of passes over ``train_dl``.
        lr: learning rate for both Adam optimizers.
        start_idx: offset added to the epoch number in saved sample file names.

    Returns:
        ``(losses_g, losses_d, real_scores, fake_scores)``: four lists with
        one entry per epoch, taken from the last batch of that epoch.
    """
    # Losses & scores
    losses_g = []
    losses_d = []
    real_scores = []
    fake_scores = []

    # Create optimizers
    opt_d = torch.optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))
    opt_g = torch.optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))

    for epoch in range(epochs):
        for real_images, _ in tqdm(train_dl):
            # Train discriminator
            loss_d, real_score, fake_score = train_discriminator(real_images, opt_d)
            # Train generator
            loss_g = train_generator(opt_g)

        # Record losses & scores
        # Without these appends the returned history lists stay empty.
        losses_g.append(loss_g)
        losses_d.append(loss_d)
        real_scores.append(real_score)
        fake_scores.append(fake_score)

        # Log losses & scores (last batch)
        print("Epoch [{}/{}], loss_g: {:.4f}, loss_d: {:.4f}, real_score: {:.4f}, fake_score: {:.4f}".format(
            epoch+1, epochs, loss_g, loss_d, real_score, fake_score))

        # Save generated images
        save_samples(epoch+start_idx, fixed_latent, show=False)

    return losses_g, losses_d, real_scores, fake_scores
lr = 0.0002
epochs = 95
history = fit(epochs, lr)
The above code is working fine but before I was using nn.BCELoss from torch instead of binary_cross_entropy from torch.nn.functional in 'train_generator()' and 'train_discriminator()' methods above and I was getting the following error,
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
I wonder if they both don't perform the same operation. Can you help me to understand the problem?
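nn.BCELoss is a class. Unlike nn.functional.binary_cross_entropy, you have to instantiate it first before using it to calculate the loss: calling nn.BCELoss(preds, targets) passes the tensors to the constructor instead of computing a loss. In your case,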
F.binary_cross_entropy(preds, targets)
is equivalent to
nn.BCELoss()(preds, targets)

Runtime Error: mat1 and mat2 shapes cannot be multiplied (62x2304 and 1568x3)

I am unable to find the cause of this error. The inputs are 32*32 grayscale images:
class CNN(nn.Module):
    """Two conv blocks followed by a 3-way linear classifier.

    The linear layer expects 32 * 7 * 7 features, i.e. a 7 x 7 map after two
    2x poolings, which corresponds to 28 x 28 single-channel inputs.

    NOTE(review): only the keyword arguments of the first convolution and the
    second ``nn.Conv2d`` were visible in this snippet. ``out_channels=16``
    follows from the second convolution's input width; the ReLU and 2x
    max-pool layers are presumed from the 7 x 7 feature size - confirm.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,  # gray-scale images
                out_channels=16,
                kernel_size=5,  # 5x5 convolutional kernel
                stride=1,  # no. of pixels pass at a time
                padding=2,  # to preserve size of input image
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # fully connected layers
        self.out = nn.Linear(32*7*7, 3)

    def forward(self, x):
        """Return class scores of shape (batch, 3)."""
        x = self.conv1(x)
        x = self.conv2(x)
        # flatten the output of conv2
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output
Your linear layer expects an input of size 32x7x7. Given that your conv1 and conv2 layers each perform max pooling with stride=2, your network is configured for an input size of 28x28 (the usual MNIST input size) and not 32x32 as you expect.
Moreover, considering the values in your error message (62x2304), I assume you are working with batch_size=64 (the 62 rows presumably come from a final, partial batch), and that your images are NOT 32x32 but rather 32x??, with one side slightly larger than 32, resulting in a feature map of 32x8x9 (= 2304 features) after the pooling.

PyTorch convolutional block - CIFAR10 - RuntimeError

I am using PyTorch 1.7 and Python 3.8 with CIFAR-10 dataset. I am trying to create a block with: conv -> conv -> pool -> fc. Fully connected layer (fc) has 256 neurons. The code for this is as follows:
# Testing-
# Layer-by-layer shape check for a conv -> conv -> pool -> fc block.
# This is the question's reproduction: the final fc1 call is expected to
# fail because x is still 4-D at that point.
conv1 = nn.Conv2d(
    in_channels=3, out_channels=64,
    kernel_size=3, stride=1,
    padding=1, bias=True,
)
conv2 = nn.Conv2d(
    in_channels=64, out_channels=64,
    kernel_size=3, stride=1,
    padding=1, bias=True,
)
pool = nn.MaxPool2d(
    kernel_size=2, stride=2,
)
fc1 = nn.Linear(
    in_features=64 * 16 * 16, out_features=256,
    bias=True,
)

# images: torch.Size([32, 3, 32, 32])
x = conv1(images)
# torch.Size([32, 64, 32, 32])
x = conv2(x)
# torch.Size([32, 64, 32, 32])
x = pool(x)
# torch.Size([32, 64, 16, 16])
# This line of code gives error-
x = fc1(x)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (32768x16 and 16384x256)
What is going wrong?
You are nearly there! As you will have noticed, nn.MaxPool2d returns a tensor of shape (32, 64, 16, 16), which is incompatible with the input nn.Linear expects: a 2D tensor of shape (batch, in_features). You need to reshape (flatten) it to (batch, 64*16*16).
I would recommend using an nn.Flatten layer rather than reshaping yourself. It acts like x.view(x.size(0), -1) but is clearer. By default it preserves the first (batch) dimension:
# Layers, declared in the order they are applied.
conv1 = nn.Conv2d(
    in_channels=3,
    out_channels=64,
    kernel_size=3,
    stride=1,
    padding=1,
)
conv2 = nn.Conv2d(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    stride=1,
    padding=1,
)
pool = nn.MaxPool2d(kernel_size=2, stride=2)
flatten = nn.Flatten()
fc1 = nn.Linear(in_features=64*16*16, out_features=256)

# Forward pass: flatten sits between the pooling and the linear layer so
# fc1 receives a 2-D (batch, features) tensor.
x = conv1(images)
x = conv2(x)
x = pool(x)
x = flatten(x)
x = fc1(x)
Alternatively, you could use the functional alternative torch.flatten, where you will have to provide the start_dim as 1: x = torch.flatten(x, start_dim=1).
When you're done debugging, you could assemble your layers with nn.Sequential:
# conv -> conv -> pool -> flatten -> fc, assembled as a single module.
model = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # nn.Linear needs a 2-D (batch, features) input, so the pooled
    # (batch, 64, 16, 16) tensor is flattened first.
    nn.Flatten(),
    nn.Linear(in_features=64*16*16, out_features=256),
)
x = model(images)
You need to flatten the output of the nn.MaxPool2d layer before passing it to the nn.Linear layer.
Try x = x.view(x.size(0), -1) just before the fc layer to flatten the tensor.

`*** RuntimeError: mat1 dim 1 must match mat2 dim 0` whenever I run model(images)

def __init__(self):
self.conv = nn.Sequential(
nn.Conv2d(1, 64, kernel_size=5, stride=2, bias=False),
nn.Conv2d(64, 64, kernel_size=3, stride=2, bias=False),
nn.Conv2d(64, 64, kernel_size=3, stride=2, bias=False),
How can I deal with this error? I think the error is with self.fc, but I can't say how to fix it.
The output from self.conv(x) is of shape torch.Size([32, 64, 2, 2]): 32*64*2*2 = 8192 (this is equivalent to self.conv_out_size). The fully connected layer expects each sample as a one-dimensional feature vector, i.e. you need to flatten the conv output before passing it to the fully connected layer in the forward function.
class Network():
def foward():
conv_out = self.conv(x)
conv_out = conv_out.view(-1, 32*64*2*2)
x = self.fc(conv_out)
return x
torch.Size([32, 64, 2, 2])
torch.Size([1, 8192])
I think you're using self._get_conv_out function wrong.
It should be
def _get_conv_out(self, shape):
    """Return the flattened feature count ``self.conv`` yields for one sample.

    Args:
        shape: input shape without the batch dimension, e.g. ``(C, H, W)``.

    Returns:
        Number of elements in the conv output for a single input, as an int.
    """
    output = self.conv(torch.zeros(1, *shape))  # not (32, *size)
    # The probe batch has size 1, so the element count is the per-sample size.
    return int(output.numel())
then, in the forward pass, you can use
conv_out = self.conv(x)
# flatten the output of conv layers
conv_out = conv_out.view(conv_out.size(0), -1)
x = self.fc(conv_out)
For an input of (32, 1, 110, 110), the output should be torch.Size([32, 2]).
I had the same problem however I have solved it by using a batch of 32 and tensor size of [3, 32, 32] for my images and the following configurations on my model. I am using ResNet with 9 CNN and looking for 4 outputs.
transform = transforms.Compose([transforms.Resize((32, 32)), transforms.ToTensor()])
def conv_block(in_channels, out_channels, pool=False):
    """Build a 3x3 same-padding conv block, optionally followed by 2x max pooling.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the convolution.
        pool: when true, append ``nn.MaxPool2d(2)`` to halve the spatial size.

    Returns:
        An ``nn.Sequential`` containing the block's layers.
    """
    # NOTE(review): the layer list was cut off after the convolution in this
    # snippet; BatchNorm + ReLU are the presumed missing entries - confirm.
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_channels),
        nn.ReLU(inplace=True),
    ]
    if pool:
        layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)
class ResNet9(ImageClassificationBase):
    """Small residual CNN: four conv blocks, two residual pairs, linear head.

    Three of the conv blocks pool by 2, and the classifier pools by 4 before
    a ``Linear(512, num_classes)``, so a 32 x 32 input reaches the linear
    layer as a 512-element vector.
    """

    def __init__(self, in_channels, num_classes):
        # The base class must be initialised before submodules are assigned.
        super().__init__()

        self.conv1 = conv_block(in_channels, 64)
        self.conv2 = conv_block(64, 128, pool=True)
        self.res1 = nn.Sequential(conv_block(128, 128), conv_block(128, 128))

        self.conv3 = conv_block(128, 256, pool=True)
        self.conv4 = conv_block(256, 512, pool=True)
        self.res2 = nn.Sequential(conv_block(512, 512), conv_block(512, 512))

        self.classifier = nn.Sequential(
            nn.MaxPool2d(4),
            # Linear needs (batch, 512); flatten the pooled 512 x 1 x 1 map.
            nn.Flatten(),
            nn.Linear(512, num_classes),
        )

    def forward(self, xb):
        """Return class scores of shape (batch, num_classes)."""
        out = self.conv1(xb)
        out = self.conv2(out)
        out = self.res1(out) + out  # residual connection
        out = self.conv3(out)
        out = self.conv4(out)
        out = self.res2(out) + out  # residual connection
        out = self.classifier(out)
        return out
