I've read in the documentation that for nn.ConvTranspose2d the output dimensions are calculated as follows:
H_out = (H_in − 1) × stride[0] − 2 × padding[0] + dilation[0] × (kernel_size[0] − 1) + output_padding[0] + 1
W_out = (W_in − 1) × stride[1] − 2 × padding[1] + dilation[1] × (kernel_size[1] − 1) + output_padding[1] + 1
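A quick way to sanity-check these formulas is to write them as a small helper; this is just a sketch, and the function name conv_transpose_out is illustrative, not part of PyTorch:

def conv_transpose_out(in_size, kernel_size, stride=1, padding=0,
                       output_padding=0, dilation=1):
    # Output size of nn.ConvTranspose2d along one spatial dimension.
    return (in_size - 1) * stride - 2 * padding \
        + dilation * (kernel_size - 1) + output_padding + 1

# e.g. the height produced by ConvTranspose2d(..., kernel_size=(5, 4), stride=(2, 2), padding=(2, 1))
# applied to a 6-pixel-high input:
print(conv_transpose_out(6, kernel_size=5, stride=2, padding=2))  # -> 11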
Starting from noise of shape (64, 128), I want to arrive at a final image of shape (64, 1, 103, 8), i.e. (batch, channels, height, width).
I've done the following:
class Generator(nn.Module):
    def __init__(self, z_dim):
        super(Generator, self).__init__()
        self.z_dim = z_dim
        self.lin1 = nn.Linear(z_dim, 6144)
        self.gen = nn.Sequential(
            self._block(in_channels=256, out_channels=128, kernel_size=(5, 4), stride=(2, 2), padding=(2, 1)),
            self._block(in_channels=128, out_channels=128, kernel_size=(4, 3), stride=(2, 1), padding=(1, 1)),
            self._block(in_channels=128, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            self._block(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            self._block(in_channels=64, out_channels=64, kernel_size=(3, 2), stride=(2, 2), padding=(1, 4)),
            self._block(in_channels=64, out_channels=32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            nn.ConvTranspose2d(32, 1, (3, 3)),
        )
In general, I'm unsure how to calibrate the intermediate dimensions so that I arrive at the desired output shape.
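One practical way to calibrate the intermediate shapes is to push a dummy tensor through the layers one at a time and print the shape after each step. The sketch below assumes each _block wraps an nn.ConvTranspose2d with the arguments shown above, and that the 6144 features from lin1 are reshaped to (256, 6, 4) (since 256 × 6 × 4 = 6144); adjust both assumptions to match your actual code:

import torch
import torch.nn as nn

# Assumed reshape of the 6144-dim linear output: 256 * 6 * 4 = 6144.
x = torch.randn(64, 256, 6, 4)

# Only the ConvTranspose2d part of each block is probed here.
layers = [
    nn.ConvTranspose2d(256, 128, kernel_size=(5, 4), stride=(2, 2), padding=(2, 1)),
    nn.ConvTranspose2d(128, 128, kernel_size=(4, 3), stride=(2, 1), padding=(1, 1)),
    nn.ConvTranspose2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
    nn.ConvTranspose2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
    nn.ConvTranspose2d(64, 64, kernel_size=(3, 2), stride=(2, 2), padding=(1, 4)),
    nn.ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
    nn.ConvTranspose2d(32, 1, (3, 3)),
]

for layer in layers:
    x = layer(x)
    print(tuple(x.shape))  # watch how the height and width evolve layer by layer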
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(64 * 8 * 11, 5)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions


if __name__ == "__main__":
    cnn = CNNNetwork()
    cnn.to(device=dml)
    summary(cnn, (1, 11, 8), device=dml)
In this code, logits = self.linear(x) is causing a problem when generating the summary. There are no error messages, only the output shown in the attached screenshot.
I want to generate a summary of the model and later train an audio classifier where the MFCC size is (30, 281).
Could you also explain how the input size for the linear layer is properly calculated?
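One robust way to find the linear layer's in_features, rather than computing it by hand, is to pass a dummy tensor of the intended input shape through the conv blocks and read off the flattened size. A minimal sketch using the CNNNetwork class defined above; the shape (1, 11, 8) is taken from the summary call, so swap in your real MFCC shape as needed:

import torch

cnn = CNNNetwork()

# Dummy batch of one sample with the shape you intend to feed the network:
# (batch, channels, height, width)
dummy = torch.zeros(1, 1, 11, 8)

with torch.no_grad():
    features = cnn.conv3(cnn.conv2(cnn.conv1(dummy)))

print(features.shape)             # channels x height x width after the conv blocks
print(features.flatten(1).shape)  # the second dimension is the in_features nn.Linear expects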
I need to train an autoencoder on the Adaptiope dataset. I am using a ResNet18 backbone for the encoder part.
The issue I encounter is that even after many epochs, the reconstructed image is always completely black.
On the other hand, when I use a simpler Autoencoder without the resnet18 backbone, reconstructed images turn out close to what I need them to be.
I am trying to understand why this is the case. I am a novice in the field and still cannot grasp the problem; it looks like an architectural issue, but I cannot wrap my head around it.
This is my "vanilla" Encoder, with no resnet18 backbone:
class Encoder(nn.Module):
    def __init__(self,
                 num_input_channels: int,
                 base_channel_size: int,
                 latent_dim: int):
        """
        Inputs:
            - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the encoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.layer1 = nn.Sequential(
            nn.Conv2d(num_input_channels, c_hid, kernel_size=3, padding=1, stride=2),  # 32x32 => 16x16
            nn.ReLU(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2),  # 16x16 => 8x8
            nn.ReLU(),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1, stride=2),  # 8x8 => 4x4
            nn.ReLU(),
            nn.Flatten(),  # Image grid to single feature vector
            nn.Linear(351232, latent_dim))
        self.linear2 = nn.Linear(latent_dim, 20 * 8)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        enc = self.layer1(x)
        lin_p = self.linear2(enc)
        p = self.softmax(lin_p)
        return enc, p
This is the Encoder with Resnet18 backbone:
class Encoder(nn.Module):
    def __init__(self,
                 num_input_channels: int,
                 base_channel_size: int,
                 latent_dim: int):
        """
        Inputs:
            - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the encoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.fc_hidden1, self.fc_hidden2, self.CNN_embed_dim = 224, 768, 224
        # CNN architectures
        self.ch1, self.ch2, self.ch3, self.ch4 = 16, 32, 64, 128
        self.k1, self.k2, self.k3, self.k4 = (5, 5), (3, 3), (3, 3), (3, 3)  # 2d kernel size
        self.s1, self.s2, self.s3, self.s4 = (2, 2), (2, 2), (2, 2), (2, 2)  # 2d strides
        self.pd1, self.pd2, self.pd3, self.pd4 = (0, 0), (0, 0), (0, 0), (0, 0)  # 2d padding
        # encoding components
        model = models.resnet18(pretrained=True)
        for param in model.parameters():
            param.requires_grad = False
        modules = list(model.children())[:-1]  # delete the last fc layer.
        self.resnet_modules = modules
        self.resnet = nn.Sequential(*modules)
        self.fc1 = nn.Linear(model.fc.in_features, self.fc_hidden1)
        self.bn1 = nn.BatchNorm1d(self.fc_hidden1, momentum=0.01)
        self.relu = nn.ReLU(inplace=True)
        self.layer = nn.Sequential(
            nn.Flatten(),  # Image grid to single feature vector
            nn.Linear(224, latent_dim))  # 8x224
        # self.flatten = nn.Flatten()  # Image grid to single feature vector
        # self.linear1 = nn.Linear(351232, latent_dim)
        self.linear2 = nn.Linear(latent_dim, 20 * 8)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.resnet(x)
        x = x.reshape(x.shape[0], 512)
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        enc = self.layer(x)
        # x = self.fc2(x)
        # x = self.bn(x)
        # enc = self.layer1(x)
        lin_p = self.linear2(enc)
        p = self.softmax(lin_p)
        return enc, p
The decoder is the same for both.
class Decoder_N(nn.Module):
    def __init__(self,
                 num_input_channels: int,
                 base_channel_size: int,
                 latent_dim: int,
                 act_fn: object = nn.GELU):
        """
        Inputs:
            - num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the decoder network
        """
        super().__init__()
        c_hid = 224
        self.linear = nn.Sequential(
            nn.Linear(latent_dim, 351232),
            nn.ReLU()
        )
        self.net = nn.Sequential(
            nn.ConvTranspose2d(2 * c_hid, 2 * c_hid, kernel_size=3, output_padding=1, padding=1, stride=2),  # 4x4 => 8x8
            nn.ReLU(),
            nn.Conv2d(2 * c_hid, 2 * c_hid, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(2 * c_hid, c_hid, kernel_size=3, output_padding=1, padding=1, stride=2),  # 8x8 => 16x16
            nn.ReLU(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(c_hid, 3, kernel_size=3, output_padding=1, padding=1, stride=2),  # 16x16 => 32x32
            nn.Tanh()  # The input images are scaled between -1 and 1, hence the output has to be bounded as well
        )

    def forward(self, x):
        x = self.linear(x)
        x = x.reshape(x.shape[0], -1, 28, 28)
        x = self.net(x)
        return x
num_input_channels: 224
base_channel_size: 3
latent_dim: 64
I expected the "advanced" autoencoder to extract my features better, but apparently this is not the case.
I solved the issue: the problems were with the image normalization and the BatchNorm layer. I had accidentally normalized the dataset with the ImageNet mean and std instead of the correct ones. Additionally, during training I forgot to add regularizers for the different components of my loss, which led the model to learn essentially nothing.
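For anyone hitting the same normalization issue, the per-channel mean and std of a dataset can be estimated directly from the training loader and then used in transforms.Normalize instead of the ImageNet statistics. A minimal sketch, where train_loader is a placeholder for your own DataLoader:

import torch

def channel_stats(loader):
    # Estimate per-channel mean and std over a loader yielding (images, labels)
    # batches of shape (B, C, H, W), already scaled to [0, 1] by ToTensor().
    n_pixels = 0
    channel_sum = torch.zeros(3)
    channel_sq_sum = torch.zeros(3)
    for images, _ in loader:
        b, c, h, w = images.shape
        n_pixels += b * h * w
        channel_sum += images.sum(dim=(0, 2, 3))
        channel_sq_sum += (images ** 2).sum(dim=(0, 2, 3))
    mean = channel_sum / n_pixels
    std = (channel_sq_sum / n_pixels - mean ** 2).sqrt()
    return mean, std

# mean, std = channel_stats(train_loader)                        # train_loader is a placeholder
# normalize = transforms.Normalize(mean.tolist(), std.tolist())  # use instead of ImageNet stats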
I ran the code to show the latent-space images, but the images are not displayed in RGB. If you know what causes this result, could you tell me? I would like each image to be displayed in RGB.
The problematic code and its result are:
random_latent_vectors = tf.random.normal(shape=(10, 128))
generator = make_generator(128)
images = generator(random_latent_vectors)
images *= 255
images = images.numpy()
images.shape

plt.figure(figsize=(8, 3))
for i in range(images.shape[0]):
    plt.subplot(2, 5, i + 1)
    plt.imshow(images[i, :, :, 0].astype("int32"))
    plt.axis('off')
plt.show()
The generator is built from:
def make_generator(latent_dim):
    model = keras.Sequential([
        keras.Input(shape=(latent_dim,)),
        layers.Dense(8 * 8 * 128),
        layers.Reshape((8, 8, 128)),
        layers.Conv2DTranspose(128, kernel_size=4, strides=2, padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2DTranspose(256, kernel_size=4, strides=2, padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2DTranspose(512, kernel_size=4, strides=2, padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(3, kernel_size=5, padding="same", activation="sigmoid")
    ])
    return model
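For what it's worth, images[i, :, :, 0] selects only the first colour channel, so matplotlib renders a single-channel array with its default colormap. Below is a sketch of how the plotting loop could pass all three channels instead, assuming the same images array as above; matplotlib displays an (H, W, 3) array as RGB when it is uint8 in 0-255 (or float in 0-1):

plt.figure(figsize=(8, 3))
for i in range(images.shape[0]):
    plt.subplot(2, 5, i + 1)
    # keep all three channels; cast to uint8 since the values were scaled to 0-255
    plt.imshow(images[i].astype("uint8"))
    plt.axis('off')
plt.show()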
def __init__(self):
    super().__init__()
    self.conv = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=5, stride=2, bias=False),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=2, bias=False),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.Conv2d(64, 64, kernel_size=3, stride=2, bias=False),
        nn.BatchNorm2d(64),
    )
How can I deal with this error? I think the error is related to self.fc, but I can't figure out how to fix it.
The output from self.conv(x) has shape torch.Size([32, 64, 2, 2]), and 32 * 64 * 2 * 2 = 8192 (this is equivalent to self.conv_out_size). A fully connected layer expects a single-dimension vector, i.e. you need to flatten the conv output before passing it to the fully connected layer in the forward function, for example:
class Network():
    ...
    def forward(self, x):
        ...
        conv_out = self.conv(x)
        print(conv_out.shape)
        conv_out = conv_out.view(-1, 32 * 64 * 2 * 2)
        print(conv_out.shape)
        x = self.fc(conv_out)
        return x
Output:
torch.Size([32, 64, 2, 2])
torch.Size([1, 8192])
EDIT:
I think you're using the self._get_conv_out function incorrectly. It should be:
def _get_conv_out(self, shape):
    output = self.conv(torch.zeros(1, *shape))  # not (32, *size)
    return int(numpy.prod(output.size()))
then, in the forward pass, you can use
conv_out = self.conv(x)
# flatten the output of conv layers
conv_out = conv_out.view(conv_out.size(0), -1)
x = self.fc(conv_out)
For an input of (32, 1, 110, 110), the output should be torch.Size([32, 2]).
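Putting the pieces together, a self-contained sketch of how this fits into the full module might look like the following; the nn.Linear(conv_out_size, 2) head is an assumption standing in for the original self.fc:

import numpy
import torch
import torch.nn as nn


class Network(nn.Module):
    def __init__(self, input_shape=(1, 110, 110), n_outputs=2):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=5, stride=2, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2, bias=False),
            nn.BatchNorm2d(64),
        )
        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Linear(conv_out_size, n_outputs)  # assumed head, sized from the dummy pass

    def _get_conv_out(self, shape):
        output = self.conv(torch.zeros(1, *shape))
        return int(numpy.prod(output.size()))

    def forward(self, x):
        conv_out = self.conv(x)
        conv_out = conv_out.view(conv_out.size(0), -1)  # flatten per sample
        return self.fc(conv_out)


net = Network()
print(net(torch.randn(32, 1, 110, 110)).shape)  # torch.Size([32, 2])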
I had the same problem, but I solved it by using a batch size of 32 and a tensor size of [3, 32, 32] for my images, with the following configuration for my model. I am using a ResNet with 9 convolutional layers and 4 outputs.
transform = transforms.Compose([transforms.Resize((32, 32)), transforms.ToTensor()])
def conv_block(in_channels, out_channels, pool=False):
    layers = [nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
              nn.BatchNorm2d(out_channels),
              nn.ReLU(inplace=True)]
    if pool: layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)
class ResNet9(ImageClassificationBase):
    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.conv1 = conv_block(in_channels, 64)
        self.conv2 = conv_block(64, 128, pool=True)
        self.res1 = nn.Sequential(conv_block(128, 128), conv_block(128, 128))
        self.conv3 = conv_block(128, 256, pool=True)
        self.conv4 = conv_block(256, 512, pool=True)
        self.res2 = nn.Sequential(conv_block(512, 512), conv_block(512, 512))
        self.classifier = nn.Sequential(nn.MaxPool2d(4),
                                        nn.Flatten(),
                                        nn.Dropout(0.2),
                                        nn.Linear(512, num_classes))

    def forward(self, xb):
        out = self.conv1(xb)
        out = self.conv2(out)
        out = self.res1(out) + out
        out = self.conv3(out)
        out = self.conv4(out)
        out = self.res2(out) + out
        out = self.classifier(out)
        return out
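As a quick sanity check of the shapes under these settings, with conv_block and ResNet9 defined as above; ImageClassificationBase is this answer's own base class, and a plain nn.Module stand-in is enough for the shape check:

import torch
import torch.nn as nn

class ImageClassificationBase(nn.Module):
    # Placeholder base class just for the shape check.
    pass

# ... conv_block and ResNet9 defined as above ...

model = ResNet9(in_channels=3, num_classes=4)
dummy = torch.randn(32, 3, 32, 32)  # a batch of 32 images after Resize((32, 32))
print(model(dummy).shape)           # expected: torch.Size([32, 4])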
I am trying to understand why my classifier has a dimension issue. Here is my code:
class convnet(nn.Module):
    def __init__(self, num_classes=1000):
        super(convnet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2),  # stride=2
            nn.Conv2d(32, 64, kernel_size=3, stride=1),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(576, 128),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(64),
            nn.Linear(64, num_classes),
            nn.Softmax(),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)  # x.view(x.size(0), 256 * 6 * 6)
        x = self.classifier(x)
        return x


def neuralnet(num_classes, **kwargs):
    model = convnet(**kwargs)
    return model
So my issue is this error: expected 4D input (got 2D input).
I'm quite sure the error arises from the flatten command; however, I don't really understand why, since the classifier is fully connected. If someone knows where I'm going wrong, that would be very helpful!
Thank you
After flattening, the input to the classifier has 2 dimensions (size: [batch_size, 576]), therefore the output of the first linear layer will also have 2 dimensions (size: [batch_size, 128]). That output is then passed to nn.BatchNorm2d, which requires its input to have 4 dimensions (size: [batch_size, channels, height, width]).
If you want to use batch norm on a 2D input, you need to use nn.BatchNorm1d, which accepts either a 3D input (size: [batch_size, channels, length]) or a 2D input (size: [batch_size, length]).
self.classifier = nn.Sequential(
    nn.Linear(576, 128),
    nn.BatchNorm1d(128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 64),
    nn.ReLU(inplace=True),
    nn.BatchNorm1d(64),
    nn.Linear(64, num_classes),
    nn.Softmax(),
)
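A quick way to confirm the fix is to run a dummy flattened batch through the corrected classifier on its own; num_classes=10 here is just an arbitrary value for the check:

import torch
import torch.nn as nn

num_classes = 10  # arbitrary value for the check
classifier = nn.Sequential(
    nn.Linear(576, 128),
    nn.BatchNorm1d(128),
    nn.ReLU(inplace=True),
    nn.Linear(128, 64),
    nn.ReLU(inplace=True),
    nn.BatchNorm1d(64),
    nn.Linear(64, num_classes),
    nn.Softmax(dim=1),
)

dummy = torch.randn(8, 576)     # a flattened batch, as produced by torch.flatten(x, 1)
print(classifier(dummy).shape)  # torch.Size([8, 10]) -- no 4D/2D mismatch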