RuntimeError: Given groups=3, weight of size 12 64 3 768, expected input[32, 12, 30, 768] to have 192 channels, but got 12 channels instead - python-3.x

I started working with Pytorch recently so my understanding of it isn't quite strong. I previously had a 1 layer CNN but wanted to extend it to 2 layers, but the input and output channels have been throwing errors I can seem to decipher. Why does it expect 192 channels? Can someone give me a pointer to help me understand this better? I have seen several related problems on here, but I don't understand those solutions either.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertConfig, BertModel, BertTokenizer
import math
from transformers import AdamW, get_linear_schedule_with_warmup
def pad_sents(sents, pad_token): # Pad list of sentences according to the longest sentence in the batch.
sents_padded = []
max_len = max(len(s) for s in sents)
for s in sents:
padded = [pad_token] * max_len
padded[:len(s)] = s
sents_padded.append(padded)
return sents_padded
def sents_to_tensor(tokenizer, sents, device):
tokens_list = [tokenizer.tokenize(str(sent)) for sent in sents]
sents_lengths = [len(tokens) for tokens in tokens_list]
tokens_list_padded = pad_sents(tokens_list, '[PAD]')
sents_lengths = torch.tensor(sents_lengths, device=device)
masks = []
for tokens in tokens_list_padded:
mask = [0 if token == '[PAD]' else 1 for token in tokens]
masks.append(mask)
masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)
return sents_tensor, masks_tensor, sents_lengths
class ConvModel(nn.Module):
def __init__(self, device, dropout_rate, n_class, out_channel=16):
super(ConvModel, self).__init__()
self.bert_config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
self.dropout_rate = dropout_rate
self.n_class = n_class
self.out_channel = out_channel
self.bert = BertModel.from_pretrained('bert-base-uncased', config=self.bert_config)
self.out_channels = self.bert.config.num_hidden_layers * self.out_channel
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', config=self.bert_config)
self.conv = nn.Conv2d(in_channels=self.bert.config.num_hidden_layers,
out_channels=self.out_channels,
kernel_size=(3, self.bert.config.hidden_size),
groups=self.bert.config.num_hidden_layers)
self.conv1 = nn.Conv2d(in_channels=self.out_channels,
out_channels=48,
kernel_size=(3, self.bert.config.hidden_size),
groups=self.bert.config.num_hidden_layers)
self.hidden_to_softmax = nn.Linear(self.out_channels, self.n_class, bias=True)
self.dropout = nn.Dropout(p=self.dropout_rate)
self.device = device
def forward(self, sents):
sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
encoded_layers = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)
hidden_encoded_layer = encoded_layers[2]
hidden_encoded_layer = hidden_encoded_layer[0]
hidden_encoded_layer = torch.unsqueeze(hidden_encoded_layer, dim=1)
hidden_encoded_layer = hidden_encoded_layer.repeat(1, 12, 1, 1)
conv_out = self.conv(hidden_encoded_layer) # (batch_size, channel_out, some_length, 1)
conv_out = self.conv1(conv_out)
conv_out = torch.squeeze(conv_out, dim=3) # (batch_size, channel_out, some_length)
conv_out, _ = torch.max(conv_out, dim=2) # (batch_size, channel_out)
pre_softmax = self.hidden_to_softmax(conv_out)
return pre_softmax
def batch_iter(data, batch_size, shuffle=False, bert=None):
batch_num = math.ceil(data.shape[0] / batch_size)
index_array = list(range(data.shape[0]))
if shuffle:
data = data.sample(frac=1)
for i in range(batch_num):
indices = index_array[i * batch_size: (i + 1) * batch_size]
examples = data.iloc[indices]
sents = list(examples.train_BERT_tweet)
targets = list(examples.train_label.values)
yield sents, targets # list[list[str]] if not bert else list[str], list[int]
def train():
label_name = ['Yes', 'Maybe', 'No']
device = torch.device("cpu")
df_train = pd.read_csv('trainn.csv') # , index_col=0)
train_label = dict(df_train.train_label.value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor([label_max / train_label[i] for i in range(len(train_label))], device=device)
model = ConvModel(device=device, dropout_rate=0.2, n_class=len(label_name))
optimizer = AdamW(model.parameters(), lr=1e-3, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000) # changed the last 2 arguments to old ones
model = model.to(device)
model.train()
cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
train_batch_size = 16
for epoch in range(1):
for sents, targets in batch_iter(df_train, batch_size=train_batch_size, shuffle=True): # for each epoch
optimizer.zero_grad()
pre_softmax = model(sents)
loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
loss.backward()
optimizer.step()
scheduler.step()
TrainingModel = train()
Here's a snippet of data https://github.com/Kosisochi/DataSnippet

It seems that the original version of the code you had in this question behaved differently. The final version of the code you have here gives me a different error from what you posted, more specifically - this:
RuntimeError: Calculated padded input size per channel: (20 x 1). Kernel size: (3 x 768). Kernel size can't be greater than actual input size
I apologize if I misunderstood the situation, but it seems to me that your understanding of what exactly nn.Conv2d layer does is not 100% clear and that is the main source of your struggle. I interpret the part "detailed explanation on 2 layer CNN in Pytorch" you requested as an ask to explain in detail on how that layer works and I hope that after this is done there will be no problem applying it 1 time, 2 times or more.
You can find all the documentation about the layer here, but let me give you a recap which hopefully will help to understand more the errors you're getting.
First of all nn.Conv2d inputs are 4-d tensors of the shape (BatchSize, ChannelsIn, Height, Width) and outputs are 4-d tensors of the shape (BatchSize, ChannelsOut, HeightOut, WidthOut). The simplest way to think about nn.Conv2d is of something applied to 2d images with pixel grid of size Height x Width and having ChannelsIn different colors or features per pixel. Even if your inputs have nothing to do with actual images the behavior of the layer is still the same. Simplest situation is when the nn.Conv2d is not using padding (as in your code). In that case the kernel_size=(kernel_height, kernel_width) argument specifies the rectangle which you can imagine sweeping through Height x Width rectangle of your inputs and producing one pixel for each valid position. Without padding the coordinate of the rectangle's point can be any pair of indicies (x, y) with x between 0 and Height - kernel_height and y between 0 and Width - kernel_width. Thus the output will look like a 2d image of size (Height - kernel_height + 1) x (Width - kernel_width + 1) and will have as many output channels as specified to nn.Conv2d constructor, so the output tensor will be of shape (BatchSize, ChannelsOut, Height - kernel_height + 1, Width - kernel_width + 1).
The parameter groups is not affecting how shapes are changed by the layer - it is only controlling which input channels are used as inputs for the output channels (groups=1 means that every input channel is used as input for every output channel, otherwise input and output channels are divided into corresponding number of groups and only input channels from group i are used as inputs for the output channels from group i).
Now in your current version of the code you have BatchSize = 16 and the output of pre-trained model is (BatchSize, DynamicSize, 768) with DynamicSize depending on the input, e.g. 22. You then introduce additional dimension as axis 1 with unsqueeze and repeat the values along that dimension transforming the tensor of shape (16, 22, 768) into (16, 12, 22, 768). Effectively you are using the output of the pre-trained model as 12-channel (with each channel having same values as others) 2-d images here of size (22, 768), where 22 is not fixed (depends on the batch). Then you apply a nn.Conv2d with kernel size (3, 768) - which means that there is no "wiggle room" for width and output 2-d images will be of size (20, 1) and since your layer has 192 channels final size of the output of first convolution layer has shape (16, 192, 20, 1). Then you try to apply second layer of convolution on top of that with kernel size (3, 768) again, but since your 2-d "image" is now just (20 x 1) there is no valid position to fit (3, 768) kernel rectangle inside a rectangle (20 x 1) which leads to the error message Kernel size can't be greater than actual input size.
Hope this explanation helps. Now to the choices you have to avoid the issue:
(a) is to add padding in such a way that the size of the output is not changing comparing to input (I won't go into details here,
because I don't think this is what you need)
(b) Use smaller kernel on both first and/or second convolutions (e.g. if you don't change first convolution the only valid width for
the second kernel would be 1).
(c) Looking at what you're trying to do my guess is that you actually don't want to use 2d convolution, you want 1d convolution (on the sequence) with every position described by 768 values. When you're using one convolution layer with 768 width kernel (and same 768 width input) you're effectively doing exactly same thing as 1d convolution with 768 input channels, but then if you try to apply second one you have a problem. You can specify kernel width as 1 for the next layer(s) and that will work for you, but a more correct way would be to transpose pre-trained model's output tensor by switching the last dimensions - getting shape (16, 768, DynamicSize) from (16, DynamicSize, 768) and then apply nn.Conv1d layer with 768 input channels and arbitrary ChannelsOut as output channels and 1d kernel_size=3 (meaning you look at 3 consecutive elements of the sequence for convolution). If you do that than without padding input shape of (16, 768, DynamicSize) will become (16, ChannelsOut, DynamicSize-2), and after you apply second Conv1d with e.g. the same settings as first one you'll get a tensor of shape (16, ChannelsOut, DynamicSize-4), etc. (each time the 1d length will shrink by kernel_size-1). You can always change number of channels/kernel_size for each subsequent convolution layer too.

Related

How to do a weighted pooling in Mxnet?

I want to do a 2d convolutional operation that uses same 1x2x4 weight on every channel.
(Note: the input height & width are bigger than our kernel, so I can't just use a dot product.)
How can I do this is mxnet?
I tried to use the same instance of a signle 2d conv layer by concatenating it on every channel, but it is incredibly slow.
def Concat(*args, axis=1, **kwargs):
net = nn.HybridConcatenate(axis=axis,**kwargs)
net.add(*args)
return net
def Seq(*args):
net = nn.HybridSequential()
net.add(*args)
return net
class Trim_D1(nn.HybridBlock):
def __init__(self, from_, to, **kwargs):
super(Trim_D1, self).__init__(**kwargs)
self.from_ = from_
self.to = to
def forward(self, x):
return x[:,self.from_:self.to]
PooPool = nn.Conv2D(kernel_size=(2,4), strides=(2, 4), channels=1, activation=None, use_bias=False, weight_initializer=mx.init.Constant(1/8))
conc = ()
for i in range(40):
conc += Seq(
Trim_D1(i,i+1),
PooPool
),
WeightedPool= Concat(*conc)
Ideally I would also want my kernel weights to sum up to 1 in order to resemble the weighted average pooling.
Edit: I think I know how to do this. I'm going to edit Conv2D and _Conv source codes so that instead of creating weights of CxHxW dimension it creates a weight of 1xHxW dimension and uses a broadcasting during the convolutional operation. In order for weights to sum up to 1, additionally a softmax operation has to be applied.
Ok, apparently the weights are of in_channels x out_channels x H x W dimensions and broadcasting is not allowed during the convolutional operation. We could fix out_channels to 1 by using the num_groups same as the output channels, as for input channels, we can simply broadcast the same weight n number of times.
In _Conv.__init__ during initialization I discarded the first two dimensions so our kernel is only H x W now:
self.weight = Parameter('weight', shape=wshapes[1][2:],
init=weight_initializer,
allow_deferred_init=True)
In _Conv.hybrid_forward I am flattening our weight to 1D in order to perform softmax and then restore to the original 2D shape. Then I expand first two dimensions and repeat the first dimension as mentioned above:
orig_shape = weight.shape
act = getattr(F, self._op_name)(x, mx.nd.softmax(weight.reshape(-1)).reshape(orig_shape)[None,None,:].repeat(self._kwargs['num_group'],axis=0), name='fwd', **self._kwargs)

Questions about programming a cnn with PyTorch

I'm pretty new at programming cnn so I'm a little bit lost. I'm trying to do this part of the code, where they ask me to implement a fully-connected network to classify the digits. It should contain 1 hidden layer with 20 units. I should use ReLU activation function on the hidden layer.
class Network(nn.Module):
def __init__(self):
super(Network, self).__init__()
self.fc1 = ...
self.fc2 = nn.Sequential(
nn.Linear(500,10),
nn.Softmax(dim = 1)
)
def forward(self, x):
x = x.view(x.size(0),-1)
x = self.fc1(x)
x = self.fc2(x)
return x
The dots are the part to fill, I think about this line:
self.fc1 = nn.Linear(20, 500)
But I don't know if it's correct. Could someone help me please? And I don't understand at all what the function Softmax do... so if someone knows it please.
Thank you so much!!
Pd. This is the code to load the data:
batch_size = 64
trainset = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=1)
testset = datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor())
test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=1)
From the code given for the model, it can be seen that the hidden layer has 500 units. So I am assuming you meant 20 units for input. With this assumption, the code must be:
self.fc1 = nn.Sequential(
nn.Linear(20, 500),
nn.ReLU()
)
Coming to the next part of your question, given that you are working with MNIST dataset and you have the softmax function, I am assuming you are trying to predict the number present in the images.
Your neural network performs various multiplication and addition operations in each layer and finally, you end up with 10 numbers in the output layer. Now, you have to make sense of these 10 numbers to decide which of the 10 digits is given in the image.
One way to do this would be to select the unit which has the maximum value. For example if the 10th unit has the maximum value among all units, then we conclude that the digit is '9'. If the 2nd unit has the maximum value, then we conclude that the digit is '1'.
This is fine but a better way would be to convert the values of each of the units to probability that the corresponding digit is contained in the image and then we choose the digit having highest probability. This has certain mathematical advantages which helps us in defining a better loss function.
Softmax is what helps us to convert the values to probabilities. On applying softmax, all the values lie in the range (0, 1) and they sum up to 1.
If you are interested in deeplearning and the math behind it, I would suggest you to checkout Andrew NG's course on deeplearning.
You did not mention the shape of your data so I'll be assuming the expected shape returned by datasets.MNIST.
Data shape: torch.Size([64, 1, 28, 28])
class Network(nn.Module):
def __init__(self):
super(Network, self).__init__()
self.fc1 = nn.Sequential(
nn.Linear(1*28*28, 20),
nn.ReLU())
self.fc2 = nn.Sequential(
nn.Linear(500,10),
nn.Softmax(dim = 1))
def forward(self, x):
x = x.view(x.size(0), -1)
x = self.fc1(x)
x = self.fc2(x)
return x
The first argument of nn.Linear is the size of input feature while the second is the number of units.
For self.fc1, the size of the input feature is the multiplication of your data shape except the batch size, which is 1 * 28 * 28. And as per your post the second argument should be 20 (20 units).
The shape of the output from self.fc1 (which is also the input to self.fc2) will then be (batch size, 20).
For self.fc2, the size of the input feature will be 20 while the number of units (which is also the number of digits) will be 10.

how to build a multidimensional autoencoder with pytorch

I followed this great answer for sequence autoencoder,
LSTM autoencoder always returns the average of the input sequence.
but I met some problem when I try to change the code:
question one:
Your explanation is so professional, but the problem is a little bit different from mine, I attached some code I changed from your example. My input features are 2 dimensional, and my output is same with the input.
for example:
input_x = torch.Tensor([[0.0,0.0], [0.1,0.1], [0.2,0.2], [0.3,0.3], [0.4,0.4]])
output_y = torch.Tensor([[0.0,0.0], [0.1,0.1], [0.2,0.2], [0.3,0.3], [0.4,0.4]])
the input_x and output_y are same, 5-timesteps, 2-dimensional feature.
import torch
import torch.nn as nn
import torch.optim as optim
class LSTM(nn.Module):
def __init__(self, input_dim, latent_dim, num_layers):
super(LSTM, self).__init__()
self.input_dim = input_dim
self.latent_dim = latent_dim
self.num_layers = num_layers
self.encoder = nn.LSTM(self.input_dim, self.latent_dim, self.num_layers)
# I changed here, to 40 dimesion, I think there is some problem
# self.decoder = nn.LSTM(self.latent_dim, self.input_dim, self.num_layers)
self.decoder = nn.LSTM(40, self.input_dim, self.num_layers)
def forward(self, input):
# Encode
_, (last_hidden, _) = self.encoder(input)
# It is way more general that way
encoded = last_hidden.repeat(input.shape)
# Decode
y, _ = self.decoder(encoded)
return torch.squeeze(y)
model = LSTM(input_dim=2, latent_dim=20, num_layers=1)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters())
y = torch.Tensor([[0.0,0.0], [0.1,0.1], [0.2,0.2], [0.3,0.3], [0.4,0.4]])
x = y.view(len(y), -1, 2) # I changed here
while True:
y_pred = model(x)
optimizer.zero_grad()
loss = loss_function(y_pred, y)
loss.backward()
optimizer.step()
print(y_pred)
The above code can learn very well, can you help review the code and give some instructions.
When I input 2 examples as the input to the model, the model cannot work:
for example, change the code:
y = torch.Tensor([[0.0,0.0], [0.1,0.1], [0.2,0.2], [0.3,0.3], [0.4,0.4]])
to:
y = torch.Tensor([[[0.0,0.0],[0.5,0.5]], [[0.1,0.1], [0.6,0.6]], [[0.2,0.2],[0.7,0.7]], [[0.3,0.3],[0.8,0.8]], [[0.4,0.4],[0.9,0.9]]])
When I compute the loss function, it complain some errors? can anyone help have a look
question two:
my training samples are with different length:
for example:
x1 = [[0.0,0.0], [0.1,0.1], [0.2,0.2], [0.3,0.3], [0.4,0.4]] #with 5 timesteps
x2 = [[0.5,0.5], [0.6,0.6], [0.7,0.7]] #with only 3 timesteps
How can I input these two training sample into the model at the same time for a batch training.
Recurrent N-dimensional autoencoder
First of all, LSTMs work on 1D samples, yours are 2D as it's usually used for words encoded with a single vector.
No worries though, one can flatten this 2D sample to 1D, example for your case would be:
import torch
var = torch.randn(10, 32, 100, 100)
var.reshape((10, 32, -1)) # shape: [10, 32, 100 * 100]
Please notice it's really not general, what if you were to have 3D input? Snippet belows generalizes this notion to any dimension of your samples, provided the preceding dimensions are batch_size and seq_len:
import torch
input_size = 2
var = torch.randn(10, 32, 100, 100, 35)
var.reshape(var.shape[:-input_size] + (-1,)) # shape: [10, 32, 100 * 100 * 35]
Finally, you can employ it inside neural network as follows. Look at forward method especially and constructor arguments:
import torch
class LSTM(nn.Module):
# input_dim has to be size after flattening
# For 20x20 single input it would be 400
def __init__(
self,
input_dimensionality: int,
input_dim: int,
latent_dim: int,
num_layers: int,
):
super(LSTM, self).__init__()
self.input_dimensionality: int = input_dimensionality
self.input_dim: int = input_dim # It is 1d, remember
self.latent_dim: int = latent_dim
self.num_layers: int = num_layers
self.encoder = torch.nn.LSTM(self.input_dim, self.latent_dim, self.num_layers)
# You can have any latent dim you want, just output has to be exact same size as input
# In this case, only encoder and decoder, it has to be input_dim though
self.decoder = torch.nn.LSTM(self.latent_dim, self.input_dim, self.num_layers)
def forward(self, input):
# Save original size first:
original_shape = input.shape
# Flatten 2d (or 3d or however many you specified in constructor)
input = input.reshape(input.shape[: -self.input_dimensionality] + (-1,))
# Rest goes as in my previous answer
_, (last_hidden, _) = self.encoder(input)
encoded = last_hidden.repeat(input.shape)
y, _ = self.decoder(encoded)
# You have to reshape output to what the original was
reshaped_y = y.reshape(original_shape)
return torch.squeeze(reshaped_y)
Remember you have to reshape your output in this case. It should work for any dimensions.
Batching
When it comes to batching and different length of sequences it is a little more complicated.
You have to pad each sequence in batch before pushing it through network. Usually, values with which you pad are zeros, you may configure it inside LSTM though.
You may check this link for an example. You will have to use functions like torch.nn.pack_padded_sequence and others to make it work, you may check this answer.
Oh, since PyTorch 1.1 you don't have to sort your sequences by length in order to pack them. But when it comes to this topic, grab some tutorials, should make things clearer.
Lastly: Please, separate your questions. If you perform the autoencoding with single example, move on to batching and if you have issues there, please post a new question on StackOverflow, thanks.

understanding output shape of keras Conv2DTranspose

I am having a hard time understanding the output shape of keras.layers.Conv2DTranspose
Here is the prototype:
keras.layers.Conv2DTranspose(
filters,
kernel_size,
strides=(1, 1),
padding='valid',
output_padding=None,
data_format=None,
dilation_rate=(1, 1),
activation=None,
use_bias=True,
kernel_initializer='glorot_uniform',
bias_initializer='zeros',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None
)
In the documentation (https://keras.io/layers/convolutional/), I read:
If output_padding is set to None (default), the output shape is inferred.
In the code (https://github.com/keras-team/keras/blob/master/keras/layers/convolutional.py), I read:
out_height = conv_utils.deconv_length(height,
stride_h, kernel_h,
self.padding,
out_pad_h,
self.dilation_rate[0])
out_width = conv_utils.deconv_length(width,
stride_w, kernel_w,
self.padding,
out_pad_w,
self.dilation_rate[1])
if self.data_format == 'channels_first':
output_shape = (batch_size, self.filters, out_height, out_width)
else:
output_shape = (batch_size, out_height, out_width, self.filters)
and (https://github.com/keras-team/keras/blob/master/keras/utils/conv_utils.py):
def deconv_length(dim_size, stride_size, kernel_size, padding, output_padding, dilation=1):
"""Determines output length of a transposed convolution given input length.
# Arguments
dim_size: Integer, the input length.
stride_size: Integer, the stride along the dimension of `dim_size`.
kernel_size: Integer, the kernel size along the dimension of `dim_size`.
padding: One of `"same"`, `"valid"`, `"full"`.
output_padding: Integer, amount of padding along the output dimension, can be set to `None` in which case the output length is inferred.
dilation: dilation rate, integer.
# Returns
The output length (integer).
"""
assert padding in {'same', 'valid', 'full'}
if dim_size is None:
return None
# Get the dilated kernel size
kernel_size = kernel_size + (kernel_size - 1) * (dilation - 1)
# Infer length if output padding is None, else compute the exact length
if output_padding is None:
if padding == 'valid':
dim_size = dim_size * stride_size + max(kernel_size - stride_size, 0)
elif padding == 'full':
dim_size = dim_size * stride_size - (stride_size + kernel_size - 2)
elif padding == 'same':
dim_size = dim_size * stride_size
else:
if padding == 'same':
pad = kernel_size // 2
elif padding == 'valid':
pad = 0
elif padding == 'full':
pad = kernel_size - 1
dim_size = ((dim_size - 1) * stride_size + kernel_size - 2 * pad + output_padding)
return dim_size
I understand that Conv2DTranspose is kind of a Conv2D, but reversed.
Since applying a Conv2D with kernel_size = (3, 3), strides = (10, 10) and padding = "same" to a 200x200 image will output a 20x20 image,
I assume that applying a Conv2DTranspose with kernel_size = (3, 3), strides = (10, 10) and padding = "same" to a 20x20 image will output a 200x200 image.
Also, applying a Conv2D with kernel_size = (3, 3), strides = (10, 10) and padding = "same" to a 195x195 image will also output a 20x20 image.
So, I understand that there is kind of an ambiguity on the output shape when applying a Conv2DTranspose with kernel_size = (3, 3), strides = (10, 10) and padding = "same" (user might want output to be 195x195, or 200x200, or many other compatible shapes).
I assume that "the output shape is inferred." means that a default output shape is computed according to the parameters of the layer, and I assume that there is a mechanism to specify an output shape differnet from the default one, if necessary.
This said, I do not really understand
the meaning of the "output_padding" parameter
the interactions between parameters "padding" and "output_padding"
the various formulas in the function keras.conv_utils.deconv_length
Could someone explain this?
Many thanks,
Julien
I may have found a (partial) answer.
I found it in the Pytorch documentation, which appears to be much clearer than the Keras documentation on this topic.
When applying Conv2D with a stride greater than 1 to images which dimensions are close, we get output images with the same dimensions.
For instance, when applied a Conv2D with kernel size of 3x3, stride of 7x7 and padding "same", the following image dimensions
22x22, 23x23, ..., 28x28, 22x28, 28x22, 27x24, etc. (7x7 = 49
combinations)
will ALL yield an output dimension of 4x4.
That is because output_dimension = ceiling(input_dimension / stride).
As a consequence, when applying a Conv2DTranspose with kernel size of 3x3, stride of 7x7 and padding "same", there is an ambiguity about the output dimension.
Any of the 49 possible output dimensions would be correct.
The parameter output_padding is a way to resolve the ambiguity by choosing explicitly the output dimension.
In my example, the minimum output size is 22x22, and output_padding provides a number of lines (between 0 and 6) to add at the bottom of the output image and a number of columns (between 0 and 6) to add at the right of the output image.
So I can get output_dimensions = 24x25 if I use outout_padding = (2, 3)
What I still do not understand, however, is the logic that keras uses to choose a certain output image dimension when output_padding is not specified (when it 'infers" the output shape)
A few pointers:
https://pytorch.org/docs/stable/nn.html#torch.nn.ConvTranspose2d
https://discuss.pytorch.org/t/the-output-size-of-convtranspose2d-differs-from-the-expected-output-size/1876/5
https://discuss.pytorch.org/t/question-about-the-output-padding-in-nn-convtrasnpose2d/19740
https://discuss.pytorch.org/t/what-does-output-padding-exactly-do-in-convtranspose2d/2688
So to answer my own questions:
the meaning of the "output_padding" parameter: see above
the interactions between parameters "padding" and "output_padding": these parameters are independant
the various formulas in the function keras.conv_utils.deconv_length
For now, I do not understand the part when output_padding is None;
I ignore the case when padding == 'full' (not supported by Conv2DTranspose);
The formula for padding == 'valid' seems correct (can be computed by reversing the formula of Conv2D)
The formula for padding == 'same' seems incorrect to me, in case kernel_size is even. (As a matter of fact, keras crashes when trying to build a Conv2DTranspose layer with input_dimension = 5x5, kernel_size = 2x2, stride = 7x7 and padding = 'same'. It appears to me that there is a bug in keras, I will start another thread for this topic...)
Outpadding in Conv2DTranspose is also what I am concerned about when designing an autoencoder.
Assume stride is always 1. Along the encoder path, for each convolution layer, I chose padding='valid', which means that if my input image is HXW, and the filter is sized mXn, the output of the layer will be (H-(m-1))X(W-(n-1)).
In the corresponding Con2DTranspose layer along the decoder path, if I use Theano, in order to resume the input size of its corresponding Con2D, I have to chose padding='full', and out_padding = None or 0 (no difference), which implies the input size will be expanded by [m-1, n-1] around it, that is, (m-1)/2 for top and bottom, and (n-1)/2 for left and right.
If I use tensorflow, I will have to choose padding = 'same', and out_padding = 2*((filter_size-1)//2), I think that is Keras' intended behaviour.
If stride is not 1, then you will have to calculate carefully how many output paddings are to be added.
In Conv2D out_size = floor(in_size+2*padding_size-filter_size)/stride+1)
If we choose padding = 'same', Keras will automatically set padding = (filter_size-1)/2; whilst if we choose 'valid', padding_size will be set 0, which is the convention of any N-D convolutions.
Conversely, in Con2DTranspose out_size = (in_size-1)*stride+filter_size-2*padding_size
where padding_size refers to how many pixels will actually be padded caused by 'padding' option and out_padding together. Based upon the discussion above, there is no 'full' option on tensorflow, we will have to use out_padding to resume the input size of its corresponding Con2D.
Could you try and see if it works properly and let me know, please?
So in summary, I think out_padding is used for facilitating different backends.
When output_padding=None, Keras uses the deconv_output_length method to compute the output length, which sets it to:
if padding == 'valid':
length = input_length * stride + max(filter_size - stride, 0)
elif padding == 'same':
length = input_length * stride
Now in the documentation it says that if output_padding is set, the output length will be
((input_length - 1) * stride + filter_size - 2 * padding + output_padding
So using this we can figure out what the default output_padding is.
In the padding='valid' case, padding = 0 in the above, so solving for output_padding:
output_padding = max(stride - filter_size, 0)
padding='valid'
In this case, padding = 0 in the above, so solving for output_padding:
output_padding = max(stride - filter_size, 0)
and one can check that setting this results in the same as setting it to None
padding = 'same'
This case is much more mysterious, and in fact it seems to be impossible to get the same as output_padding=None by setting it to any integer. For example with strides=2 and kernel_size=2, for an output_padding larger than 1, it gives a warning that the stride must be larger than the output padding. For anything smaller than 1 it gives a warning that the size of out_backprop doesn't match computed. So the only value that works is 1, but this results in a different output shape from None.
In fact it is not implemented by setting output_padding to some default value, it is only used to compute the output shape, which then is used in the convolution method.

3D CNN parameter calculation

I have a CNN code that was written using tensorflow library:
x_img = tf.placeholder(tf.float32)
y_label = tf.placeholder(tf.float32)
def convnet_3d(x_img, W):
conv_3d_layer = tf.nn.conv3d(x_img, W, strides=[1,1,1,1,1], padding='VALID')
return conv_3d_layer
def maxpool_3d(x_img):
maxpool_3d_layer = tf.nn.max_pool3d(x_img, ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='VALID')
return maxpool_3d_layer
def convolutional_neural_network(x_img):
weights = {'W_conv1_layer':tf.Variable(tf.random_normal([3,3,3,1,32])),
'W_conv2_layer':tf.Variable(tf.random_normal([3,3,3,32,64])),
'W_fc_layer':tf.Variable(tf.random_normal([409600,1024])),
'W_out_layer':tf.Variable(tf.random_normal([1024, num_classes]))}
biases = {'b_conv1_layer':tf.Variable(tf.random_normal([32])),
'b_conv2_layer':tf.Variable(tf.random_normal([64])),
'b_fc_layer':tf.Variable(tf.random_normal([1024])),
'b_out_layer':tf.Variable(tf.random_normal([num_classes]))}
x_img = tf.reshape(x_img, shape=[-1, img_x, img_y, img_z, 1])
conv1_layer = tf.nn.relu(convnet_3d(x_img, weights['W_conv1_layer']) + biases['b_conv1_layer'])
conv1_layer = maxpool_3d(conv1_layer)
conv2_layer = tf.nn.relu(convnet_3d(conv1_layer, weights['W_conv2_layer']) + biases['b_conv2_layer'])
conv2_layer = maxpool_3d(conv2_layer)
fc_layer = tf.reshape(conv2_layer,[-1, 409600])
fc_layer = tf.nn.relu(tf.matmul(fc_layer, weights['W_fc_layer'])+biases['b_fc_layer'])
fc_layer = tf.nn.dropout(fc_layer, keep_rate)
output_layer = tf.matmul(fc_layer, weights['W_out_layer'])+biases['b_out_layer']
return output_layer
my input image x_img is 25x25x25(3d image), I have some questions about the code:
1- is [3,3,3,1,32] in 'W_conv1_layer' means [width x height x depth x channel x number of filters]?
2- in 'W_conv2_layer' weights are [3,3,3,32,64], why the output is 64? I know that 3x3x3 is filter size and 32 is input come from first layer.
3- in 'W_fc_layer' weights are [409600,1024], 1024 is number of nodes in FC layer, but where this magic number '409600' come from?
4- before the image get into the conv layers why we need to reshape the image
x_img = tf.reshape(x_img, shape=[-1, img_x, img_y, img_z, 1])
All the answers can be found in the official doc of conv3d.
The weights should be [filter_depth, filter_height, filter_width, in_channels, out_channels]
The numbers 32 and 64 are chosen because it works simply they are just hyperparameters
409600 comes from reshaping the output of maxpool3d (it is probably a mistake the real size should be 4096 see comments)
Because tensorflow expects certain layouts for its input
Your should try implementing a simple convnet on images before moving to more complicated stuff.

Resources