Why are there extra parameters in my simple CNN? - PyTorch

Here is my problem: when I count the parameters in my first block, I see 36,928 parameters (which is what I expect). But when I use this block to construct a model in another nn.Module class, there are 1,792 extra parameters and I can't figure out where they come from.
I put some code below to illustrate.
class Conv2dBlock(torch.nn.Module):
    def __init__(self, in_filters, out_filters, kernel_size=3):
        super(Conv2dBlock, self).__init__()
        self.conv2d_seq = torch.nn.Sequential()
        for k in range(2):
            self.conv2d_seq.append(torch.nn.Conv2d(in_channels=in_filters, out_channels=out_filters, kernel_size=kernel_size, padding='same'))
            self.conv2d_seq.append(torch.nn.ReLU())
            in_filters = out_filters

    def forward(self, input):
        out = self.conv2d_seq(input)
        return out
And then, I use this block in another nn.Module:
class EncoderBlock(torch.nn.Module):
    def __init__(self):
        super(EncoderBlock, self).__init__()
        self.conv2d = Conv2dBlock(3, 64)
        self.maxpool = torch.nn.MaxPool2d(kernel_size=2)

    def forward(self, input):
        x = self.conv2d(input)
        p = self.maxpool(x)
        out = torch.nn.functional.dropout(p, 0.3)
        return x, out
And finally:
class UNet_model(torch.nn.Module):
    def __init__(self):
        super(UNet_model, self).__init__()
        self.encoder_block1 = EncoderBlock()

    def forward(self, input):
        p1 = self.encoder_block1(input)
        # I removed useless code
        return p1

model = UNet_model()
summary(model, (3, 128, 128))
This last class constructs a model with 38,720 parameters, instead of 36,928. It seems there is an extra convolutional layer ((3, 64, (3, 3)) = 1,792 params) applied twice to the input... I don't understand.
Can somebody take a look?
Thanks!

First of all, torch.nn.Sequential only supports the append method in recent PyTorch versions; if yours doesn't, change it to add_module, like this:
for k in range(2):
    self.conv2d_seq.add_module(f"conv_{k}", torch.nn.Conv2d(in_channels=in_filters, out_channels=out_filters, kernel_size=kernel_size, padding='same'))
    self.conv2d_seq.add_module(f"relu_{k}", torch.nn.ReLU())
    in_filters = out_filters
Second, if you run torchinfo summary on the initial block you will see:
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
Conv2dBlock                              [1, 64, 64, 64]           --
├─Sequential: 1-1                        [1, 64, 64, 64]           --
│    └─Conv2d: 2-1                       [1, 64, 64, 64]           1,792
│    └─ReLU: 2-2                         [1, 64, 64, 64]           --
│    └─Conv2d: 2-3                       [1, 64, 64, 64]           36,928
│    └─ReLU: 2-4                         [1, 64, 64, 64]           --
==========================================================================================
Total params: 38,720
Trainable params: 38,720
Non-trainable params: 0
Total mult-adds (M): 158.60
==========================================================================================
Input size (MB): 0.05
Forward/backward pass size (MB): 4.19
Params size (MB): 0.15
Estimated Total Size (MB): 4.40
==========================================================================================
So you can see that you have two conv layers (1,792 + 36,928 = 38,720 params), because you specified two layers in your for loop: for k in range(2). The first conv maps 3 → 64 channels, the second 64 → 64; the 1,792 "extra" parameters are simply that first 3 → 64 convolution.
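If you want to double-check this yourself, here is a minimal sketch (not part of the original post) that counts the parameters of the Conv2dBlock defined above, layer by layer:

import torch

block = Conv2dBlock(3, 64)
for name, p in block.named_parameters():
    print(name, tuple(p.shape), p.numel())

# First conv (3 -> 64):   3 * 64 * 3 * 3 + 64  =  1,792
# Second conv (64 -> 64): 64 * 64 * 3 * 3 + 64 = 36,928
print(sum(p.numel() for p in block.parameters()))  # 38720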

Related

BATCHNORM toward an axis in pytorch [duplicate]

I have a tiny sample CNN implemented in both Keras and PyTorch. When I print the summary of both networks, the total number of trainable parameters is the same, but the total number of parameters and the number of parameters for Batch Normalization don't match.
Here is the CNN implementation in Keras:
inputs = Input(shape=(64, 64, 1))  # Channels last: (NHWC)
model = Conv2D(filters=32, kernel_size=(3, 3), padding='SAME', activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 1))(inputs)
model = BatchNormalization(momentum=0.15, axis=-1)(model)
model = Flatten()(model)
dense = Dense(100, activation = "relu")(model)
head_root = Dense(10, activation = 'softmax')(dense)
And the summary printed for above model is:
Model: "model_8"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_9 (InputLayer) (None, 64, 64, 1) 0
_________________________________________________________________
conv2d_10 (Conv2D) (None, 64, 64, 32) 320
_________________________________________________________________
batch_normalization_2 (Batch (None, 64, 64, 32) 128
_________________________________________________________________
flatten_3 (Flatten) (None, 131072) 0
_________________________________________________________________
dense_11 (Dense) (None, 100) 13107300
_________________________________________________________________
dense_12 (Dense) (None, 10) 1010
=================================================================
Total params: 13,108,758
Trainable params: 13,108,694
Non-trainable params: 64
_________________________________________________________________
Here's the implementation of the same model architecture in PyTorch:
# Image format: channels first (NCHW) in PyTorch
class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), padding=1),
            nn.ReLU(True),
            nn.BatchNorm2d(num_features=32),
        )
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=131072, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=10)

    def forward(self, x):
        output = self.layer1(x)
        output = self.flatten(output)
        output = self.fc1(output)
        output = self.fc2(output)
        return output
And following is the output of summary of the above model:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [-1, 32, 64, 64]             320
              ReLU-2           [-1, 32, 64, 64]               0
       BatchNorm2d-3           [-1, 32, 64, 64]              64
           Flatten-4               [-1, 131072]               0
            Linear-5                  [-1, 100]      13,107,300
            Linear-6                   [-1, 10]           1,010
================================================================
Total params: 13,108,694
Trainable params: 13,108,694
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 4.00
Params size (MB): 50.01
Estimated Total Size (MB): 54.02
----------------------------------------------------------------
As you can see in the above results, Batch Normalization in Keras has more parameters than in PyTorch (2x to be exact). So what's the difference between the above CNN architectures? If they are equivalent, what am I missing here?
Keras treats as parameters (weights) many things that will be "saved/loaded" in the layer.
While both implementations naturally have the accumulated "mean" and "variance" of the batches, these values are not trainable with backpropagation.
Nevertheless, these values are updated every batch, and Keras treats them as non-trainable weights, while PyTorch simply hides them. The term "non-trainable" here means "not trainable by backpropagation", but doesn't mean the values are frozen.
In total there are 4 groups of "weights" for a BatchNormalization layer, considering the selected axis (default = -1, size = 32 for your layer):
scale (32) - trainable
offset (32) - trainable
accumulated means (32) - non-trainable, but updated every batch
accumulated variance (32) - non-trainable, but updated every batch
The advantage of having it like this in Keras is that when you save the layer, you also automatically save the mean and variance values, the same way you save all the other weights in the layer. And when you load the layer, these weights are loaded together.
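In PyTorch you can see this split for yourself: the running statistics are registered as buffers rather than parameters, so they never show up in a parameter count, but they are still part of the state_dict. A small sketch (mine, not from the original question):

import torch.nn as nn

bn = nn.BatchNorm2d(num_features=32)

print(sum(p.numel() for p in bn.parameters()))  # 64 -> scale (32) + offset (32)
print({name: tuple(b.shape) for name, b in bn.named_buffers()})
# {'running_mean': (32,), 'running_var': (32,), 'num_batches_tracked': ()}

# state_dict() contains parameters *and* buffers, so the running statistics are
# still saved/loaded with the model, much like Keras' non-trainable weights.
print(sorted(bn.state_dict().keys()))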

Given groups=1, weight of size [32, 3, 3, 3], expected input[1, 1, 32, 340] to have 3 channels, but got 1 channels instead

This is the question:
Before we define the model, we define the size of our alphabet. Our alphabet consists of lowercase English letters, and additionally a special character used for space between symbols or before and after the word. For the first part of this assignment, we don't need that extra character.
Our end goal is to learn to transcribe words of arbitrary length. However, first, we pre-train our simple convolutional neural net to recognize single characters. In order to be able to use the same model for one character and for entire words, we are going to design the model in a way that makes sure that the output size for one character (or when the input image size is 32x18) is 1x27, and Kx27 whenever the input image is wider. K here will depend on the particular architecture of the network, and is affected by strides and poolings, among other things. A little more formally, our model f_θ, for an input image x, gives output energies l = f_θ(x). If x ∈ ℝ^(32×18), then l ∈ ℝ^(1×27). If x ∈ ℝ^(32×100), for example, our model may output l ∈ ℝ^(10×27), where l_i corresponds to a particular window in x, for example from x_{0, 9i} to x_{32, 9i+18} (again, this will depend on the particular architecture).
The code:
# constants for the total number of classes and for the special extra character for empty space
ALPHABET_SIZE = 27  # extra character for space in between
BETWEEN = 26

print(alphabet.shape)  # RETURNS: torch.Size([32, 340])
My CNN Block:
import torch
from torch import nn
import torch.nn.functional as F

"""
Remember basics:
1. Bigger strides = less overlap
2. More filters = more features

Image shape = 32, 18
Alphabet shape = 32, 340
"""

class SimpleNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn_block = torch.nn.Sequential(
            nn.Conv2d(3, 32, 3),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, 3),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, 3),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, 3),
            nn.BatchNorm2d(64),
            nn.Conv2d(64, 64, 3),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(2)
        )

    def forward(self, x):
        x = self.cnn_block(x)
        # after applying cnn_block, x.shape should be:
        # batch_size, alphabet_size, 1, width
        return x[:, :, 0, :].permute(0, 2, 1)

model = SimpleNet()
alphabet_energies = model(alphabet.view(1, 1, *alphabet.shape))
import matplotlib.pyplot as plt

def plot_energies(ce):
    fig = plt.figure(dpi=200)
    ax = plt.axes()
    im = ax.imshow(ce.cpu().T)
    ax.set_xlabel('window locations →')
    ax.set_ylabel('← classes')
    ax.xaxis.set_label_position('top')
    ax.set_xticks([])
    ax.set_yticks([])
    cax = fig.add_axes([ax.get_position().x1 + 0.01, ax.get_position().y0, 0.02, ax.get_position().height])
    plt.colorbar(im, cax=cax)

plot_energies(alphabet_energies[0].detach())
I get the error in the title at alphabet_energies = model(alphabet.view(1, 1, *alphabet.shape))
Any help would be appreciated.
You should begin by replacing nn.Conv2d(3, 32, 3) with nn.Conv2d(1, 32, 3).
Your model begins with a conv2d from 3 channels to 32, but your input image has only 1 channel (a greyscale image).
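As a quick sanity check (a sketch, reusing the SimpleNet class from the question), you can patch the first layer and feed a dummy greyscale strip of the same size as alphabet:

import torch
from torch import nn

model = SimpleNet()
model.cnn_block[0] = nn.Conv2d(1, 32, 3)   # was nn.Conv2d(3, 32, 3): the input is greyscale

dummy = torch.randn(1, 1, 32, 340)         # (batch, channels, H, W), like alphabet.view(1, 1, 32, 340)
print(model(dummy).shape)                  # runs without the channel-mismatch error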

How to generate an onnx file with linear layers using Pytorch

I want to create a network based on the vgg16 network, but adding linear layers (Gemm) just after the conv2d layers, for normalization purposes.
After that, I want to export the network in an ONNX file.
The first part seems to work: I took the PyTorch code for generating the vgg16 and modified it as follows:
import torch.nn as nn

class VGG(nn.Module):
    def __init__(self, features, num_classes=8, init_weights=True):
        super(VGG, self).__init__()
        self.features = features
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.Linear(4096, 4096),  # New shift layer
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.Linear(4096, 4096),  # New shift layer
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 8),
            nn.Linear(8, 8),  # New shift layer
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

def make_layers(cfg, batch_norm=False):
    layers = []
    in_channels = 3
    n = 224
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            n = int(n / 2)
        elif v == 'B':
            layers += [nn.AdaptiveAvgPool2d(n)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            linear = nn.Linear(n, n, True)
            if batch_norm:
                layers += [conv2d, linear, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
            else:
                layers += [conv2d, linear, nn.ReLU(inplace=True)]
            in_channels = v
    return nn.Sequential(*layers)

cfg = {'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M', 'B']}

def vgg16(**kwargs):
    """VGG 16-layer model (configuration "D")"""
    model = VGG(make_layers(cfg['D']), **kwargs)
    return model
But when I insert the weights and export to ONNX, I see that my linear layers are not exported as Gemm but as {Transpose + MatMul + Add}.
The Transpose part is for the weights matrix and the Add part is for the biases (which are all 0).
Am I wrong to think this is possible? Is there a way to get a real Gemm layer here, or another way to do this normalization (which simply multiplies all outputs by a single value)?
The input to nn.Linear here is a 4-D tensor, so torch exports it as {Transpose, MatMul, Add}. Only when the input is 2-D will the Gemm op be exported.
You can look at the PyTorch source code for more information.
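For illustration, here is a small sketch (mine, not from the answer) that shows the difference on a single nn.Linear; the exact set of exported ops can vary with the PyTorch and opset version:

import torch
import torch.nn as nn
import onnx

lin = nn.Linear(16, 16)

# 2-D input: the exporter can fold the layer into a single Gemm node.
torch.onnx.export(lin, torch.randn(4, 16), "lin2d.onnx")
print([n.op_type for n in onnx.load("lin2d.onnx").graph.node])

# 4-D input (as inside the modified VGG features): typically MatMul (+ Add) instead.
torch.onnx.export(lin, torch.randn(1, 3, 4, 16), "lin4d.onnx")
print([n.op_type for n in onnx.load("lin4d.onnx").graph.node])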

Convert from Keras to Pytorch - conv2d

I am trying to convert the following Keras code into PyTorch.
tf.keras.Sequential([
    Conv2D(128, 1, activation=tf.nn.relu),
    Conv2D(self.channel_n, 1, activation=None),
])
When creating the model summary with self.channel_n=16, I get the following summary.
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (1, 3, 3, 128) 6272
_________________________________________________________________
conv2d_1 (Conv2D) (1, 3, 3, 16) 2064
=================================================================
Total params: 8,336
Trainable params: 8,336
Non-trainable params: 0
How would one convert?
I have attempted it as such:
import torch
from torch import nn

class CellCA(nn.Module):
    def __init__(self, channels, dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels=channels, out_channels=dim, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=dim, out_channels=channels, kernel_size=1),
        )

    def forward(self, x):
        return self.net(x)
However, I get 4,240 params.
The attempt above is correct if you configure the input channels correctly (48 in this case): the Keras model maps 48 → 128 → 16 channels, not channels → 128 → channels.
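A sketch of how that could look, with in_channels and out_channels configured separately (the 48 is inferred from the 6,272-parameter first Conv2D: 48·128 + 128 = 6,272):

import torch
from torch import nn

class CellCA(nn.Module):
    def __init__(self, in_channels=48, out_channels=16, dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(in_channels, dim, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(dim, out_channels, kernel_size=1),
        )

    def forward(self, x):
        return self.net(x)

print(sum(p.numel() for p in CellCA().parameters()))
# (48*128 + 128) + (128*16 + 16) = 6,272 + 2,064 = 8,336, matching the Keras summary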

PyTorch custom DataLoader dimension issues for CNN

I have written a custom Dataset and DataLoader for a PyTorch CNN project. Here is the relevant code for the dataset
class MyDataset(Dataset):
    def __init__(self):
        pass

    def __len__(self):
        return COUNT

    def __getitem__(self, idx):
        x, y = X[idx], Y[idx]
        x = image_augment(x)  # custom func to resize image to 32x32
        return x, y
The shape of each training x is [4, 32, 32, 3].
And here is my Net code, taken directly from this PyTorch example.
class Net(nn.Module):
    def __init__(self, nc):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, nc)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
When I try to train this net on my data from my DataLoader, I get the error Given groups=1, weight of size [6, 3, 5, 5], expected input[4, 32, 32, 3] to have 3 channels, but got 200 channels instead. It seems to me my issue is with the shape of the data coming from my DataLoader. I tried x.view(4, 3, 32, 32), but then I got an error saying I couldn't use Conv2d on a ByteTensor. I'm a little lost here and would really appreciate any help. Thanks!
I got it eventually. I had to do x = x.view(x.shape[0], 3, self.img_height, self.img_width).type('torch.FloatTensor'), for example. This makes the swap from [4, 32, 32, 3] to [4, 3, 32, 32].
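A short sketch of the same idea; note that permute() actually reorders the dimensions, whereas view() only reinterprets the memory layout, so permute (plus a float cast for Conv2d) is the safer way to go from NHWC to NCHW:

import torch

x = torch.randint(0, 256, (4, 32, 32, 3), dtype=torch.uint8)  # NHWC batch, e.g. raw image bytes
x = x.permute(0, 3, 1, 2).contiguous().float()                # -> [4, 3, 32, 32], float32 for Conv2d
print(x.shape, x.dtype)  # torch.Size([4, 3, 32, 32]) torch.float32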

Resources