Change input shape dimensions for ResNet model (pytorch) - pytorch

I want to feed my 3,320,320 pictures in an existing ResNet model. The model actually expects input of size 3,32,32. As I am afraid of loosing information I don't simply want to resize my pictures.
What is the best way to preprocess my images, so that they are able to run on the ResNet34?
Should I add additional layers in the forward method of ResNet? If yes, what would be a suitable combination in my case?
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_fitmodule import FitModule
from torch.autograd import Variable
import numpy as np
def conv3x3(in_planes, out_planes, stride=1):
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
class BasicBlock(FitModule):
expansion = 1
def __init__(self, in_planes, planes, stride=1):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(in_planes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.shortcut = nn.Sequential()
if stride != 1 or in_planes != self.expansion * planes:
self.shortcut = nn.Sequential(
nn.Conv2d(in_planes, self.expansion * planes,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(self.expansion * planes)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
out += self.shortcut(x)
out = F.relu(out)
return out
class ResNet(FitModule):
def __init__(self, block, num_blocks, num_classes=10):
super(ResNet, self).__init__()
self.in_planes = 64
self.conv1 = conv3x3(3, 64)
self.bn1 = nn.BatchNorm2d(64)
self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
self.linear = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self, block, planes, num_blocks, stride):
strides = [stride] + [1] * (num_blocks - 1)
layers = []
for stride in strides:
layers.append(block(self.in_planes, planes, stride))
self.in_planes = planes * block.expansion
return nn.Sequential(*layers)
def forward(self, x): # add additional layers here?
x = x.float()
out = F.relu(self.bn1(self.conv1(x).float()).float())
out = self.layer1(out)
out = self.layer2(out)
out = self.layer3(out)
out = self.layer4(out)
out = F.avg_pool2d(out, 4)
out = out.view(out.size(0), -1)
out = self.linear(out)
return out
def ResNet34():
return ResNet(BasicBlock, [3, 4, 6, 3])
Thanks plenty!

If you change your avg_pool operation to 'AdaptiveAvgPool2d' your model will work for any image size.
However with your current setup, your 320x320 images would be 40x40 going into the pooling stage, which is a large feature map to pool over. Consider adding more conv layers.


RuntimeError: Given transposed=1, weight of size [64, 1, 4], expected input[2, 128, 74] to have 64 channels, but got 128 channels instead > unet error

I’m trying to implement very very simple UNET from this code.
class unet(nn.Module):
def __init__(self, ngf=64, norm_layer=nn.BatchNorm1d):
super(unet, self).__init__()
# construct unet structure
unet_block = skipconnection_block(ngf*2, ngf, submodule=None, norm_layer=norm_layer, inner=True)
unet_block = skipconnection_block(ngf, 1, submodule=unet_block, norm_layer=norm_layer, outer=True)
self.model = unet_block
def forward(self, x):
self.unet = nn.Sequential(self.model)
x = self.unet(x)
return x
class skipconnection_block(nn.Module):
def __init__(self, inner_nc, outer_nc, submodule=None, outer=False, inner=False, norm_layer=nn.BatchNorm1d):
super(skipconnection_block, self).__init__()
self.outer = outer
downrelu = nn.LeakyReLU(0.2, True)
uprelu = nn.ReLU(True)
if inner:
downconv_0 = nn.Conv1d(in_channels=outer_nc, out_channels=inner_nc, kernel_size=4, stride=2, padding=0)
upconv_0 = nn.ConvTranspose1d(in_channels=inner_nc, out_channels=outer_nc, kernel_size=4, stride=2, padding=0)
down = [downrelu, downconv_0]
up = [uprelu, upconv_0, norm_layer(outer_nc)]
model = down + up
elif outer:
downconv_1 = nn.Conv1d(in_channels=outer_nc, out_channels=inner_nc, kernel_size=4, stride=2, padding=0)
upconv_1 = nn.ConvTranspose1d(in_channels=inner_nc, out_channels=outer_nc, kernel_size=4, stride=2, padding=0)
down = [downrelu, downconv_1, norm_layer(inner_nc)]
up = [uprelu, upconv_1, norm_layer(outer_nc)]
model = down + [submodule] + up
self.model = nn.Sequential(*model)
def forward(self, x):
if self.outer:
return self.model(x)
return[x, self.model(x)], 1)
and when i tried like this for checking summary architecture of unet,
unet = load_skip_model()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(torchsummary.summary(unet, (1, 150)))
i got this result below.
RuntimeError: Given transposed=1, weight of size [64, 1, 4], expected input[2, 128, 74] to have 64 channels, but got 128 channels instead
I dont understand why i got this.
can anyone please please give some help…?? thank uu
As a rule of thumb, when you define a neural network like so, first check your layer in/out dims one by one.
Here for example, your first conv1d does not receive the expected dimension as input which cause you error.

Mobilenet as feature backbone to use Resnet18 pretrained model using Pytorch

I have a resnet18 pretrained model, now I want to change as feature backbone into MobileNet using pytorch , please suggest any optimal way is available or not to implement this.
In the below code I want to use backbone mobilenet instead of resnet as feature extraction
import torch
from model.backbone import resnet
import numpy as np
class conv_bn_relu(torch.nn.Module):
def init(self,in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1,bias=False):
self.conv = torch.nn.Conv2d(in_channels,out_channels, kernel_size,
stride = stride, padding = padding, dilation = dilation,bias = bias) = torch.nn.BatchNorm2d(out_channels)
self.relu = torch.nn.ReLU()
def forward(self,x):
x = self.conv(x)
x =
x = self.relu(x)
return x
class parsingNet(torch.nn.Module):
def init(self, size=(288, 800), pretrained=True, backbone='50', cls_dim=(37, 10, 4), use_aux=False):
super(parsingNet, self).init()
self.size = size
self.w = size[0]
self.h = size[1]
self.cls_dim = cls_dim # (num_gridding, num_cls_per_lane, num_of_lanes)
# num_cls_per_lane is the number of row anchors
self.use_aux = use_aux
self.total_dim =
# input : nchw,
# output: (w+1) * sample_rows * 4
self.model = resnet(backbone, pretrained=pretrained)
if self.use_aux:
self.aux_header2 = torch.nn.Sequential(
conv_bn_relu(128, 128, kernel_size=3, stride=1, padding=1) if backbone in ['34','18'] else conv_bn_relu(512, 128, kernel_size=3, stride=1, padding=1),
self.aux_header3 = torch.nn.Sequential(
conv_bn_relu(256, 128, kernel_size=3, stride=1, padding=1) if backbone in ['34','18'] else conv_bn_relu(1024, 128, kernel_size=3, stride=1, padding=1),
self.aux_header4 = torch.nn.Sequential(
conv_bn_relu(512, 128, kernel_size=3, stride=1, padding=1) if backbone in ['34','18'] else conv_bn_relu(2048, 128, kernel_size=3, stride=1, padding=1),
self.aux_combine = torch.nn.Sequential(
conv_bn_relu(384, 256, 3,padding=2,dilation=2),
conv_bn_relu(256, 128, 3,padding=2,dilation=2),
conv_bn_relu(128, 128, 3,padding=2,dilation=2),
conv_bn_relu(128, 128, 3,padding=4,dilation=4),
torch.nn.Conv2d(128, cls_dim[-1] + 1,1)
# output : n, num_of_lanes+1, h, w
self.cls = torch.nn.Sequential(
torch.nn.Linear(1800, 2048),
torch.nn.Linear(2048, self.total_dim),
self.pool = torch.nn.Conv2d(512,8,1) if backbone in ['34','18'] else torch.nn.Conv2d(2048,8,1)
# 1/32,2048 channel
# 288,800 -> 9,40,2048
# (w+1) * sample_rows * 4
# 37 * 10 * 4
def forward(self, x):
# n c h w - > n 2048 sh sw
# -> n 2048
x2,x3,fea = self.model(x)
if self.use_aux:
x2 = self.aux_header2(x2)
x3 = self.aux_header3(x3)
x3 = torch.nn.functional.interpolate(x3,scale_factor = 2,mode='bilinear')
x4 = self.aux_header4(fea)
x4 = torch.nn.functional.interpolate(x4,scale_factor = 4,mode='bilinear')
aux_seg =[x2,x3,x4],dim=1)
aux_seg = self.aux_combine(aux_seg)
aux_seg = None
fea = self.pool(fea).view(-1, 1800)
group_cls = self.cls(fea).view(-1, *self.cls_dim)
if self.use_aux:
return group_cls, aux_seg
return group_cls
def initialize_weights(*models):
for model in models:
def real_init_weights(m):
if isinstance(m, list):
for mini_m in m:
if isinstance(m, torch.nn.Conv2d):
torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
if m.bias is not None:
torch.nn.init.constant_(m.bias, 0)
elif isinstance(m, torch.nn.Linear):, std=0.01)
elif isinstance(m, torch.nn.BatchNorm2d):
torch.nn.init.constant_(m.weight, 1)
torch.nn.init.constant_(m.bias, 0)
elif isinstance(m,torch.nn.Module):
for mini_m in m.children():
print('unkonwn module', m)

How can I connect between layers in two network by python and pytorch?

For speaker verification from speech, I want to impellent bellow figure. I have some question please guide me.
First I am defining a class for first 4 layers (Code model #1). Then I am using ECAPA-TDNN code from github (code model #2 – link: ). I don’t know how can I connect between two models 1 and 2 or in the other hand, how can I impelement bellow figure?
Model #1 : The first 4 layer code:
import torch.nn as nn
class NeuralNetwork(nn.Module):
def __init__(self, num_class):
super(NeuralNetwork, self).__init__()
self.conv1 = nn.Sequential(nn.Conv2d(1, 80, T),
MaxPool2d(kernel_size=2, stride=2))
self.conv2 = nn.Sequential(nn.Conv2d(128, 40, T),
MaxPool2d(kernel_size=2, stride=1))
self.conv3 = nn.Sequential(nn.Conv2d(128, 40, T),
MaxPool2d(kernel_size=2, stride=1))
self.conv4 = nn.Sequential(nn.Conv2d(128, 40, T),
MaxPool2d(kernel_size=2, stride=1))
self.conv5 = nn.Sequential(nn.Conv2d(128, 20, T = flatten),
MaxPool2d(kernel_size=2, stride=2))
Model #2 : ECAPA-TDNN form git-hub:
import math, torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
class ECAPA_TDNN(nn.Module):
def __init__(self, C):
super(ECAPA_TDNN, self).__init__()
self.torchfbank = torch.nn.Sequential(
torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
self.specaug = FbankAug() # Spec augmentation
self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
self.relu = nn.ReLU()
self.bn1 = nn.BatchNorm1d(C)
self.layer1 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8)
self.layer2 = Bottle2neck(C, C, kernel_size=3, dilation=3, scale=8)
self.layer3 = Bottle2neck(C, C, kernel_size=3, dilation=4, scale=8)
# I fixed the shape of the output from MFA layer, that is close to the setting from ECAPA paper.
self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1)
self.attention = nn.Sequential(
nn.Conv1d(4608, 256, kernel_size=1),
nn.Tanh(), # I add this layer
nn.Conv1d(256, 1536, kernel_size=1),
self.bn5 = nn.BatchNorm1d(3072)
self.fc6 = nn.Linear(3072, 192)
self.bn6 = nn.BatchNorm1d(192)
def forward(self, x, aug):
with torch.no_grad():
x = self.torchfbank(x)+1e-6
x = x.log()
x = x - torch.mean(x, dim=-1, keepdim=True)
if aug == True:
x = self.specaug(x)
x = self.conv1(x)
x = self.relu(x)
x = self.bn1(x)
x1 = self.layer1(x)
x2 = self.layer2(x+x1)
x3 = self.layer3(x+x1+x2)
x = self.layer4(,x2,x3),dim=1))
x = self.relu(x)
t = x.size()[-1]
global_x =,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1)
w = self.attention(global_x)
mu = torch.sum(x * w, dim=2)
sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) )
x =,sg),1)
x = self.bn5(x)
x = self.fc6(x)
x = self.bn6(x)
return x
2. I am using log Mel filterbank energy for feature extraction. My dimension isn’t 80. How can I define 80 for dimension?(by batch-size? How?)
3. C is constant (128) but how can I calculate T? for one speech signal I used bellow code But for all speech in my dataset, second code didn’t work. Or T is another thing?
# Number of Channel
ch = file_info.channels('/home/narges/Project-SV-1400.11.20/S.V-Code/dataset/train/id10292/FXxcN18rX7c/00001.wav')
print (ch)
# Number of Frame in One Audio File
frame_one_wav = audiosegment.from_file('/home/narges/Project-SV-1400.11.20/S.V-Code/dataset/train/id10292/FXxcN18rX7c/00001.wav')
print (frame_one_wav)
# All Frame
all_frame = []
for i in range (number_of_files_train):
wavs_info = rate_train, sig_train =[i])
frame_wav = audiosegment.from_file(wavs_info)

How to obtain sequence of submodules from a pytorch module?

For a pytorch module, I suppose I could use .named_children, .named_modules, etc. to obtain a list of the submodules. However, I suppose the list is not given in order, right? An example:
In [19]: import transformers
In [20]: model = transformers.DistilBertForSequenceClassification.from_pretrained('distilb
...: ert-base-cased')
In [21]: [name for name, _ in model.named_children()]
Out[21]: ['distilbert', 'pre_classifier', 'classifier', 'dropout']
The order of .named_children() in the above model is given as distilbert, pre_classifier, classifier, and dropout. However, if you examine the code, it is evident that dropout happens before classifier. So how do I get the order of these submodules?
In Pytorch, the results of print(model) or .named_children(), etc are listed based on the order they are declared in __init__ of the model's class e.g.
Case 1
class Model(nn.Module):
def __init__(self):
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)
self.conv2_drop = nn.Dropout2d()
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, p=0.6)
x = self.fc2(x)
return F.log_softmax(x, dim=1)
model = Model()
[name for name, _ in model.named_children()]
# output
['conv1', 'conv2', 'fc1', 'fc2', 'conv2_drop']
Case 2
Changed order of fc1 and fc2 layers in constructor.
class Model(nn.Module):
def __init__(self):
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.fc2 = nn.Linear(50, 10)
self.fc1 = nn.Linear(320, 50)
self.conv2_drop = nn.Dropout2d()
def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, p=0.6)
x = self.fc2(x)
return F.log_softmax(x, dim=1)
model = Model()
[name for name, _ in model.named_children()]
# output
['conv1', 'conv2', 'fc2', 'fc1', 'conv2_drop']
That's why classifier is printed before dropout as it's declared so in constructor:
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
self.distilbert = DistilBertModel(config)
self.pre_classifier = nn.Linear(config.dim, config.dim)
self.classifier = nn.Linear(config.dim, config.num_labels)
self.dropout = nn.Dropout(config.seq_classif_dropout)
Nevertheless, you can play with model's submodules using .modules(), etc. but they'll be listed only in the order they are declared in __init__. If you only want to print structure based on forward method, you may try using pytorch-summary.

Variation between custom convolution vs pytorch conv2d results?

I am trying to build a custom convolution using the method shown in pytorch unfold function
The custom convolution function is given below:
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
import math
from torch.nn.modules.utils import _pair
class customConv(nn.Module):
def __init__(self, n_channels, out_channels, kernel_size, dilation=1, padding=0, stride=1, bias=True):
super(customConv, self).__init__()
self.kernel_size = _pair(kernel_size)
self.out_channels = out_channels
self.dilation = _pair(dilation)
self.padding = _pair(padding)
self.stride = _pair(stride)
self.n_channels = n_channels
self.weight = Parameter(torch.Tensor(self.out_channels, self.n_channels, self.kernel_size[0], self.kernel_size[1]))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
self.register_parameter('bias', None)
def reset_parameters(self):
n = self.n_channels
for k in self.kernel_size:
n *= k
stdv = 1. / math.sqrt(n), stdv)
if self.bias is not None:, stdv)
def forward(self, input_):
hout = ((input_.shape[2] + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0]-1)-1)//self.stride[0])+1
wout = ((input_.shape[3] + 2 * self.padding[1] - self.dilation[1] * (self.kernel_size[1]-1)-1)//self.stride[1])+1
inputUnfolded = F.unfold(input_, kernel_size=self.kernel_size, padding=self.padding, dilation=self.dilation, stride=self.stride)
if self.bias:
convolvedOutput = (inputUnfolded.transpose(1, 2).matmul(
self.weight.view(self.weight.size(0), -1).t()).transpose(1, 2)) + self.bias.view(-1, 1)
convolvedOutput = (inputUnfolded.transpose(1, 2).matmul(self.weight.view(self.weight.size(0), -1).t()).transpose(1, 2))
convolutionReconstruction = convolvedOutput.view(input_.shape[0], self.out_channels, hout, wout)
return convolutionReconstruction
But when I try comparing it with the pytorch implementation, I do not get the exact value. The code to check for difference is provided below
import torch
from torch import nn
from customConvolve import customConv
input = torch.randn (10,3,64,64)
conv1 = nn.Conv2d(input.shape[1],5, kernel_size=3, dilation=1, padding=1, stride=1 ,bias = False)
conv1_output = conv1(input)
conv2 = customConv(n_channels=input.shape[1], out_channels=5, kernel_size=3, dilation=1, stride =1, padding = 1, bias = False)
conv2_output = conv2(input)
print(torch.equal(conv1_output, conv2_output))
I would like to know why the variation exists and how to solve this?
Thank you.
