How to reshape the last layer of a PyTorch CNN model while doing transfer learning from Keras

I am trying to replicate a Keras architecture in PyTorch (I am new to PyTorch).
Here is the Keras architecture:
base_model = InceptionV3(weights='imagenet', include_top=False)
x = base_model.output
x = Dense(512, activation='relu')(x)
predictions = Dense(49*6,activation='sigmoid')(x)
reshape=Reshape((49,6))(predictions)
model = Model(inputs=base_model.input, outputs=reshape)
for layer in base_model.layers:
    layer.trainable = False
I want to reshape the last layer of my network. I have implemented transfer learning:
model = models.inception_v3(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
num_ftrs = model.fc.in_features
I believe the problem can be solved if I can attach the last layer of the pretrained network to the following architecture, but I don't know how to attach them:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.fc1 = nn.Linear(num_ftrs, 512)
        self.fc2 = nn.Linear(512, 49*6)

    def forward(self, x):
        print(x.shape)
        x = x.view(-1, num_ftrs)
        # print(x.shape)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = torch.sigmoid(x.view(49, 6))
        return x
Any idea how this problem can be resolved?
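One way to attach them (a minimal sketch, not a verified solution, assuming torchvision's inception_v3, whose fc layer receives the pooled 2048-dimensional features): freeze the backbone and assign the custom head directly to model.fc, keeping the batch dimension in the reshape:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

# Sketch only: aux_logits=False so forward returns a single tensor;
# torchvision builds the aux head for the pretrained weights and then drops it.
model = models.inception_v3(pretrained=True, aux_logits=False)
for param in model.parameters():
    param.requires_grad = False          # freeze the backbone

num_ftrs = model.fc.in_features          # 2048 for Inception v3

class Head(nn.Module):
    def __init__(self, num_ftrs):
        super().__init__()
        self.fc1 = nn.Linear(num_ftrs, 512)
        self.fc2 = nn.Linear(512, 49 * 6)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x.view(-1, 49, 6)         # keep the batch dimension

model.fc = Head(num_ftrs)                # the new head's parameters stay trainable

# optimize only the head's parameters
optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)

Note that Inception v3 expects 299x299 inputs, and that view(49, 6) without the leading -1 would silently drop the batch dimension.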

Related

How can I train the last few layers of a RegNet-800MF backbone using PyTorch Lightning?

I am trying to get better results by allowing a few final layers of a previously frozen backbone (RegNet-800MF) to be trained. How can I implement this in PyTorch Lightning? I am very new to ML, so please excuse me if I have left out any important information.
My model (MechClassifier) calls another class (ParametersClassifier), which includes the pretrained RegNet as its frozen backbone. During training, the forward function passes inputs only through the backbone of the ParametersClassifier and not through the classifying layers. I will include the __init__ functions of both below.
My MechClassifier model:
class MechClassifier(pl.LightningModule):
    def __init__(
        self,
        num_classes,
        lr=4e-3,
        weight_decay=1e-8,
        gpus=1,
        max_epochs=30,
    ):
        super().__init__()
        self.lr = lr
        self.weight_decay = weight_decay
        self.__dict__.update(locals())

        self.backbone = ParametersClassifier.load_from_checkpoint(
            checkpoint_path="checkpoints/param_classifier/last.ckpt",
            num_classes=3,
            gpus=1,
        )
        self.backbone.freeze()
        self.backbone.eval()

        self.mf_classifier = nn.Sequential(
            nn.Linear(self.backbone.num_ftrs, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes),
        )
        self.wd_classifier = nn.Sequential(
            nn.Linear(self.backbone.num_ftrs, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes),
        )

    def forward(self, x):
        self.backbone.eval()
        with torch.no_grad():
            x = self.backbone.model(x)
        # x = self.model(x)
        out1 = self.mf_classifier(x)
        out2 = self.wd_classifier(x)
        # print(out1.size())
        return (out1, out2)
ParametersClassifier (loaded from checkpoint):
class ParametersClassifier(pl.LightningModule):
    def __init__(
        self,
        num_classes,
        lr=4e-3,
        weight_decay=0.05,
        gpus=1,
        max_epochs=30,
    ):
        super().__init__()
        self.lr = lr
        self.weight_decay = weight_decay
        self.__dict__.update(locals())

        self.model = models.regnet_y_800mf(pretrained=True)
        self.num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Identity()

        self.fc1 = nn.Linear(self.num_ftrs, num_classes)
        self.fc2 = nn.Linear(self.num_ftrs, num_classes)
        self.fc3 = nn.Linear(self.num_ftrs, num_classes)
        self.fc4 = nn.Linear(self.num_ftrs, num_classes)

    def forward(self, x):
        x = self.model(x)
        out1 = self.fc1(x)
        out2 = self.fc2(x)
        out3 = self.fc3(x)
        out4 = self.fc4(x)
        return (out1, out2, out3, out4)
You can look at the implementation of the RegNet model you are using in the torchvision source. Its forward function:
def forward(self, x: Tensor) -> Tensor:
    x = self.stem(x)
    x = self.trunk_output(x)
    x = self.avgpool(x)
    x = x.flatten(start_dim=1)
    x = self.fc(x)
    return x
Instead of using a torch.no_grad context manager as you did, you should rather switch requires_grad on and off as necessary. By default, module parameters have their requires_grad flag set to True, which means they can take part in gradient computation. If this flag is set to False, you can consider those components frozen.
Depending on which layers you want to freeze and which you want to finetune, you can do this manually. For example, to freeze the whole backbone but finetune the last section of its fourth block, replace the following lines in MechClassifier's __init__:
self.backbone.freeze()
self.backbone.eval()
With the following lines:
## freeze all
self.backbone.model.requires_grad_(False)
## unfreeze last section of 4th block of backbone
block4_section1 = getattr(self.backbone.model.trunk_output.block4, 'block4-1')
block4_section1.requires_grad_(True)
And perform inference on MechClassifier with a forward function like so:
def forward(self, x):
    self.backbone.eval()
    x = self.backbone.model(x)
    out1 = self.mf_classifier(x)
    out2 = self.wd_classifier(x)
    return (out1, out2)
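As a quick sanity check (a sketch, assuming the torchvision RegNet layout used above), you can also make sure the optimizer only receives the parameters that still require gradients after the selective unfreezing:

import torch

# Sketch: inside MechClassifier, e.g. as its configure_optimizers method.
def configure_optimizers(self):
    trainable = [p for p in self.parameters() if p.requires_grad]
    # Optional check: with the freezing above, the only backbone parameters
    # printed should live under trunk_output.block4.block4-1.
    for name, p in self.backbone.model.named_parameters():
        if p.requires_grad:
            print(name)
    return torch.optim.Adam(trainable, lr=self.lr, weight_decay=self.weight_decay)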

Replace BERT architecture with LSTM

I want to train on my dataset with BERT and modify the following architecture to use an LSTM instead.
The full code is here: https://www.geeksforgeeks.org/fine-tuning-bert-model-for-sentiment-analysis/
class BERT_architecture(nn.Module):
    def __init__(self, bert):
        super(BERT_architecture, self).__init__()
        self.bert = bert
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768, 512)
        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(512, 2)
        # softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        # pass the inputs to the model
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        # apply softmax activation
        x = self.softmax(x)
        return x
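A rough sketch of one possible LSTM replacement (the vocab_size and embedding_dim values here are hypothetical, and hidden_dim is set to 768 only so the existing head sizes carry over unchanged): the last hidden state of the LSTM takes the place of BERT's pooled [CLS] output:

import torch
import torch.nn as nn

class LSTM_architecture(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=768):
        super(LSTM_architecture, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_dim, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        # mask is ignored in this sketch; pack_padded_sequence could make use of it
        emb = self.embedding(sent_id)       # (batch, seq_len, embedding_dim)
        out, (hn, cn) = self.lstm(emb)      # hn: (num_layers, batch, hidden_dim)
        x = self.fc1(hn[-1])                # last hidden state instead of BERT's [CLS]
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)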

Pretrained ResNet-50 on ImageNet as CAE encoder performs very poorly

I am experimenting with different convolutional autoencoder architectures and decided to try a pretrained ResNet-50 as the encoder in my model. I tried two options: using the encoder with frozen pretrained weights, and using the pretrained weights only as initialization and training them further. I got better reconstruction results when training the ResNet weights, but it still cannot outperform my basic CAE with 3 conv layers in the encoder and 3 upsample + conv2d layers in the decoder. Can you help me explain this effect? I am training my network on different domains of the OfficeHome dataset. Here is my architecture:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.relu = nn.LeakyReLU(inplace=True)
        self.norm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        x = self.norm(x)
        return x


class ResnetConvAutoencoder(nn.Module):
    def __init__(self):
        super(ResnetConvAutoencoder, self).__init__()
        resnet = models.resnet50(pretrained=True, progress=True)
        # for param in resnet.parameters():
        #     param.requires_grad = False
        self.encoder = torch.nn.Sequential(*(list(resnet.children())[:-2]))  # output is 2048 x 8 x 8
        del resnet

        self.decoder = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(2048, 512),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(512, 256),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(256, 128),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(128, 16),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(16, 3),
            nn.Sigmoid()
        )  # output is 3 x 256 x 256 (same as input)

        self.encoder.cuda(2)  # training on several GPUs
        self.decoder.cuda(1)

    def forward(self, x):
        x = x.cuda(2)
        x = self.encoder(x).cuda(1)
        x = self.decoder(x).cuda(1)
        return x
And here are some example images from my simple CAE and the CAE with the ResNet encoder.
The train and test loss of the CAE with ResNet-50 is about 0.03 and does not go lower. The loss of my simple CAE is below 0.015 (MSE).

Different training results obtained from training a simple LSTM in Keras and PyTorch

I'm trying to port my LSTM model from Keras to PyTorch, but the results in PyTorch seem really bad at the moment. The network is really simple, as shown below.
model = Sequential()
model.add(LSTM(10, input_length=shape[1], input_dim=shape[2]))
# output shape: (1, 1)
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(10,activation="tanh"))
model.add(Dense(1,activation="linear"))
model.compile(loss="mse", optimizer="adam")
model.summary()
And I migrated it to the PyTorch framework:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, bilstm=False):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.isBi = bilstm
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=bilstm).double()
        # for name, param in self.lstm.named_parameters():
        #     if name.startswith("weight"):
        #         nn.init.orthogonal_(param)
        #     else:
        #         pass
        self.fc1 = nn.Sequential(nn.Linear(hidden_dim, 10).double(), nn.Tanh())
        self.final_layer1 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer2 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer3 = nn.Sequential(nn.Linear(10, 10).double(), nn.Tanh())
        self.final_layer4 = nn.Sequential(nn.Linear(10, output_dim).double())

    def forward(self, x):
        out, (hn, cn) = self.lstm(x)
        out = out[:, -1, :]
        out = self.fc1(out)
        out = self.final_layer1(out)
        out = self.final_layer2(out)
        out = self.final_layer3(out)
        out = self.final_layer4(out)
        return out
The result is really bad. I was wondering if the initialization methods/activation functions used in Keras are different from the ones I used in PyTorch (Keras seems to use hard_sigmoid where PyTorch uses sigmoid?).
Would really appreciate it if somebody could help me with this problem!
UPDATED
My training code in PyTorch:
criterion = nn.MSELoss()
model = LSTM(input_dim, hidden_dim, num_layers, output_dim, bilstm)
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, epoch_number + 1):
    model.train()
    iteration = 0
    for i, data in enumerate(train_loader):
        dat, label = data
        dat = dat.double()
        label = label.double()
        if torch.cuda.is_available():
            dat = dat.cuda()
            label = label.cuda()
        else:
            dat = Variable(dat)
            label = Variable(label)
        out = model(dat)
        optimizer.zero_grad()
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
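One thing worth trying (a sketch, not a guaranteed fix) is to initialize the PyTorch LSTM the way Keras does by default: glorot_uniform input kernels, orthogonal recurrent kernels, and zero biases with a unit forget-gate bias. PyTorch's nn.LSTM instead initializes everything uniformly in (-1/sqrt(hidden_dim), 1/sqrt(hidden_dim)), and its gates are ordered i, f, g, o:

import torch
import torch.nn as nn

def init_lstm_like_keras(lstm: nn.LSTM):
    # Mimic Keras defaults: glorot_uniform kernel, orthogonal recurrent kernel,
    # zero bias with a unit forget-gate bias (PyTorch gate order is i, f, g, o).
    with torch.no_grad():
        for name, param in lstm.named_parameters():
            if name.startswith("weight_ih"):
                nn.init.xavier_uniform_(param)
            elif name.startswith("weight_hh"):
                nn.init.orthogonal_(param)
            elif name.startswith("bias"):
                param.zero_()
                h = lstm.hidden_size
                param[h:2 * h].fill_(1.0)

model = LSTM(input_dim, hidden_dim, num_layers, output_dim, bilstm)
init_lstm_like_keras(model.lstm)

As for the activation question: older multi-backend Keras defaulted to hard_sigmoid for the recurrent activation, while tf.keras 2.x and PyTorch both use sigmoid.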

Regarding the design of a Train() function

I once saw the following implementation of a neural network. I am confused about the model.train() call in the Train() function: in the CNN_ForecastNet class, I do not find a train method.
class CNN_ForecastNet(nn.Module):
    def __init__(self):
        super(CNN_ForecastNet, self).__init__()
        self.conv1d = nn.Conv1d(3, 64, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
        self.fc1 = nn.Linear(64*2, 50)
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x):
        x = self.conv1d(x)
        x = self.relu(x)
        x = x.view(-1)
        # print('x size', x.size())
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = CNN_ForecastNet().to(device)
def Train():
    running_loss = .0
    model.train()
    for idx, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        preds = model(inputs.float())
        loss = criterion(preds, labels.float())
        loss.backward()
        optimizer.step()
        running_loss += loss
    train_loss = running_loss / len(train_loader)
    train_losses.append(train_loss.detach().numpy())
    print(f'train_loss {train_loss}')
As you can find in the documentation, the module's train function just sets a flag in the model to True (you can use model.eval() to set the flag to False).
This flag is used by some layers whose behavior changes in eval mode, most notably dropout and batch normalization layers.
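A tiny illustration of what that flag changes, using a dropout layer (this example is not part of the original code):

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(1, 8)

drop.train()       # what model.train() sets recursively on every submodule
print(drop(x))     # roughly half the entries zeroed, the rest scaled by 1/(1-p) = 2

drop.eval()        # what model.eval() sets
print(drop(x))     # identity: all ones, no scaling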
