Replace BERT architecture with LSTM - NLP

I want to train my dataset on BERT and modify the following architecture to use an LSTM instead.
The full code is here: https://www.geeksforgeeks.org/fine-tuning-bert-model-for-sentiment-analysis/
class BERT_architecture(nn.Module):
    def __init__(self, bert):
        super(BERT_architecture, self).__init__()
        self.bert = bert
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        # relu activation function
        self.relu = nn.ReLU()
        # dense layer 1
        self.fc1 = nn.Linear(768, 512)
        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(512, 2)
        # softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):
        # pass the inputs to the model
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        # output layer
        x = self.fc2(x)
        # apply softmax activation
        x = self.softmax(x)
        return x
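A minimal, untested sketch of one way to make the swap, assuming the same tokenized inputs (sent_id, mask) and a vocabulary size vocab_size; the embedding and hidden sizes are illustrative, not from the tutorial. A learned nn.Embedding replaces the BERT encoder, and the LSTM's final hidden state stands in for BERT's pooled [CLS] output, so the LogSoftmax head and the rest of the training loop can stay the same. The attention mask is accepted but ignored here; nn.utils.rnn.pack_padded_sequence would be the usual way to respect padding:

import torch.nn as nn

class LSTM_architecture(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super(LSTM_architecture, self).__init__()
        # learned embedding table replaces the BERT encoder
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_dim, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        # mask is kept for interface compatibility but not used here
        embedded = self.embedding(sent_id)            # (batch, seq_len, embedding_dim)
        output, (hidden, cell) = self.lstm(embedded)  # hidden: (num_layers, batch, hidden_dim)
        x = self.fc1(hidden[-1])                      # final hidden state replaces the pooled output
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)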

Related

How can I train the last few layers of RegNet-800MF backbone using PyTorch Lightning

I am trying to get better results by allowing a few final layers of a previously frozen backbone (RegNet-800MF) to be trained. How can I implement this in PyTorch Lightning? I am very new to ML so please excuse me if I have left any important information out.
My model (MechClassifier) calls another class (ParametersClassifier), which includes the pre-trained RegNet as its frozen backbone. During training, the forward function passes inputs only through the backbone of the ParametersClassifier and not through the classifying layers. I will include the __init__ functions of both below.
My MechClassifier model:
class MechClassifier(pl.LightningModule):
    def __init__(
        self,
        num_classes,
        lr=4e-3,
        weight_decay=1e-8,
        gpus=1,
        max_epochs=30,
    ):
        super().__init__()
        self.lr = lr
        self.weight_decay = weight_decay
        self.__dict__.update(locals())

        self.backbone = ParametersClassifier.load_from_checkpoint(
            checkpoint_path="checkpoints/param_classifier/last.ckpt",
            num_classes=3,
            gpus=1,
        )
        self.backbone.freeze()
        self.backbone.eval()

        self.mf_classifier = nn.Sequential(
            nn.Linear(self.backbone.num_ftrs, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes),
        )
        self.wd_classifier = nn.Sequential(
            nn.Linear(self.backbone.num_ftrs, 8),
            nn.ReLU(),
            nn.Linear(8, num_classes),
        )

    def forward(self, x):
        self.backbone.eval()
        with torch.no_grad():
            x = self.backbone.model(x)
        # x = self.model(x)
        out1 = self.mf_classifier(x)
        out2 = self.wd_classifier(x)
        # print(out1.size())
        return (out1, out2)
ParametersClassifier (loaded from checkpoint):
class ParametersClassifier(pl.LightningModule):
    def __init__(
        self,
        num_classes,
        lr=4e-3,
        weight_decay=0.05,
        gpus=1,
        max_epochs=30,
    ):
        super().__init__()
        self.lr = lr
        self.weight_decay = weight_decay
        self.__dict__.update(locals())

        self.model = models.regnet_y_800mf(pretrained=True)
        self.num_ftrs = self.model.fc.in_features
        self.model.fc = nn.Identity()
        self.fc1 = nn.Linear(self.num_ftrs, num_classes)
        self.fc2 = nn.Linear(self.num_ftrs, num_classes)
        self.fc3 = nn.Linear(self.num_ftrs, num_classes)
        self.fc4 = nn.Linear(self.num_ftrs, num_classes)

    def forward(self, x):
        x = self.model(x)
        out1 = self.fc1(x)
        out2 = self.fc2(x)
        out3 = self.fc3(x)
        out4 = self.fc4(x)
        return (out1, out2, out3, out4)
You can look at the implementation of the RegNet model you are using here. Its forward function:
def forward(self, x: Tensor) -> Tensor:
    x = self.stem(x)
    x = self.trunk_output(x)
    x = self.avgpool(x)
    x = x.flatten(start_dim=1)
    x = self.fc(x)
    return x
Instead of using a torch.no_grad context manager as you did, you should toggle the requires_grad flags as needed. By default, module parameters have requires_grad set to True, which means they participate in gradient computation; if the flag is set to False, you can consider those components frozen.
Depending on which layers you want to freeze and which you want to finetune, you can do this manually. For example, to freeze the whole backbone but finetune the last section of its fourth block, replace the following lines from MechClassifier's __init__:
self.backbone.freeze()
self.backbone.eval()
With the following lines:
## freeze all
self.backbone.model.requires_grad_(False)
## unfreeze last section of 4th block of backbone
block4_section1 = getattr(self.backbone.model.trunk_output.block4, 'block4-1')
block4_section1.requires_grad_(True)
And perform inference on MechClassifier with a forward function like so:
def forward(self, x):
    self.backbone.eval()
    x = self.backbone.model(x)
    out1 = self.mf_classifier(x)
    out2 = self.wd_classifier(x)
    return (out1, out2)
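One extra detail worth adding, as a hedged sketch rather than part of the answer above: with only some parameters unfrozen, the optimizer can be handed just the parameters that still require gradients. This assumes Lightning's configure_optimizers hook and an Adam optimizer (torch is assumed imported):

def configure_optimizers(self):
    # parameters frozen with requires_grad_(False) are filtered out here
    trainable = [p for p in self.parameters() if p.requires_grad]
    return torch.optim.Adam(trainable, lr=self.lr, weight_decay=self.weight_decay)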

How to extract the encoded features after running a PyTorch LSTM autoencoder model?

I am very new to PyTorch and Python in general, and I am now struggling to get the encoded features from my pre-trained LSTM autoencoder which can be seen below:
import torch
import torch.nn as nn
import numpy as np

# assumption: device is used below but was not defined in the original snippet
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Building an LSTM autoencoder
class Encoder(nn.Module):
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super(Encoder, self).__init__()
        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim, self.hidden_dim1, self.hidden_dim2 = embedding_dim, 4 * embedding_dim, 2 * embedding_dim
        self.rnn1 = nn.LSTM(
            input_size=n_features,
            hidden_size=self.hidden_dim1,  # 128
            num_layers=1,
            batch_first=True
        )
        self.rnn2 = nn.LSTM(
            input_size=self.hidden_dim1,
            hidden_size=self.hidden_dim2,  # 64
            num_layers=1,
            batch_first=True
        )
        self.rnn3 = nn.LSTM(
            input_size=self.hidden_dim2,
            hidden_size=embedding_dim,  # 32
            num_layers=1,
            batch_first=True
        )

    def forward(self, x):
        x = x.reshape((1, self.seq_len, self.n_features))
        x, (_, _) = self.rnn1(x)
        x, (_, _) = self.rnn2(x)
        x, (hidden_n, _) = self.rnn3(x)
        return hidden_n.reshape((self.n_features, self.embedding_dim))

class Decoder(nn.Module):
    def __init__(self, seq_len, input_dim=32, n_features=1):
        super(Decoder, self).__init__()
        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim2, self.hidden_dim1, self.n_features = 4 * input_dim, 2 * input_dim, n_features
        self.rnn1 = nn.LSTM(
            input_size=input_dim,
            hidden_size=input_dim,
            num_layers=1,
            batch_first=True
        )
        self.rnn2 = nn.LSTM(
            input_size=input_dim,
            hidden_size=self.hidden_dim1,
            num_layers=1,
            batch_first=True
        )
        self.rnn3 = nn.LSTM(
            input_size=self.hidden_dim1,
            hidden_size=self.hidden_dim2,
            num_layers=1,
            batch_first=True
        )
        self.output_layer = nn.Linear(self.hidden_dim2, n_features)

    def forward(self, x):
        x = x.repeat(self.seq_len, self.n_features)
        x = x.reshape((self.n_features, self.seq_len, self.input_dim))
        x, (hidden_n, cell_n) = self.rnn1(x)
        x, (hidden_n, cell_n) = self.rnn2(x)
        x, (hidden_n, cell_n) = self.rnn3(x)
        x = x.reshape((self.seq_len, self.hidden_dim2))
        return self.output_layer(x)

class RAE(nn.Module):
    def __init__(self, seq_len, n_features, embedding_dim=32):
        super(RAE, self).__init__()
        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim = embedding_dim
        self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
        self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
### TRAINING
def train_model(model, train_dataset, val_dataset, n_epochs):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss(reduction='mean').to(device)  # nn.L1Loss sum
    history = dict(train=[], val=[])
    for epoch in range(1, n_epochs + 1):
        model = model.train()
        train_losses = []
        for seq_true in train_dataset:
            optimizer.zero_grad()
            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
            loss = criterion(seq_pred, seq_true)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_dataset:
                seq_true = seq_true.to(device)
                seq_pred = model(seq_true)
                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())
        # add accuracy
        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        history['train'].append(train_loss)
        history['val'].append(val_loss)
        print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')
    return model.eval(), history
Once I trained my model I followed the advice given by ptrblck here and implemented it as follows:
activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook

model.encoder.register_forward_hook(get_activation('encoder'))
x = test_dataset_SR[1]  # instead of using his random example I used one example from my training set
x = x.cuda()
output = model(x)
print(activation['encoder'])
but this gives me this error:
2 def get_activation(name):
3 def hook(model, input, output):
----> 4 activation[name] = output.detach()
5 return hook
AttributeError: 'tuple' object has no attribute 'detach'
Can you please help me solve this issue? I want to take these encoded features, store them, and use them as input to another network. I know I could probably train the encoder separately (not sure), but I will need both the encoder and the decoder, so I thought hooks would be my salvation.
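One common cause of this error, offered as a hedged sketch rather than a verified fix: forward hooks receive whatever the hooked module's forward returns, and some modules return a tuple instead of a tensor (nn.LSTM, for instance, returns (output, (h_n, c_n))), on which .detach() fails. A hook that branches on the output type avoids the AttributeError:

activation = {}

def get_activation(name):
    def hook(model, input, output):
        # modules that return several values hand the hook a tuple;
        # detach the first tensor in that case
        if isinstance(output, tuple):
            activation[name] = output[0].detach()
        else:
            activation[name] = output.detach()
    return hook

Alternatively, since Encoder.forward already returns the embedding itself, calling model.encoder(x) inside a torch.no_grad() block yields the encoded features without any hook.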

PyTorch multi-class: ValueError: Expected input batch_size (416) to match target batch_size (32)

I have created a multi-class classification neural network. Training and validation iterators were created with the BucketIterator method with fields {'text_normalized_tweet': TEXT, 'label': LABEL}
TEXT = a tweet
LABEL = a float number (with 3 values: 0,1,2)
Below I execute a dummy example of my neural network:
import torch.nn as nn

class MultiClassClassifer(nn.Module):
    # define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        # constructor
        super(MultiClassClassifer, self).__init__()
        # embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # dense layer
        self.hiddenLayer = nn.Linear(embedding_dim, hidden_dim)
        # batch normalization layer
        self.batchnorm = nn.BatchNorm1d(hidden_dim)
        # output layer
        self.output = nn.Linear(hidden_dim, output_dim)
        # activation layer
        self.act = nn.Softmax(dim=1)  # 2d-tensor
        # initialize weights of embedding layer
        self.init_weights()

    def init_weights(self):
        initrange = 1.0
        self.embedding.weight.data.uniform_(-initrange, initrange)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        # packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        tensor, batch_size = packed_embedded[0], packed_embedded[1]
        hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
        return self.act(self.output(hidden_1))
Instantiate the model:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 3
model = MultiClassClassifer(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
When I call
text, text_lengths = batch.text_normalized_tweet
predictions = model(text, text_lengths).squeeze()
loss = criterion(predictions, batch.label)
it returns,
ValueError: Expected input batch_size (416) to match target batch_size (32).
model(text, text_lengths).squeeze() = torch.Size([416, 3])
batch.label = torch.Size([32])
I can see that the two objects have different sizes, but I have no clue how to fix this.
You may find the Google Colab notebook here
Shapes of each input/output tensor of my forward() method:
torch.Size([32, 10, 100]) #self.embedding(text)
torch.Size([320, 100]) #nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
torch.Size([320, 64]) #self.batchnorm(self.hiddenLayer(tensor))
torch.Size([320, 3]) #self.act(self.output(hidden_1))
You shouldn't be using the squeeze function after the forward pass; that doesn't make sense here.
After removing the squeeze call, as you can see, the shape of your final output is [320, 3] whereas [32, 3] is expected. The reason is that pack_padded_sequence concatenates the timesteps of every sequence in the batch, so its data tensor has one row per token (320 = 32 tweets × 10 tokens) rather than one row per tweet. One way to fix this is to average out the embeddings you obtain for each word after the self.embedding step, as shown below:
def forward(self, text, text_lengths):
    embedded = self.embedding(text)
    embedded = torch.mean(embedded, dim=1, keepdim=True)
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
    tensor, batch_size = packed_embedded[0], packed_embedded[1]
    hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
    return self.act(self.output(hidden_1))
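A simpler variant of the same idea, as an untested sketch (not from the answer above): once the embeddings have been averaged over the sequence dimension there is nothing left to pack, so the pack_padded_sequence step can be dropped entirely and the batch dimension stays at 32 throughout (torch is assumed imported):

def forward(self, text, text_lengths):
    embedded = self.embedding(text)          # (batch, seq_len, embedding_dim)
    pooled = torch.mean(embedded, dim=1)     # (batch, embedding_dim)
    hidden_1 = self.batchnorm(self.hiddenLayer(pooled))
    return self.act(self.output(hidden_1))   # (batch, output_dim)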

Pretrained ResNet-50 on ImageNet as CAE encoder performs very poorly

I am experimenting with different convolutional autoencoder architectures and have decided to try a pretrained ResNet-50 network as the encoder in my model. I tried two options: using the encoder with its weights frozen, and using the pretrained weights only as initialization. I got better reconstructions when training the ResNet weights, but it still cannot outperform my basic CAE with 3 conv layers in the encoder and 3 upsample + conv2d layers in the decoder. Can you help me explain this effect? I am training my network on different domains of the OfficeHome dataset. Here is my architecture:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.relu = nn.LeakyReLU(inplace=True)
        self.norm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        x = self.norm(x)
        return x

class ResnetConvAutoencoder(nn.Module):
    def __init__(self):
        super(ResnetConvAutoencoder, self).__init__()
        resnet = models.resnet50(pretrained=True, progress=True)
        # for param in resnet.parameters():
        #     param.requires_grad = False
        self.encoder = torch.nn.Sequential(*(list(resnet.children())[:-2]))  # output is 2048 x 8 x 8
        del resnet
        self.decoder = nn.Sequential(
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(2048, 512),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(512, 256),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(256, 128),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(128, 16),
            nn.Upsample(scale_factor=2, mode='bilinear'),
            ConvBlock(16, 3),
            nn.Sigmoid()
        )  # output is 3 x 256 x 256 (as input)
        self.encoder.cuda(2)  # training on several GPUs
        self.decoder.cuda(1)

    def forward(self, x):
        x = x.cuda(2)
        x = self.encoder(x).cuda(1)
        x = self.decoder(x).cuda(1)
        return x
I also have some example images from my simple CAE and from the CAE with the ResNet encoder.
The loss on train and test of the CAE with ResNet-50 is about 0.03 and does not go lower; the loss of my simple CAE is lower than 0.015 (MSE).

How to reshape the last layer of a PyTorch CNN model while doing transfer learning

I am trying to replicate a Keras structure in PyTorch (I am new to PyTorch).
Here is the Keras architecture:
base_model = InceptionV3(weights='imagenet', include_top=False)
x = base_model.output
x = Dense(512, activation='relu')(x)
predictions = Dense(49*6, activation='sigmoid')(x)
reshape = Reshape((49, 6))(predictions)
model = Model(inputs=base_model.input, outputs=reshape)
for layer in base_model.layers:
    layer.trainable = False
I want to reshape the last layer of my network. I have implemented transfer learning:
model = models.inception_v3(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
num_ftrs = model.fc.in_features
I believe the problem can be solved if I can attach the last layer of the pretrained network to my following architecture, but I don't know how to attach them:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.fc1 = nn.Linear(num_ftrs, 512)
        self.fc2 = nn.Linear(512, 49*6)

    def forward(self, x):
        print(x.shape)
        x = x.view(-1, num_ftrs)
        # print(x.shape)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        x = torch.sigmoid(x.view(49, 6))
        return x
Any idea how this problem can be resolved?
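A minimal, untested sketch of one way to attach the head, not from an accepted answer: replace the backbone's fc with the custom head, mirroring the Keras structure. It assumes a torchvision version that accepts aux_logits=False together with pretrained=True, and it keeps the batch dimension in the final reshape (unlike x.view(49, 6) above, which drops it):

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class Head(nn.Module):
    def __init__(self, num_ftrs):
        super(Head, self).__init__()
        self.fc1 = nn.Linear(num_ftrs, 512)
        self.fc2 = nn.Linear(512, 49 * 6)

    def forward(self, x):
        # x arrives already flattened as (batch, num_ftrs) from the backbone
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x.view(-1, 49, 6)  # like Keras' Reshape((49, 6)), batch dim kept

model = models.inception_v3(pretrained=True, aux_logits=False)
for param in model.parameters():
    param.requires_grad = False  # freeze the backbone, as in the Keras version

model.fc = Head(model.fc.in_features)  # new head stays trainable; expects 299x299 inputs

Since only the head's parameters require gradients, the optimizer can be given model.fc.parameters() alone.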
