Metrics mismatch between BertForSequenceClassification Class and my custom Bert Classification - pytorch

I implemented my custom BERT binary classification model class by adding a classifier layer on top of the BERT model (attached below). However, the accuracy/metrics are significantly different from when I train with the official BertForSequenceClassification model, which makes me wonder if I am missing something in my class.
A few doubts I have:
When loading the official BertForSequenceClassification with from_pretrained, are the classifier weights initialized from the pretrained model as well, or are they randomly initialized? Because in my custom class they are randomly initialized.
import torch.nn as nn
from transformers import AutoConfig, AutoModel

class MyCustomBertClassification(nn.Module):
    def __init__(self, encoder='bert-base-uncased',
                 num_labels=2,
                 hidden_dropout_prob=0.1):
        super(MyCustomBertClassification, self).__init__()
        self.config = AutoConfig.from_pretrained(encoder)
        self.encoder = AutoModel.from_config(self.config)
        self.dropout = nn.Dropout(hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, input_sent):
        outputs = self.encoder(input_ids=input_sent['input_ids'],
                               attention_mask=input_sent['attention_mask'],
                               token_type_ids=input_sent['token_type_ids'],
                               return_dict=True)
        # outputs[1] is the pooled [CLS] representation (pooler_output)
        pooled_output = self.dropout(outputs[1])
        # for both tasks
        logits = self.classifier(pooled_output)
        return logits

Each model tells you via a warning message which layers are randomly initialized when you use the method from_pretrained:
from transformers import BertForSequenceClassification
b = BertForSequenceClassification.from_pretrained('bert-base-uncased')
Output:
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
The difference between your implementation and BertForSequenceClassification is that your model does not use any pretrained weights at all. The method from_config only builds the architecture from the config; it does not load pretrained weights from a state_dict:
import torch
from transformers import AutoModelForSequenceClassification, AutoConfig

b2 = AutoModelForSequenceClassification.from_config(AutoConfig.from_pretrained('bert-base-uncased'))
b3 = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

print("Does from_config provide pretrained weights: {}".format(
    torch.equal(b.bert.embeddings.word_embeddings.weight,
                b2.base_model.embeddings.word_embeddings.weight)))
print("Does from_pretrained provide pretrained weights: {}".format(
    torch.equal(b.bert.embeddings.word_embeddings.weight,
                b3.base_model.embeddings.word_embeddings.weight)))
Output:
Does from_config provide pretrained weights: False
Does from_pretrained provide pretrained weights: True
Therefore you probably want to change your class to:
class MyCustomBertClassification(nn.Module):
    def __init__(self, encoder='bert-base-uncased',
                 num_labels=2,
                 hidden_dropout_prob=0.1):
        super(MyCustomBertClassification, self).__init__()
        self.config = AutoConfig.from_pretrained(encoder)
        # from_pretrained loads the pretrained encoder weights
        self.encoder = AutoModel.from_pretrained(encoder)
        self.dropout = nn.Dropout(hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)

    def forward(self, input_sent):
        outputs = self.encoder(input_ids=input_sent['input_ids'],
                               attention_mask=input_sent['attention_mask'],
                               token_type_ids=input_sent['token_type_ids'],
                               return_dict=True)
        pooled_output = self.dropout(outputs[1])
        # for both tasks
        logits = self.classifier(pooled_output)
        return logits

myB = MyCustomBertClassification()
print(torch.equal(b.bert.embeddings.word_embeddings.weight, myB.encoder.embeddings.word_embeddings.weight))
Output:
True
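Keep in mind that even with this fix, both classifier heads remain randomly initialized (which is exactly what the warning above says for BertForSequenceClassification), so the heads themselves will not match. A quick sanity check, as a sketch using the b and myB objects from above:

print(torch.equal(b.classifier.weight, myB.classifier.weight))
# -> False (almost surely), since both heads are freshly initialized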

Related

How can I build this self-paced learning loss function and use it in keras?

I'm trying to implement self-paced learning (SPL) in my keras model. I was focusing on this work, where self-paced learning is also applied to a deep learning model, but the implementation is done using pytorch. I'm having some trouble converting the following pytorch code to keras.
Creating loss function
import torch
from torch import Tensor
import torch.nn as nn

class SPLLoss(nn.NLLLoss):
    def __init__(self, *args, n_samples=0, **kwargs):
        super(SPLLoss, self).__init__(*args, **kwargs)
        self.threshold = 0.1
        self.growing_factor = 1.3
        self.v = torch.zeros(n_samples).int()

    def forward(self, input: Tensor, target: Tensor, index: Tensor) -> Tensor:
        super_loss = nn.functional.nll_loss(input, target, reduction="none")
        v = self.spl_loss(super_loss)
        self.v[index] = v
        return (super_loss * v).mean()

    def increase_threshold(self):
        self.threshold *= self.growing_factor

    def spl_loss(self, super_loss):
        v = super_loss < self.threshold
        return v.int()
Applying loss function to train
import torch.optim as optim
from model import Model
from dataset import get_dataloader
from loss import SPLLoss

def train():
    model = Model(2, 2, 2, 0)
    dataloader = get_dataloader()
    criterion = SPLLoss(n_samples=len(dataloader.dataset))
    optimizer = optim.Adam(model.parameters())
    for epoch in range(10):
        for index, data, target in dataloader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target, index)
            loss.backward()
            optimizer.step()
        criterion.increase_threshold()
    return model
1. How can I define my loss function (SPL_loss) accordingly in keras?
2. How can I use model.fit() and increase the threshold between epochs, given that I'm using this data generator to load the data?
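A minimal TF2/Keras sketch of the same idea (my own, not from the referenced work): it assumes a custom training loop instead of model.fit() so the threshold can be raised between epochs, a generator/dataset that also yields each sample's index, and a sparse-categorical classification task; model, dataset, and num_samples are placeholders.

import tensorflow as tf

class SPLLoss:
    def __init__(self, n_samples, threshold=0.1, growing_factor=1.3):
        self.threshold = threshold
        self.growing_factor = growing_factor
        # per-sample inclusion flags, analogous to self.v in the PyTorch version
        self.v = tf.Variable(tf.zeros(n_samples, dtype=tf.float32))

    def __call__(self, y_true, y_pred, index):
        # per-sample losses; reduction is deferred so each sample can be weighted
        super_loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        v = tf.cast(super_loss < self.threshold, tf.float32)
        self.v.scatter_nd_update(tf.expand_dims(index, 1), v)
        return tf.reduce_mean(super_loss * v)

    def increase_threshold(self):
        self.threshold *= self.growing_factor

criterion = SPLLoss(n_samples=num_samples)
optimizer = tf.keras.optimizers.Adam()
for epoch in range(10):
    for index, data, target in dataset:          # the generator must yield indices
        with tf.GradientTape() as tape:
            output = model(data, training=True)
            loss = criterion(target, output, index)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    criterion.increase_threshold()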

keras model with one connection per input node

I would like to create a sequential model in keras with one hidden layer that has as many nodes as there are input nodes. Each input node should be connected to only one of the hidden nodes. All nodes in the hidden layer should be connected to a single output node, as in this image.
I would like to be able to specify the activation function of the hidden layer.
Is it possible to achieve that with a Sequential() model in keras?
Here is a custom layer where you can do everything you want:
import keras
import tensorflow as tf
from keras.layers import *
from keras import Sequential
import numpy as np

tf.set_random_seed(10)

class MyDenseLayer(keras.layers.Layer):
    def __init__(self):
        super(MyDenseLayer, self).__init__()

    def parametric_relu(self, _x):
        # some more or less complicated activation
        # with its own weight
        pos = tf.nn.relu(_x)
        neg = self.alphas * (_x - abs(_x)) * 0.5
        return pos + neg

    def build(self, input_shape):
        # main weight: one scalar per input node
        self.kernel = self.add_weight("kernel",
                                      shape=[int(input_shape[-1]), ],
                                      initializer=tf.random_normal_initializer())
        # any additional weights here
        self.alphas = self.add_weight('alpha', shape=[int(input_shape[-1]), ],
                                      initializer=tf.constant_initializer(0.0),
                                      dtype=tf.float32)
        self.size = int(input_shape[-1])

    def call(self, input):
        # multiplying by a diagonal weight matrix keeps exactly one weight per input
        linear = tf.matmul(input, self.kernel * tf.eye(self.size))
        nonlinear = self.parametric_relu(linear)
        return nonlinear

model = Sequential()
model.add(MyDenseLayer())
model.build((None, 4))
print(model.summary())

x = np.ones((5, 4))
print(model.predict(x))
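Since kernel * tf.eye(size) is just a diagonal matrix, the same one-weight-per-input connectivity can be written as an elementwise multiply. A TF2-style sketch (my own, using a standard activation instead of the parametric ReLU above):

import tensorflow as tf

class DiagonalDense(tf.keras.layers.Layer):
    """One weight per input node: y_i = activation(w_i * x_i)."""
    def __init__(self, activation=None):
        super().__init__()
        self.activation = tf.keras.activations.get(activation)

    def build(self, input_shape):
        self.kernel = self.add_weight("kernel",
                                      shape=(int(input_shape[-1]),),
                                      initializer="random_normal")

    def call(self, x):
        # elementwise multiply == matmul with a diagonal weight matrix
        return self.activation(x * self.kernel)

model = tf.keras.Sequential([
    DiagonalDense(activation="tanh"),
    tf.keras.layers.Dense(1),  # single output node connected to all hidden nodes
])
model.build((None, 4))
model.summary()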

Visualize the output of Vgg16 model by TSNE plot?

I need to visualize the output of a Vgg16 model which classifies 14 different classes.
I loaded the trained model and replaced the classifier's last layer with an Identity() layer, but the output is not grouped by category.
Here is the snippet (the number of samples here is 1000 images):
# model, optimizer and test_loader are defined earlier in the training script
import numpy
import torch
import torch.nn as nn
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

epoch = 800
PATH = 'vgg16_epoch{}.pth'.format(epoch)
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        return x

# replace the last classifier layer so the model outputs 4096-d features
model.classifier._modules['6'] = Identity()
model.eval()

logits_list = numpy.empty((0, 4096))
targets = []
with torch.no_grad():
    for step, (t_image, target, classess, image_path) in enumerate(test_loader):
        t_image = t_image.cuda()
        target = target.cuda()
        target = target.data.cpu().numpy()
        targets.append(target)
        logits = model(t_image)
        print(logits.shape)
        logits = logits.data.cpu().numpy()
        print(logits.shape)
        logits_list = numpy.append(logits_list, logits, axis=0)
        print(logits_list.shape)

tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=1000)
tsne_results = tsne.fit_transform(logits_list)

target_ids = range(len(targets))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=target_ids, cmap=plt.cm.get_cmap("jet", 14))
plt.colorbar(ticks=range(14))
plt.legend()
plt.show()
Here is what this script produced: I am not sure why I get all the colors in each cluster!
The VGG16 feature extractor outputs over 25k features to the classifier. I believe that is too much for t-SNE, so it's a good idea to add a new nn.Linear layer to reduce this number; then t-SNE may work better. In addition, I'd recommend two different ways to get the features from the model:
The best way to get them, regardless of the model, is by using the register_forward_hook method (a minimal sketch follows the FeatNet example below). You may find a notebook here with an example.
If you don't want to use a hook, I'd suggest this one. After loading your model, you may use the following class to extract the features:
class FeatNet(nn.Module):
    def __init__(self, vgg):
        super(FeatNet, self).__init__()
        self.features = nn.Sequential(*list(vgg.children())[:-1])

    def forward(self, img):
        out = self.features(img)
        # flatten (N, 512, 7, 7) to (N, 25088) feature vectors
        return torch.flatten(out, 1)
Now, you just need to build feat_net = FeatNet(vgg) and call feat_net(img) to get the features.
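And here is the promised sketch of the forward-hook approach (my own; it assumes a torchvision-style VGG16 where classifier[3] is the second 4096-d Linear layer):

features = []

def hook(module, inputs, output):
    # called on every forward pass; stash the activations of the hooked layer
    features.append(output.detach().cpu())

handle = model.classifier[3].register_forward_hook(hook)
with torch.no_grad():
    model(t_image)   # features[-1] now holds this batch's 4096-d activations
handle.remove()      # detach the hook when done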
To include the feature reducer, as I suggested before, you need to retrain your model doing something like:
class FeatNet(nn.Module):
    def __init__(self, vgg):
        super(FeatNet, self).__init__()
        self.features = nn.Sequential(*list(vgg.children())[:-1])
        self.feat_reducer = nn.Sequential(
            nn.Linear(25088, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU()
        )
        self.classifier = nn.Linear(1024, 14)

    def forward(self, img):
        x = self.features(img)
        x = torch.flatten(x, 1)   # (N, 25088)
        x_r = self.feat_reducer(x)
        return self.classifier(x_r)
Then, you can run your model returning x_r, that is, the reduced features. As I said, 25k features are too many for t-SNE. Another way to reduce this number is to use PCA instead of nn.Linear: send the 25k features to PCA, then train t-SNE on the PCA output. I prefer using nn.Linear, but you need to test both to see which one gives you better results.
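A sketch of that PCA route (assuming scikit-learn is available; raw_features stands for the (N, 25088) array collected from FeatNet):

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

pca = PCA(n_components=50)                 # shrink 25088 dims to 50 before t-SNE
reduced = pca.fit_transform(raw_features)
tsne_results = TSNE(n_components=2, perplexity=10).fit_transform(reduced)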

Is there a way to save a Keras model build in tensorflow 2.0 from Model Sub classing API?

Is there a way to save the entire model built using the tf.keras Model subclassing API after the training is done? I know we can use save_weights to save only the weights, but is there a way to save the whole model so that I can use it for prediction later, when I do not have the code available?
import tensorflow as tf
from tensorflow.keras import layers

class MyModel(tf.keras.Model):
    def __init__(self, num_classes=10):
        super(MyModel, self).__init__(name='my_model')
        self.num_classes = num_classes
        # Define your layers here.
        self.dense_1 = layers.Dense(32, activation='relu')
        self.dense_2 = layers.Dense(num_classes, activation='sigmoid')

    def call(self, inputs):
        # Define your forward pass here,
        # using layers you previously defined (in `__init__`).
        x = self.dense_1(inputs)
        return self.dense_2(x)

model = MyModel(num_classes=10)
# The compile step specifies the training configuration.
model.compile(optimizer=tf.keras.optimizers.RMSprop(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(data, labels, batch_size=32, epochs=5)
You can use the following steps to save the model after training, load it, and run inference:
Save Model after training
model.save(filepath="model")
# OR
tf.keras.models.save_model(model, filepath="model_")
Load Saved Model
loaded_model = tf.keras.models.load_model(filepath="model_")
Prediction using Loaded model
result = loaded_model.predict(test_db)
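As a quick sanity check (a sketch reusing test_db from above), the restored model should reproduce the original model's predictions:

import numpy as np

# predictions from the restored model should match the original model's
np.testing.assert_allclose(model.predict(test_db),
                           loaded_model.predict(test_db),
                           rtol=1e-5)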

How to add BatchNormalization loss to gradient calculation in tensorflow 2.0 using keras subclass API

Using the keras subclass API it is easy enough to add a batch normalization layer; however, the layer.losses list always appears empty. What is the correct method of including it in the training loss when doing tape.gradient(loss, lossmodel.trainable_variables), where lossmodel is some separate keras subclass model defining a more complicated loss function that must include the gradient losses?
For example, this is a minimal model with ONLY the batch norm layer. As far as I can tell, it has no losses:
class M(tf.keras.Model):
    def __init__(self, axis):
        super().__init__()
        self.layer = tf.keras.layers.BatchNormalization(
            axis=axis, scale=False, center=True,
            virtual_batch_size=1, input_shape=(6,))

    def call(self, x):
        out = self.layer(x)
        return out

m = M(1)

In [77]: m.layer.losses
Out[77]: []
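One likely explanation (my own note, not from the post): in TF2, BatchNormalization's moving-mean/variance updates are not regularization losses, so layer.losses stays empty; the statistics are updated automatically whenever the layer is called with training=True. Only regularizer penalties (e.g. a kernel_regularizer) land in model.losses, and those can be added to the training loss manually. A sketch, where x, y, and loss_fn are placeholders:

with tf.GradientTape() as tape:
    out = m(x, training=True)   # BN statistics update happens on this call
    loss = loss_fn(y, out)
    if m.losses:                # add regularization terms, if any exist
        loss += tf.add_n(m.losses)
grads = tape.gradient(loss, m.trainable_variables)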
