How to print the output weights for the output layer in BERT? - nlp

I would like to print the output vector/tensor in BERT and wasn't sure how to do it. I've been using the following example to walk myself through it:
https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX
It's a simple classification problem, but I want to be able to get the output vector before we classify the training examples. Can someone point to where in the code I can do this, and how?

Do you want the weights of the output layer or the logits? I think you want the logits. It's more work, but better in the long run, to subclass the model so you can play with it yourself. Here's part of a subclass I wrote where I wanted dropout and more control; I'll include it here so you can see how to access all the parts of the model:
import torch
from transformers import BertModel, BertPreTrainedModel

class MyBert(BertPreTrainedModel):
    def __init__(self, config, dropout_prob):
        super().__init__(config)
        self.num_labels = 2
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, self.num_labels)
        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        pooled_output = outputs[1]          # pooled [CLS] representation
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        outputs = (logits,) + outputs[2:]   # add hidden states and attentions if present
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs  # (loss), logits, (hidden_states), (attentions)
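To get at the "output vector before we classify" part of the question, here is a minimal sketch of pulling out both the pooled vector and the logits from the subclass above. It assumes the Hugging Face transformers tokenizer API; the sentence and the dropout value are placeholders:

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = MyBert.from_pretrained('bert-base-uncased', 0.1)  # 0.1 = dropout_prob
model.eval()

inputs = tokenizer("a placeholder sentence", return_tensors='pt')
with torch.no_grad():
    pooled = model.bert(**inputs)[1]  # vector fed to the classifier, shape (1, hidden_size)
    logits = model(**inputs)[0]       # classifier outputs, shape (1, num_labels)
print(pooled.shape, logits.shape)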

Related

SequenceClassifierOutput has a generator as loss instead of a tensor

I'm doing distillation from a RoBERTa model with an Adapter; I'm following this tutorial,
and in the function distill_roberta_weights() I just change teacher_model.config.to_dict()
to student.load_state_dict(teacher.state_dict(), strict=False), so the student model has the adapter too.
But when I train the distillation using the DistillationTrainer from here, I get the following error.
Do you have any idea what the problem is?
The student_output has a generator as its loss instead of a tensor; the cross-entropy part has no problem, as it uses the logits from the outputs.
EDIT:
I am adding more information
def distill_weights(teacher, student):
    """
    Recursively copies the weights of the (teacher) to the (student).
    This function is meant to be first called on a RobertaFor... model, but is
    then called recursively on every child of that model.
    The only part that's not fully copied is the encoder, of which only half is copied.
    """
    # If the part is an entire RoBERTa model or a RobertaFor..., unpack and iterate
    if isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor'):
        for teacher_part, student_part in zip(teacher.children(), student.children()):
            distill_weights(teacher_part, student_part)
    # Else if the part is an encoder, copy one out of every two layers
    elif isinstance(teacher, RobertaEncoder):
        teacher_encoding_layers = [layer for layer in next(teacher.children())]
        student_encoding_layers = [layer for layer in next(student.children())]
        for i in range(len(student_encoding_layers)):
            student_encoding_layers[i].load_state_dict(teacher_encoding_layers[2 * i].state_dict())
    # Else the part is a head or something else, copy the state_dict
    else:
        student.load_state_dict(teacher.state_dict(), strict=False)

def distill_roberta_based(teacher_model):
    """
    Distills a RoBERTa (teacher_model) the way DistilBERT does for a BERT model.
    The student model has the same configuration, except for the number of
    hidden layers, which is divided by 2.
    The student layers are initialized by copying one out of every two layers
    of the teacher, starting with layer 0. The head of the teacher is also copied.
    """
    # Set student configuration
    configuration = teacher_model.config.to_dict()
    configuration['num_hidden_layers'] //= 2
    configuration = RobertaConfig.from_dict(configuration)
    # Create student model
    student_model = type(teacher_model)(configuration)
    distill_weights(teacher=teacher_model, student=student_model)
    return student_model
# Trainer for training the distilled model
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # place teacher on the same device as the student
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        The distillation loss for distilling a BERT-like model.
        The loss takes the (teacher_logits), (student_logits) and (labels) for various losses.
        The (temperature) can be given, otherwise it's set to 1 by default.
        """
        outputs_student = model(**inputs)
        print(outputs_student)
        student_loss = outputs_student.loss
        # compute teacher output
        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)
        # assert size
        assert outputs_student.logits.size() == outputs_teacher.logits.size()
        # Classification loss (problem-specific loss)
        loss_function = CrossEntropyLoss()
        # Temperature and softmax
        student_logits = F.softmax(outputs_student.logits / self.args.temperature, dim=-1)
        teacher_logits = F.softmax(outputs_teacher.logits / self.args.temperature, dim=-1)
        loss_logits = loss_function(student_logits, teacher_logits)
        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
        return (loss, outputs_student) if return_outputs else loss

# create the student
student_model_adapter = distill_roberta_based(teacher_model)
# activate the adapter
student_model_adapter.set_active_adapters('parallel')
student_model_adapter.train_adapter('parallel')

trainer = DistillationTrainer(
    student_model_adapter,
    training_args,
    teacher_model=teacher_model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.args._n_gpu = 4
So the desired outputs_student should look like
SequenceClassifierOutput(loss=tensor([0.6899, 0.6902, 0.6926, 0.6913, 0.6906, 0.6904, 0.6922, 0.6917],
       device='cuda:0', grad_fn=<GatherBackward>), logits=tensor([[-1.2512e-03, -9.7885e-03],
       [ 6.2714e-03, -5.7755e-03], .....])
but instead the output is
SequenceClassifierOutput(loss=<generator object gather.<locals>.gather_map.<locals>.<genexpr> at 0x7f5bb4fbe9d0>, logits=tensor([[-0.0150,  0.0075],
       [-0.0122,  0.0181], ...
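As a side note (not a fix for the generator issue above): the usual soft-target distillation loss, e.g. in DistilBERT, pairs student log-probabilities with teacher probabilities via KLDivLoss, rather than passing two softmax outputs to CrossEntropyLoss. A hedged sketch with the same logits as above, where the temperature value is an assumption:

import torch.nn.functional as F
from torch.nn import KLDivLoss

T = 2.0  # temperature, an assumed value
kl = KLDivLoss(reduction='batchmean')
loss_logits = kl(
    F.log_softmax(outputs_student.logits / T, dim=-1),  # student: log-probabilities
    F.softmax(outputs_teacher.logits / T, dim=-1),      # teacher: probabilities
) * (T ** 2)  # rescale so gradient magnitudes are comparable across temperatures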

A language model with only one embedding layer in both encoder and decoder only predicts <eos>

I'm trying to make a model predict a word from a sentence, using pretrained Hugging Face BERT as a feature extractor. The model looks like this:
class BertAutoEncoder(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        decoder_layer = nn.TransformerDecoderLayer(768, 2, 1024, dropout=0.1)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, 2)
        self.fc = nn.Linear(768, vocab_size)

    def forward(self, memory, embedded_word):
        output = self.transformer_decoder(embedded_word, memory)
        output = self.fc(output)
        return output
And when training/evaluating I call the model like this:
bert = BertModel.from_pretrained('bert-base-uncased')
bert.requires_grad_(False)
...
memory = bert(**src).last_hidden_state.transpose(0, 1)
embedded_word = bert.embeddings(trg.data['input_ids'][:, :-1],
                                token_type_ids=trg.data['token_type_ids'][:, :-1]).transpose(0, 1)
output = model(memory, embedded_word)
The loss decreased nicely, but it turned out the model only predicts the <eos> token.
I tried training the model with 1 batch of 32 samples, and it did work once the loss dropped past 8e-6; but when I trained it with all the data, the loss went well beyond that and yet none of the saved models worked, even the ones with eval or train loss around 4e-6 - 8e-6.
Surprisingly, the model works if I use a separate decoder Embedding, like this:
class BertAutoEncoderOld(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        decoder_layer = nn.TransformerDecoderLayer(768, 2, 1024, dropout=0.1)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, 2)
        self.decoder = nn.Embedding(vocab_size, 768)
        self.pos_decoder = PositionalEncoding(768, 0.5)
        self.fc = nn.Linear(768, vocab_size)

    def forward(self, memory, word):
        tgt = self.decoder(word.data['input_ids'][:, :-1].transpose(0, 1))
        tgt = self.pos_decoder(tgt)
        output = self.transformer_decoder(tgt, memory)
        output = self.fc(output)
        return output
But I was asked to make it work with one Embedding, and I have no idea how.
I tried:
Reducing/increasing the batch size from 32 to 8-64
Batch sizes of 2 and 1024 as well
Removing the <eos> token and setting its attention mask to 0
But none of those worked.
What did I do wrong, and how can I fix it?
Thanks
Edit per @emily's question:
I change the data itself in the collate function:
# 102 is the [SEP] token id in bert-base-uncased, used here as the <eos> token
text.data['attention_mask'][text.data['input_ids'] == 102] = 0
text.data['input_ids'][text.data['input_ids'] == 102] = 0
word.data['attention_mask'][word.data['input_ids'] == 102] = 0
word.data['input_ids'][word.data['input_ids'] == 102] = 0
It's only used in BERT, though.
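One detail worth noting (an observation, not a confirmed fix): neither forward above passes a causal target mask, so nn.TransformerDecoder lets every target position attend to later ones. A sketch of a hypothetical variant with the standard causal mask, assuming the seq-first (tgt_len, batch, 768) layout used above:

import torch

# Hypothetical variant of BertAutoEncoder.forward with a causal target mask,
# so each position can only attend to earlier positions.
def forward(self, memory, embedded_word):
    tgt_len = embedded_word.size(0)  # seq-first layout: (tgt_len, batch, 768)
    tgt_mask = torch.triu(
        torch.full((tgt_len, tgt_len), float('-inf'), device=embedded_word.device),
        diagonal=1,
    )
    output = self.transformer_decoder(embedded_word, memory, tgt_mask=tgt_mask)
    return self.fc(output)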

LSTM/RNN in PyTorch: the relation between the forward method and training the model

I'm still fairly new to neural networks, so sorry in advance for any ambiguities in the following.
In a "standard" LSTM implementation for a language task, we have the following (sorry for the very rough sketches):
class LSTM(nn.Module):
    def __init__(self, *args):
        ...
    def forward(self, input, states):
        lstm_in = self.model['embed'](input)
        lstm_out, hidden = self.model['lstm'](lstm_in, states)
        return lstm_out, hidden
Later on, we call upon this model in the training step:
def train(*args):
    for epoch in range(epochs):
        ....
        *init_zero_states
        ...
        out, states = model(input, states)
        ...
    return model
Let's just say that I have 3 sentences as input:
sents = [[The, sun, is, shiny],
         [The, beach, was, very, windy],
         [Computer, broke, down, today]]
model = train(LSTM, sents)
All words in all sentences get converted to embeddings and loaded into the model.
Now the questions:
Does self.model['lstm'] iterate through all words from all sentences and make one output after every word, or after every sentence?
How does the model make the distinction between the 3 sentences? For example, after getting "The", "sun", "is", "shiny", does something (such as the states) in the 'lstm' reset and begin anew?
Is the "out" in the training step after out, states = model(input, states) the output after running all 3 sentences, and hence the combined "information" from all 3 sentences?
Thanks!
When using LSTMs in PyTorch you usually use the nn.LSTM module. Here is a quick example, and then an explanation of what happens inside:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.embedder = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedder(x)
        # every time you pass a new sentence into the model you need to create
        # a new hidden state (the LSTM requires, unlike RNNs, two hidden states in a tuple)
        hidden = (torch.zeros(num_layers, batch_size, hidden_size),
                  torch.zeros(num_layers, batch_size, hidden_size))
        x, hidden = self.lstm(x, hidden)
        # x contains the output states of every timestep;
        # for classification we mostly just want the last one
        x = x[:, -1]
        x = self.fc(x)
        x = self.softmax(x)
        return x
So, when you take a look at nn.LSTM, you see that all N embedded words are passed into it at once and you get all N outputs back (one from every timestep). That means that inside the lstm, it iterates over all words in the sentence embeddings; we just don't see that in the code. It also returns the hidden state of every timestep, but you don't have to use that further; in most cases you can just ignore it.
As pseudo code:
def lstm(x):
    hiddenstate = init_with_zeros()
    outputs, hiddenstates = [], []
    for e in x:
        output, hiddenstate = neuralnet(e, hiddenstate)
        outputs.append(output)
        hiddenstates.append(hiddenstate)
    return outputs, hiddenstates

sentence = ["the", "sun", "is", "shiny"]
sentence = embedding(sentence)
outputs, hiddenstates = lstm(sentence)
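To make the shapes concrete, here is a small runnable illustration (not from the original answer), using batch_first=True as in the Model class above; the sizes are arbitrary:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=1, batch_first=True)
x = torch.randn(4, 10, 8)                  # (batch, seq_len, input_size)
out, (h_n, c_n) = lstm(x)                  # hidden state defaults to zeros if omitted
print(out.shape)                           # torch.Size([4, 10, 16]): one output per timestep
print(h_n.shape)                           # torch.Size([1, 4, 16]): final hidden state per layer
print(torch.allclose(out[:, -1], h_n[0]))  # True: last timestep's output == final hidden state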

Visualize the output of a VGG16 model with a t-SNE plot?

I need to visualize the output of a VGG16 model which classifies 14 different classes.
I loaded the trained model and replaced the classifier layer with an Identity() layer, but it doesn't categorize the output.
Here is the snippet (the number of samples here is 1000 images):
epoch = 800
PATH = 'vgg16_epoch{}.pth'.format(epoch)
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

class Identity(nn.Module):
    def __init__(self):
        super(Identity, self).__init__()
    def forward(self, x):
        return x

model.classifier._modules['6'] = Identity()
model.eval()

logits_list = numpy.empty((0, 4096))
targets = []
with torch.no_grad():
    for step, (t_image, target, classess, image_path) in enumerate(test_loader):
        t_image = t_image.cuda()
        target = target.cuda()
        target = target.data.cpu().numpy()
        targets.append(target)
        logits = model(t_image)
        print(logits.shape)
        logits = logits.data.cpu().numpy()
        print(logits.shape)
        logits_list = numpy.append(logits_list, logits, axis=0)
        print(logits_list.shape)

tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=1000)
tsne_results = tsne.fit_transform(logits_list)
target_ids = range(len(targets))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], c=target_ids, cmap=plt.cm.get_cmap("jet", 14))
plt.colorbar(ticks=range(14))
plt.legend()
plt.show()
Here is what this script produced (plot not reproduced here): I am not sure why I have all the colors in each cluster!
VGG16 outputs over 25k features to the classifier. I believe that's too many for t-SNE. It's a good idea to include a new nn.Linear layer to reduce this number, so t-SNE may work better. In addition, I'd recommend two different ways to get the features from the model:
The best way to get them, regardless of the model, is by using the register_forward_hook method; a minimal sketch follows. You may find a notebook here with an example.
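As a rough illustration (not from the original answer), a hook on torchvision's VGG16 might look like this; the choice of model.classifier[5] (the dropout just before the final Linear, whose output is the 4096-d vector fed into it) is an assumption about the model in the question:

features = {}

def save_features(module, inputs, output):
    features['feats'] = output.detach()

handle = model.classifier[5].register_forward_hook(save_features)
_ = model(t_image)  # features['feats'] now holds the 4096-d activations
handle.remove()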
If you don't want to use a hook, I'd suggest this alternative. After loading your model, you may use the following class to extract the features:
class FeatNet(nn.Module):
    def __init__(self, vgg):
        super(FeatNet, self).__init__()
        self.features = nn.Sequential(*list(vgg.children())[:-1])
    def forward(self, img):
        return self.features(img)
Now you just need to instantiate feat_net = FeatNet(vgg) and call feat_net(img) to get the features.
To include the feature reducer, as I suggested before, you need to retrain your model with something like this:
class FeatNet(nn.Module):
    def __init__(self, vgg):
        super(FeatNet, self).__init__()
        self.features = nn.Sequential(*list(vgg.children())[:-1])
        self.feat_reducer = nn.Sequential(
            nn.Linear(25088, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU()
        )
        self.classifier = nn.Linear(1024, 14)

    def forward(self, img):
        x = self.features(img)
        x = torch.flatten(x, 1)  # flatten the conv feature maps to (batch, 25088)
        x_r = self.feat_reducer(x)
        return self.classifier(x_r)
Then you can change your model to return x_r, that is, the reduced features. As I said, 25k features are too many for t-SNE. Another way to reduce this number is to use PCA instead of nn.Linear: in that case, you feed the 25k features to PCA and then train t-SNE on the PCA output (a sketch follows). I prefer using nn.Linear, but you need to test both to check which one gives a better result.
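A hedged sketch of that PCA route, assuming scikit-learn and an (n_samples, n_features) feature array like the logits_list built in the question; the component counts are arbitrary starting points:

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

pca = PCA(n_components=50)                  # cut the raw features down first
reduced = pca.fit_transform(logits_list)
tsne = TSNE(n_components=2, perplexity=10, verbose=1)
tsne_results = tsne.fit_transform(reduced)  # (n_samples, 2), ready to scatter-plot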

How can I use an LSTM in PyTorch for classification?

My code is as below:
class Mymodel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, batch_size):
        super(Mymodel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.proj = nn.Linear(hidden_size, output_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size)),
                Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size)))

    def forward(self, x):
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        output = self.proj(lstm_out)
        result = F.sigmoid(output)
        return result
I want to use an LSTM to classify a sentence as good (1) or bad (0). Using this code, I get a result of shape time_step * batch_size * 1, rather than 0 or 1. How do I edit the code to get the classification result?
Theory:
Recall that an LSTM outputs a vector for every input in the series. You are using sentences, which are a series of words (probably converted to indices and then embedded as vectors). This code from the LSTM PyTorch tutorial makes clear exactly what I mean (***emphasis mine):
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3)))
          for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# *** (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
One more time: compare the last slice of "out" with "hidden" below, they are the same. Why? Well...
If you're familiar with LSTM's, I'd recommend the PyTorch LSTM docs at this point. Under the output section, notice h_t is output at every t.
Now if you aren't used to LSTM-style equations, take a look at Chris Olah's LSTM blog post and scroll down to the diagram of the unrolled network (not reproduced here).
As you feed your sentence in word-by-word (x_i-by-x_i+1), you get an output from each timestep. You want to interpret the entire sentence to classify it. So you must wait until the LSTM has seen all the words. That is, you need to take h_t where t is the number of words in your sentence.
Code:
Here's a coding reference. I'm not going to copy-paste the entire thing, just the relevant parts. The magic happens at self.hidden2label(lstm_out[-1])
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        ...
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        x = embeds.view(len(sentence), self.batch_size, -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(lstm_out[-1])
        log_probs = F.log_softmax(y)
        return log_probs
The main problem you need to figure out is in which dim to put your batch size when you prepare your data. As far as I know, if you didn't set it in your nn.LSTM() init function, it will automatically assume that the second dim is your batch size, which is quite different from other DNN frameworks. Maybe you can try:
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
like this, to ask your model to treat the first dim as the batch dim.
As the last layer you have to have a linear layer for however many classes you want, i.e. 10 if you are doing digit classification as in MNIST. For your case, since you are doing a yes/no (1/0) classification, you have two labels/classes, so your linear layer has two outputs. I suggest adding a linear layer such as
nn.Linear(feature_size_from_previous_layer, 2)
and then training the model using a cross-entropy loss:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
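For completeness, a minimal sketch of one training step with that setup, building on the criterion and optimizer above; net, inputs, and labels are hypothetical placeholders:

logits = net(inputs)              # shape (batch, 2): raw scores, no softmax needed,
                                  # since CrossEntropyLoss applies log-softmax itself
loss = criterion(logits, labels)  # labels are class indices 0 or 1, shape (batch,)
optimizer.zero_grad()
loss.backward()
optimizer.step()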
