One single-batch training on Huggingface Bert model "ruins" the model - pytorch

For some reason, I need to do further (2nd-stage) pre-training on Huggingface Bert model, and I find my training outcome is very bad.
After debugging for hours, surprisingly, I find even training one single batch after loading the base model, will cause the model to predict a very bad choice when I ask it to unmask some test sentences. I boil down my code to the minimal reproducible version here:
import torch
from transformers import AdamW, BertTokenizer
from transformers import BertForPreTraining
MSK_CODE = 103
CE_IGN_IDX = -100 # CrossEntropyLoss ignore index value
def sanity_check(tokenizer, inputs):
print('Label:', inputs["next_sentence_label"][0])
def test(tokenizer, model, topk=3):
test_data = "She needs to [MASK] that [MASK] has only ten minutes."
print('\n \033[92m', test_data, '\033[0m')
test_inputs = tokenizer([test_data],
padding=True, truncation=True, return_tensors="pt")
def classifier_hook(module, inputs, outputs):
unmask_scores, seq_rel_scores = outputs
token_ids = test_inputs['input_ids'][0]
masked_idx = (
token_ids == torch.tensor([MSK_CODE])
scores = unmask_scores[0][masked_idx]
cands = torch.argsort(scores, dim=1, descending=True)
for i, mask_cands in enumerate(cands):
top_cands = mask_cands[:topk].detach().cpu()
print(f'MASK[{i}] top candidates:', end=" ")
classifier = model.cls
hook = classifier.register_forward_hook(classifier_hook)
# load model
model = BertForPreTraining.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
# first test
test(tokenizer, model)
# our single-iteration inputs
# [CLS] 1 2 3 4 5 6 [SEP] 8 9 10 11 12 [SEP]
pair = [['the man went to the store', 'penguins are flightless birds']]
relation_label = 1
# construct inputs
inputs = tokenizer(pair, padding=True, truncation=True, return_tensors="pt")
inputs["next_sentence_label"] = torch.tensor([relation_label])
mask_labels = torch.full(inputs["input_ids"].shape, fill_value=CE_IGN_IDX)
inputs["labels"] = mask_labels
# mask two words
inputs["input_ids"][0][4] = MSK_CODE
inputs["input_ids"][0][9] = MSK_CODE
mask_labels[0][4] = tokenizer.convert_tokens_to_ids('to')
mask_labels[0][9] = tokenizer.convert_tokens_to_ids('are')
# train for one single iteration
sanity_check(tokenizer, inputs)
outputs = model(**inputs)
loss = outputs.loss
# second test
test(tokenizer, model)
Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
She needs to [MASK] that [MASK] has only ten minutes.
MASK[0] top candidates: ['know', 'understand', 'remember']
MASK[1] top candidates: ['she', 'he', 'it']
[CLS] the man went [MASK] the store [SEP] penguins [MASK] flightless birds [SEP]
['[UNK]', '[UNK]', '[UNK]', '[UNK]', 'to', '[UNK]', '[UNK]', '[UNK]', '[UNK]', 'are', '[UNK]', '[UNK]', '[UNK]', '[UNK]']
Label: tensor(1)
She needs to [MASK] that [MASK] has only ten minutes.
MASK[0] top candidates: ['are', 'know', 'be']
MASK[1] top candidates: ['are', 'is', 'she']
Basically, I use She needs to [MASK] that [MASK] has only ten minutes. as a test sentence to test the unmasking.
As you may see, at the beginning when I tested the base model, it works perfectly.
However, after I feed the pre-train model with a single pair of training batch:
[CLS] the man went [MASK] the store [SEP] penguins [MASK] flightless birds [SEP]
The updated model no longer makes sense, it unmasks She needs to [MASK] that [MASK] has only ten minutes. into She needs to [are] that [are] has only ten minutes.
I can think of two possibilities why this happens...
Bert model is extremely sensitive to training batch size, a small batch causes unacceptable bias.
there is a bug in the code?
So, any idea?


Pytorch BCE loss not decreasing for word sense disambiguation task

I am performing word sense disambiguation and have created my own vocabulary of the top 300k most common English words. My model is very simple where each word in the sentences (their respective index value) is passed through an embedding layer which embeds the word and average the resulting embedding. The averaged embedding is then sent through a linear layer, as shown in the model below.
class TestingClassifier(nn.Module):
def __init__(self, vocabSize, features, embeddingDim):
super(TestingClassifier, self).__init__()
self.embeddings = nn.Embedding(vocabSize, embeddingDim)
self.linear = nn.Linear(features, 2)
self.sigmoid = nn.Sigmoid()
def forward(self, inputs):
embeds = self.embeddings(inputs)
avged = torch.mean(embeds, dim=-1)
output = self.linear(avged)
output = self.sigmoid(output)
return output
I am running BCELoss as loss function and SGD as optimizer. My problem is that my loss barely decreases as training goes on, almost as if it converges with a very high loss. I have tried different learning rates (0.0001, 0.001, 0.01 and 0.1) but I get the same issue.
My training function is as follows:
def train_model(model,
earlyStop = False,
maxPatience = 1
validationAcc = []
patienceCounter = 0
stopTraining = False
# Train network
for epoch in range(epochs):
losses = []
for inputs, labels in tqdm(trainDataLoader, position=0, leave=True):
# Predict and calculate loss
prediction = model(inputs)
loss = lossFunction(prediction, labels)
# Backward propagation
# Readjust weights
print(sum(losses) / len(losses))
curValidAcc = check_accuracy(validDataLoader, model, isRnnModel) # Check accuracy on validation set
curTrainAcc = check_accuracy(trainDataLoader, model, isRnnModel)
print("Epoch", epoch + 1, "Training accuracy", curTrainAcc, "Validation accuracy:", curValidAcc)
# Control early stopping
if(patienceCounter == 0):
if(len(validationAcc) > 0 and curValidAcc < validationAcc[-1]):
benchmark = validationAcc[-1]
patienceCounter += 1
print("Patience counter", patienceCounter)
elif(patienceCounter == maxPatience):
print("EARLY STOP. Patience level:", patienceCounter)
stopTraining = True
if(curValidAcc < benchmark):
patienceCounter += 1
print("Patience counter", patienceCounter)
benchmark = curValidAcc
patienceCounter = 0
Batch size is 32 (training set contains 8000 rows), vocabulary size is 300k, embedding dimension is 24. I have tried adding more linear layers to the network, but it makes no difference. The prediction accuracy on the training and validation sets stays at around 50% (which is horrible) even after many epochs of training. Any help is much appreciated!

BERT document embedding

I am trying to do document embedding using BERT. The code I use is a combination of two sources. I use BERT Document Classification Tutorial with Code, and BERT Word Embeddings Tutorial. Below is the code, I feed the first 510 tokens of each document to the BERT model. Finally, I apply K-means clustering to these embeddings, but the members of each cluster are TOTALLY irrelevant. I am wondering how this is possible. Maybe something is wrong with my code. I would appreciate if you take a look at my code and tell if there is something wrong with it. I use Google colab to run this code.
# text_to_embedding function
import torch
from keras.preprocessing.sequence import pad_sequences
def text_to_embedding(tokenizer, model, in_text):
Uses the provided BERT 'model' and 'tokenizer' to generate a vector
representation of the input string, 'in_text'.
Returns the vector stored as a numpy ndarray.
# ===========================
# STEP 1: Tokenization
# ===========================
MAX_LEN = 510
# 'encode' will:
# (1) Tokenize the sentence
# (2) Prepend the '[CLS]' token to the start.
# (3) Append the '[SEP]' token to the end.
# (4) Map tokens to their IDs.
input_ids = tokenizer.encode(
in_text, # sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
max_length = MAX_LEN, # Truncate all sentences.
#return_tensors = 'pt' # Return pytorch tensors.
# Pad our input tokens. Truncation was handled above by the 'encode'
# function, which also makes sure that the '[SEP]' token is placed at the
# end *after* truncating.
# Note: 'pad_sequences' expects a list of lists, but we only have one
# piece of text, so we surround 'input_ids' with an extra set of brackets.
results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
value=0, truncating="post", padding="post")
# Remove the outer list.
input_ids = results[0]
# Create attention masks.
attn_mask = [int(i > 0) for i in input_ids]
# Cast to tensors.
input_ids = torch.tensor(input_ids)
attn_mask = torch.tensor(attn_mask)
# Add an extra dimension for the "batch" (even though there is only one
# input in this batch)
input_ids = input_ids.unsqueeze(0)
attn_mask = attn_mask.unsqueeze(0)
# ===========================
# STEP 1: Tokenization
# ===========================
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
# Copy the inputs to the GPU
input_ids =
attn_mask =
# telling the model not to build the backward graph will make this
# a little quicker.
with torch.no_grad():
# Forward pass, returns hidden states and predictions
# This will return the logits rather than the loss because we have
# not provided labels.
outputs = model(
input_ids = input_ids,
token_type_ids = None,
attention_mask = attn_mask)
hidden_states = outputs[2]
#Sentence Vectors
#To get a single vector for our entire sentence we have multiple
#application-dependent strategies, but a simple approach is to
#average the second to last hiden layer of each token producing
#a single 768 length vector.
# `hidden_states` has shape [13 x 1 x ? x 768]
# `token_vecs` is a tensor with shape [? x 768]
token_vecs = hidden_states[-2][0]
# Calculate the average of all ? token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)
# Move to the CPU and convert to numpy ndarray.
sentence_embedding = sentence_embedding.detach().cpu().numpy()
from transformers import BertTokenizer, BertModel
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
output_hidden_states = True, # Whether the model returns all hidden-states.
from transformers import BertTokenizer
# Load the BERT tokenizer.
print('Loadin BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
I don't know if it solves your problem but here's my 2 cent:
You don't have to calculate the attention mask and do the padding manually. Have a look at the documentation. Just call the tokenizer itself:
results = tokenizer(in_text, max_length=MAX_LEN, truncation=True)
input_ids = results.input_ids
attn_mask = results.attention_mask
# Cast to tensors
Instead of using the average of the second to last hidden layer, you can try the same thing with the last hidden layer; or you can use the vector represents [CLS] from the last layer

Unable to create custom dataset and dataloader using torchtext

I have questions regarding building custom dataset and iterator using torchtext. I used the following code found in this post and modified based on my case:
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
text_field = Field(sequential=True, eos_token="[CLS]", tokenize=tokenizer)
label_field = Field(sequential=False, use_vocab=False)
data_fields = [("file", None),
("text", text_field),
("label", label_field)]
train, val = train_test_split(input_dt, test_size=0.1)
train.to_csv("train_output_path", index=False)
val.to_csv("val_output_path", index=False)
train, val = TabularDataset(path="path", train="train.csv", validation="val.csv",
format="csv", skip_header=True, fields=data_fields)
When it comes to text_field.build_vocab(train), I got this error: TypeError: '<' not supported between instances of 'list' and 'int'.
The only difference between my code and the post is the pre-trained word embeddings. In the post, the author used glove, which I use XLNetTokenizer from transformers package. I also searched for other posts who used the similar method, but they all used the pre-trained word embeddings, therefore they did have such an issue.
Does anyone know how to fix this issue? Many thanks!
I think as you are using a predefined tokenizer you dont't need to build vocab, instead you can follow this steps. Showing an example of how to do it using BERT tokenizer.
Sentences: it is a list of of text data
lables: is the label associated
###tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
# For every sentence...
for sent in sentences:
# `encode_plus` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
# (5) Pad or truncate the sentence to `max_length`
# (6) Create attention masks for [PAD] tokens.
encoded_dict = tokenizer.encode_plus(
sent, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
max_length = 100, # Pad & truncate all sentences.
pad_to_max_length = True,
return_attention_mask = True, # Construct attn. masks.
return_tensors = 'pt', # Return pytorch tensors.
# Add the encoded sentence to the list.
# And its attention mask (simply differentiates padding from non-padding).
# Convert the lists into tensors.
input_ids =, dim=0)
attention_masks =, dim=0)
labels = torch.tensor(labels)
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
### Not combine the input id , mask and labels and divide the dataset
from import TensorDataset, random_split
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)
# Create a 90-10 train-validation split.
# Calculate the number of samples to include in each set.
train_size = int(0.90 * len(dataset))
val_size = len(dataset) - train_size
# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
### Not you call loader of these datasets
from import DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 32
# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
train_dataset, # The training samples.
sampler = RandomSampler(train_dataset), # Select batches randomly
batch_size = batch_size # Trains with this batch size.
# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
val_dataset, # The validation samples.
sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
batch_size = batch_size # Evaluate with this batch size.

predicting Y label using CuDNNLSTM

I want to use CuDNNLSTM to predict the Y label. I have a dataset and I want to use CuDNNLSTM to predict the code. I am considering the sentences as X label and the codes as Y label.
the model is actually giving probability matrix of every class. I want to know
1. How can I predict the actual sentence code
The dataset is somewhat this kind of:
Google headquarters is in California 98873
Google pixel is a very nice phone 98873
Steve Jobs was a great man 15890
Steve Jobs has done great technology innovations 15890
Microsoft is another great giant in technology 89736
Bill Gates founded Microsoft 89736
I took help from this link:
The below code I am using predicts the probability matrix, I want to know how can it predicts the actual sentence code.
Also, can we use tfidf vectorizer?
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# This is fixed.
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?#[\]^_`{|}~', lower=True)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X = tokenizer.texts_to_sequences(df['procedureNew'].values)
X = keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
Y = pd.get_dummies(df['SuggestedCpt1']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
epochs = 10
batch_size = 40
history =, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]*100, "%"))
new_sentence = ['Pixel phone is launched by Google']
seq = tokenizer.texts_to_sequences(new_procedure)
padded = keras.preprocessing.sequence.pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['98873', '15890', '89736', '87325', '23689', '10368', '45789', '36975', '26987', '64721']
print(pred, labels[np.argmax(pred)])
print("\npredicted sentence code is", labels[np.argmax(pred)])

How to increase speed of this ner model implemented from scratch using 1 million labeled sentences

I would like to use spacy's NER model to train a model from scratch using 1 Million sentences. The model has only two types of entities. This is the code I am using. Since, I can't share the data, I created a dummy dataset.
My main issue is that the model is taking too long to train. I would appreciate it if you can highlight any error in my code or suggest other methods to try to fasten training.
TRAIN_DATA = [ ('Ich bin in Bremen', {'entities': [(11, 17, 'loc')]})] * 1000000
import spacy
import random
from spacy.util import minibatch, compounding
def train_spacy(data,iterations):
nlp = spacy.blank('de')
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
# add labels
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
optimizer = nlp.begin_training()
for itn in range(iterations):
print("Statring iteration " + str(itn))
losses = {}
batches = minibatch(TRAIN_DATA, size=compounding(100, 64.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
print("Losses", losses)
return nlp
model = train_spacy(TRAIN_DATA, 20)
Maybe you can try this:
batches = minibatch(TRAIN_DATA, size=compounding(1, 512, 1.001))
