PyTorch nn.Transformer learns to copy target - pytorch

I’m trying to train a Transformer Seq2Seq model using nn.Transformer class. I believe I am implementing it wrong, since when I train it, it seems to fit too fast, and during inference it repeats itself often. This seems like a masking issue in the decoder, and when I remove the target mask, the training performance is the same. This leads me to believe I am doing the target masking wrong. Here is my model code:
class TransformerModel(nn.Module):
def __init__(self,
vocab_size, input_dim, heads, feedforward_dim, encoder_layers, decoder_layers,
sos_token, eos_token, pad_token, max_len=200, dropout=0.5,
device=(torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))):
super(TransformerModel, self).__init__()
self.target_mask = None
self.embedding = nn.Embedding(vocab_size, input_dim, padding_idx=pad_token)
self.pos_embedding = nn.Embedding(max_len, input_dim, padding_idx=pad_token)
self.transformer = nn.Transformer(
d_model=input_dim, nhead=heads, num_encoder_layers=encoder_layers,
num_decoder_layers=decoder_layers, dim_feedforward=feedforward_dim,
dropout=dropout)
self.out = nn.Sequential(
nn.Linear(input_dim, feedforward_dim),
nn.ReLU(),
nn.Linear(feedforward_dim, vocab_size))
self.device = device
self.max_len = max_len
self.sos_token = sos_token
self.eos_token = eos_token
# Initialize all weights to be uniformly distributed between -initrange and initrange
def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.zero_()
self.decoder.weight.data.uniform_(-initrange, initrange)
# Generate mask covering the top right triangle of a matrix
def generate_square_subsequent_mask(self, size):
mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def forward(self, src, tgt):
# src: (Max source seq len, batch size, 1)
# tgt: (Max target seq len, batch size, 1)
# Embed source and target with normal and positional embeddings
embedded_src = (self.embedding(src) +
self.pos_embedding(
torch.arange(0, src.shape[1]).to(self.device).unsqueeze(0).repeat(src.shape[0], 1)))
# Generate target mask
target_mask = self.generate_square_subsequent_mask(size=tgt.shape[0]).to(self.device)
embedded_tgt = (self.embedding(tgt) +
self.pos_embedding(
torch.arange(0, tgt.shape[1]).to(self.device).unsqueeze(0).repeat(tgt.shape[0], 1)))
# Feed through model
outputs = self.transformer(src=embedded_src, tgt=embedded_tgt, tgt_mask=target_mask)
outputs = F.log_softmax(self.out(outputs), dim=-1)
return outputs

For those having the same problem, my issue was that I wasn't properly adding the SOS token to the target I was feeding the model, and the EOS token to the target I was using in the loss function.
For reference:
The target fed to the model should be: [SOS] ....
And the target used for the loss should be: .... [EOS]

Related

How can I apply cuda to custom model in pytorch?

The type of inputs is dictionary of tensors. So while training I convert device to cuda to use gpu. And my custom model is like above. Also I assigned cuda to the model.
class EmbeddingLayer(nn.Module):
def __init__(self):
super(EmbeddingLayer, self).__init__()
# other features
self.other_features_embedding = []
for feature_name in OTHER_FEATURES:
vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
embedding_dims = int(math.sqrt(len(vocabulary)))
embedding = nn.Embedding(len(vocabulary)+1, embedding_dims)
self.other_features_embedding.append(embedding)
# transformer features
item_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY['item']
self.item_embedding_dims = int(math.sqrt(len(item_vocabulary)))
self.item_embedding = nn.Embedding(len(item_vocabulary)+1, self.item_embedding_dims)
def forward(self, inputs):
# other features
encoded_other_features = []
for i, feature_name in enumerate(OTHER_FEATURES):
embedding = self.other_features_embedding[i](inputs[feature_name])
encoded_other_features.append(embedding)
encoded_other_features = torch.cat(encoded_other_features, -1)
# transformer features
encoded_sequence_item = self.item_embedding(inputs['sequence_item'])
encoded_target_item = self.item_embedding(inputs['target_item'])
positions = inputs['target_timestamp'].repeat(sequence_length-1, 1).transpose(0, 1) - inputs['sequence_timestamp']
encoded_positions = positions.repeat(1, self.item_embedding_dims).reshape(-1, self.item_embedding_dims, sequence_length-1).transpose(1,2)
encoded_sequence_item_with_position = encoded_sequence_item + encoded_positions
encoded_transformer_features = torch.cat((encoded_sequence_item_with_position, encoded_target_item.reshape(-1, 1, self.item_embedding_dims)), 1)
return encoded_other_features, encoded_transformer_features
class BST(nn.Module):
def __init__(self, hidden_units, dropout, num_heads):
super(BST, self).__init__()
...
self.embedding_layer = EmbeddingLayer()
...
def forward(self, inputs):
other_features, transformer_features = self.embedding_layer(inputs)
...
return self.output(features)
model = BST([256, 128], 0.3, 1)
model.to(device)
def train(model, optimizer, dataloader):
model.train()
for inputs in tqdm(dataloader, total=len(dataloader)):
for k, v in inputs.items():
inputs[k] = v.to(device)
model.zero_grad()
pred = model(inputs)
...
But following error occurs:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)
I think the error occurs at embedding in EmbeddingLayer. How can I fix this error to use gpu while training?
Your list of nn.Module is not registering the embedding layers as sub modules of your layer. In order to properly register a list of modules you should use nn.ModuleList. Therefore, you should add the following right after the loop in your __init__ function:
embeddings = []
for feature_name in OTHER_FEATURES:
vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
embedding_dims = int(math.sqrt(len(vocabulary)))
embedding = nn.Embedding(len(vocabulary)+1, embedding_dims)
embeddings.append(embedding)
self.other_features_embedding = nn.ModuleList(embeddings)

PyTorch multi-class: ValueError: Expected input batch_size (416) to match target batch_size (32)

I have created a mutli-class classification neural network. Training, and validation iterators where created with BigBucketIterator method with fields {'text_normalized_tweet':TEXT, 'label': LABEL}
TEXT = a tweet
LABEL = a float number (with 3 values: 0,1,2)
Below I execute a dummy example of my neural network:
import torch.nn as nn
class MultiClassClassifer(nn.Module):
#define all the layers used in model
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
#Constructor
super(MultiClassClassifer, self).__init__()
#embedding layer
self.embedding = nn.Embedding(vocab_size, embedding_dim)
#dense layer
self.hiddenLayer = nn.Linear(embedding_dim, hidden_dim)
#Batch normalization layer
self.batchnorm = nn.BatchNorm1d(hidden_dim)
#output layer
self.output = nn.Linear(hidden_dim, output_dim)
#activation layer
self.act = nn.Softmax(dim=1) #2d-tensor
#initialize weights of embedding layer
self.init_weights()
def init_weights(self):
initrange = 1.0
self.embedding.weight.data.uniform_(-initrange, initrange)
def forward(self, text, text_lengths):
embedded = self.embedding(text)
#packed sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
tensor, batch_size = packed_embedded[0], packed_embedded[1]
hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
return self.act(self.output(hidden_1))
Instantiate the model
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 3
model = MultiClassClassifer(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
When I call
text, text_lengths = batch.text_normalized_tweet
predictions = model(text, text_lengths).squeeze()
loss = criterion(predictions, batch.label)
it returns,
ValueError: Expected input batch_size (416) to match target batch_size (32).
model(text, text_lengths).squeeze() = torch.Size([416, 3])
batch.label = torch.Size([32])
I can see that the two objects have different sizes, but I have no clue how to fix this?
You may find the Google Colab notebook here
Shapes of each in, out tensor of my forward() method:
torch.Size([32, 10, 100]) #self.embedding(text)
torch.Size([320, 100]) #nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
torch.Size([320, 64]) #self.batchnorm(self.hiddenLayer(tensor))
torch.Size([320, 3]) #self.act(self.output(hidden_1))
You shouldn't be using the squeeze function after the forward pass, that doesn't make sense.
After removing the squeeze function, as you see, the shape of your final output is [320,3] whereas it is expecting [32,3]. One way to fix this is to average out the embeddings you obtain for each word after the self.Embedding function like shown below:
def forward(self, text, text_lengths):
embedded = self.embedding(text)
embedded = torch.mean(embedded, dim=1, keepdim=True)
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
tensor, batch_size = packed_embedded[0], packed_embedded[1]
hidden_1 = self.batchnorm(self.hiddenLayer(tensor))
return self.act(self.output(hidden_1))

A language model with only one embedding layer in both encode and decode only predict <eos>

I'm trying to make the model predict a word from a sentence using pretrained Huggingface's BERT as feature extractor. The model look like this
class BertAutoEncoder(nn.Module):
def __init__(self, vocab_size):
super().__init__()
decoder_layer = nn.TransformerDecoderLayer(768, 2, 1024, dropout=0.1)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, 2)
self.fc = nn.Linear(768, vocab_size)
def forward(self, memory, embedded_word):
output = self.transformer_decoder(embedded_word, memory)
output = self.fc(output)
return output
And when train/evaluate I call the model like this
bert = BertModel.from_pretrained('bert-base-uncased')
bert.requires_grad_(False)
...
memory = bert(**src).last_hidden_state.transpose(0, 1)
embeded_word = bert.embeddings(trg.data['input_ids'][:, :-1], token_type_ids=trg.data['token_type_ids'][:, :-1]).transpose(0, 1)
output = model(memory, embeded_word)
The loss reduced nicely but turned out the model only predict <eos> token.
I tried train the model with 1 batch of 32 samples and it did work when loss reduced pass 8e-6 but when I trained it with all data the loss could go way beyond that but none of the saved models work. Even the one with eval or train loss around 4e-6 - 8e-6.
Surprisingly the model would work if I use a separate decoder's Embedding like this
class BertAutoEncoderOld(nn.Module):
def __init__(self, vocab_size):
super().__init__()
decoder_layer = nn.TransformerDecoderLayer(768, 2, 1024, dropout=0.1)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, 2)
self.decoder = nn.Embedding(vocab_size, 768)
self.pos_decoder = PositionalEncoding(768, 0.5)
self.fc = nn.Linear(768, vocab_size)
def forward(self, memory, word):
tgt = self.decoder(word.data['input_ids'][:, :-1].transpose(0, 1))
tgt = self.pos_decoder(tgt)
output = self.transformer_decoder(tgt, memory)
output = self.fc(output)
return output
But I was asked to make it work with one Embedding and I have no idea how.
I tried
Reduce/increase batch from 32 to 8-64
Also tried 2 and 1024 batch size
Remove <eos> token and change it's attention mask to 0
But none of those work.
What did I do wrong and how to fix it?
Thanks
Edit per #emily qeustion
I change the data itself in collate function
text.data['attention_mask'][text.data['input_ids'] == 102] = 0
text.data['input_ids'][text.data['input_ids'] == 102] = 0
word.data['attention_mask'][word.data['input_ids'] == 102] = 0
word.data['input_ids'][word.data['input_ids'] == 102] = 0
It only used in Bert though.

How to get final hidden state of bidirectional 2-layers GRU in pytorch

I am struggling with understanding how to get hidden layers and concatenate them.
I am using the following code as an example:
class classifier(nn.Module):
#define all the layers used in model
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout):
#Constructor
super().__init__()
self.batch = BATCH_SIZE
self.hidden = hidden_dim
self.layers = n_layers
if(bidirectional):
self.directions = 2
else:
self.directions = 1
#embedding layer
self.embedding = nn.Embedding(vocab_size, embedding_dim)
#lstm layer
self.gru = nn.GRU(embedding_dim,
hidden_dim,
num_layers=n_layers,
bidirectional=bidirectional,
dropout=dropout,
batch_first=True)
#dense layer
self.fc = nn.Linear(hidden_dim * 2, output_dim)
#activation function
self.act = nn.Sigmoid()
def forward(self, text, text_lengths):
#text = [batch size,sent_length]
embedded = self.embedding(text)
#embedded = [batch size, sent_len, emb dim]
#packed sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
packed_output, (hidden, cell) = self.lstm(packed_embedded)
#hidden = [batch size, num layers * num directions,hid dim]
#cell = [batch size, num layers * num directions,hid dim]
#concat the final forward and backward hidden state
hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
#hidden = [batch size, hid dim * num directions]
dense_outputs=self.fc(hidden)
#Final activation function
outputs=self.act(dense_outputs)
return outputs
The line hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1) I did not get it.
As per my understanding, I was doing this which did not work.
hidden2 = hidden.view(batch_size,self.layers,self.directions,self.hidden)
hidden2 = torch.cat((hidden2[:,:,0,:],hidden2[:,:,1,:]),dim=1)
dense_outputs=self.fc(hidden2)
Can somebody please explain. I went through PyTorch documentation but did not get.
The shape[0] of hidden output for bidirectional GRU is 2. You should just concat two hidden output on dim=1:
hid_enc = torch.cat([hid_enc[0,:, :], hid_enc[1,:,:]], dim=1).unsqueeze(0)
As the explanation for usage of -1 and -2 as the index , as you know in python lists, the object in index -1 is the last object of the list(second object in our tensor list) and index -2 refers to the object before last object(first object in our case). So the code you did not understand is equivalent to the code in my answer

How can I use LSTM in pytorch for classification?

My code is as below:
class Mymodel(nn.Module):
def __init__(self, input_size, hidden_size, output_size, num_layers, batch_size):
super(Discriminator, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.num_layers = num_layers
self.batch_size = batch_size
self.lstm = nn.LSTM(input_size, hidden_size)
self.proj = nn.Linear(hidden_size, output_size)
self.hidden = self.init_hidden()
def init_hidden(self):
return (Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size)),
Variable(torch.zeros(self.num_layers, self.batch_size, self.hidden_size)))
def forward(self, x):
lstm_out, self.hidden = self.lstm(x, self.hidden)
output = self.proj(lstm_out)
result = F.sigmoid(output)
return result
I want to use LSTM to classify a sentence to good (1) or bad (0). Using this code, I get the result which is time_step * batch_size * 1 but not 0 or 1. How to edit the code in order to get the classification result?
Theory:
Recall that an LSTM outputs a vector for every input in the series. You are using sentences, which are a series of words (probably converted to indices and then embedded as vectors). This code from the LSTM PyTorch tutorial makes clear exactly what I mean (***emphasis mine):
lstm = nn.LSTM(3, 3) # Input dim is 3, output dim is 3
inputs = [autograd.Variable(torch.randn((1, 3)))
for _ in range(5)] # make a sequence of length 5
# initialize the hidden state.
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
autograd.Variable(torch.randn((1, 1, 3))))
for i in inputs:
# Step through the sequence one element at a time.
# after each step, hidden contains the hidden state.
out, hidden = lstm(i.view(1, 1, -1), hidden)
# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# *** (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
torch.randn((1, 1, 3)))) # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
One more time: compare the last slice of "out" with "hidden" below, they are the same. Why? Well...
If you're familiar with LSTM's, I'd recommend the PyTorch LSTM docs at this point. Under the output section, notice h_t is output at every t.
Now if you aren't used to LSTM-style equations, take a look at Chris Olah's LSTM blog post. Scroll down to the diagram of the unrolled network:
As you feed your sentence in word-by-word (x_i-by-x_i+1), you get an output from each timestep. You want to interpret the entire sentence to classify it. So you must wait until the LSTM has seen all the words. That is, you need to take h_t where t is the number of words in your sentence.
Code:
Here's a coding reference. I'm not going to copy-paste the entire thing, just the relevant parts. The magic happens at self.hidden2label(lstm_out[-1])
class LSTMClassifier(nn.Module):
def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
...
self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim)
self.hidden2label = nn.Linear(hidden_dim, label_size)
self.hidden = self.init_hidden()
def init_hidden(self):
return (autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim)),
autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim)))
def forward(self, sentence):
embeds = self.word_embeddings(sentence)
x = embeds.view(len(sentence), self.batch_size , -1)
lstm_out, self.hidden = self.lstm(x, self.hidden)
y = self.hidden2label(lstm_out[-1])
log_probs = F.log_softmax(y)
return log_probs
The main problem you need to figure out is the in which dim place you should put your batch size when you prepare your data. As far as I know, if you didn't set it in your nn.LSTM() init function, it will automatically assume that the second dim is your batch size, which is quite different compared to other DNN framework. Maybe you can try:
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
like this to ask your model to treat your first dim as the batch dim.
As a last layer you have to have a linear layer for however many classes you want i.e 10 if you are doing digit classification as in MNIST . For your case since you are doing a yes/no (1/0) classification you have two lablels/ classes so you linear layer has two classes. I suggest adding a linear layer as
nn.Linear ( feature_size_from_previous_layer , 2)
and then train the model using a cross-entropy loss.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

Resources