Model loss decreases but the performance of model(such as F1-score) does not increase.
I want to fine-tune a pertained language model XLM from Facebook to do NER tasks, so I linked a BiLSTM and CRF.
This is my model architecture. The entire code repo have been uploaded to github https://github.com/stefensa/XLM_NER
class XLM_BiLSTM_CRF(nn.Module):
def __init__(self, config, num_labels, params, dico, reloaded):
super().__init__()
self.config = config
self.num_labels = num_labels
self.batch_size = config.batch_size
self.hidden_dim = config.hidden_dim
self.xlm = TransformerModel(params, dico, True, True)
self.xlm.eval()
self.xlm.load_state_dict(reloaded['model'])
self.lstm = nn.LSTM(config.embedding_dim, config.hidden_dim // 2,
num_layers=1, bidirectional=True)
self.dropout = nn.Dropout(config.dropout)
self.classifier = nn.Linear(config.hidden_dim, config.num_class)
self.apply(self.init_bert_weights)
self.crf = CRF(config.num_class)
def forward(self, word_ids, lengths, langs=None, causal=False):
sequence_output = self.xlm('fwd', x=word_ids, lengths=lengths, causal=False).contiguous()
sequence_output, _ = self.lstm(sequence_output)
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
return self.crf.decode(logits)
def log_likelihood(self, word_ids, lengths, tags):
sequence_output = self.xlm('fwd', x=word_ids, lengths=lengths, causal=False).contiguous()
sequence_output, _ = self.lstm(sequence_output)
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
return - self.crf(logits, tags.transpose(0,1))
def init_bert_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
This is the initial state of my model.
And this is the 9th epoch performance of my model. The metrics do not change.
Can anyone solve my problem?
Related
I was trying to make RNN chatbot with PyTorch but : return F.embedding(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.9/site-packages/torch/nn/functional.py", line 2043, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
IndexError: index out of range in self
Kod bu:
import torch
import torch.nn as nn
import torch.nn.functional as F
import requests
from bs4 import BeautifulSoup
url = "https://barisozcan.com"
try:
response = requests.get(url)
except:
print("Bağlantını kontrol et!\n")
exit()
soup = BeautifulSoup(response.text, "html.parser")
# Define a recurrent neural network
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, num_classes):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding(input_size, hidden_size)
self.lstm = nn.LSTM(hidden_size, hidden_size)
self.fc = nn.Linear(hidden_size, num_classes)
def forward(self, input, hidden):
x = self.embedding(input.view(1, -1))
x, hidden = self.lstm(x.view(1, 1, -1), hidden)
x = self.fc(x.view(1, -1))
return x, hidden
def init_hidden(self):
return (torch.zeros(1, 1, self.hidden_size),
torch.zeros(1, 1, self.hidden_size))
# Define a chatbot class
class ChatBot(object):
def __init__(self, input_size, hidden_size, num_classes, conversations, responses):
self.rnn = RNN(input_size, hidden_size, num_classes)
self.word2index = {}
self.index2word = {}
self.word_to_vec_map = {}
self.conversations = conversations
self.responses = responses
self.words = []
# Prepare data for training the recurrent neural network
for conv in conversations:
for word in conv.split():
if word not in self.word2index:
self.word2index[word] = len(self.words)
self.index2word[len(self.words)] = word
self.words.append(word)
for resp in responses:
for word in resp.split():
if word not in self.word2index:
self.word2index[word] = len(self.words)
self.index2word[len(self.words)] = word
self.words.append(word)
# Define a random word embedding for each word
for i in range(len(self.words)):
self.word_to_vec_map[self.words[i]] = torch.randn(input_size)
def get_vec(self, word):
return self.word_to_vec_map[word]
def predict(self, conversation):
words = conversation.split()
inputt = []
for word in words:
if word in self.word2index:
inputt.append(self.word2index[word])
else:
print("Error: '{}' is not in the word embedding vocabulary.".format(word))
return None
# Pass the encoded conversation through the recurrent neural network
hidden = self.rnn.init_hidden()
for i in range(len(inputt)):
output, hidden = self.rnn(torch.tensor([inputt[i]]).view(1, -1), hidden)
# Use the hidden state to predict the next word
index = output.argmax().item()
return self.index2word[index]
# Define some example conversations and responses
#konuşmalar
conversations = [text for text in soup.stripped_strings]
n_con=[]
for c in conversations:
n_con.append(c.lower())
conversations.clear()
conversations=n_con
print(conversations)
n_con=None
#yanıtlar yapay zeka sanırım bunu kullanıyor
responses=conversations
# Create a chatbot
input_size = len(conversations)
hidden_size = 512
num_classes = len(conversations) + len(responses)
chatbot = ChatBot(input_size, hidden_size, num_classes, conversations, responses)
# Test the chatbot
while True:
conversation = input("Sen: ")
if conversation == "quit" or conversation=="exit":
break
try:
response = chatbot.predict(conversation)
print("FazAI: ", response)
except KeyError:
print("Key Error hatası: '{}'.Kelime haznesinde olmadığı için yazdığın metindeki kelimleler veya kelime hata veriyor.\n".format(conversation))
continue
How can I resolve this error?
What code do I need to replace with which code to fix this?
This code is a badly trained chatbot with certain phrases. It was supposed to chat with me.
I wrote the following code to solve a Seq2Seq regression problem. My implementation is based on the GRU and multi-head attention. The performance is horrible. I tried playing with the hyperparameters, but nothing changed. This led me to think it was a network architecture issue.
class Seq2Seq(nn.Module):
def __init__(self, input_size, output_size, hidden, num_heads):
super(Seq2Seq, self).__init__()
self.encoder = nn.GRU(input_size, hidden, 2)
self.decoder = nn.GRU(hidden, hidden, 2)
self.multihead_attn = nn.MultiheadAttention(hidden, num_heads)
self.linear = nn.Linear(hidden, output_size)
self.init_weights()
def init_weights(self):
self.linear.weight.data.normal_(0, 0.1)
def forward(self, x):
encoded, _ = self.encoder(x)
decoded, _ = self.decoder(encoded)
attention_output, _ = self.multihead_attn(decoded, decoded, decoded)
out = self.linear(attention_output)
return out
D_in = 4
D_out = 1
hidden = 16
num_heads = 4
seq2seq = Seq2Seq(input_size=D_in, output_size=D_out, hidden=hidden, num_heads=num_heads)
inputs = torch.rand((7, 100, D_in))
outputs = seq2seq(inputs)
Coming from TensorFlow background, I am trying to convert a snippet of code of the custom layer from Keras to PyTorch.
The custom layer in Keras looks like this:
class Attention_module(tf.keras.layers.Layer):
def __init__(self, class_num):
super(Attention_module,self).__init__(class_num)
self.class_num = class_num
self.Ws = None
def build(self, input_shape):
embedding_length = int(input_shape[2])
self.Ws = self.add_weight(shape=(self.class_num, embedding_length),
initializer=tf.keras.initializers.get('glorot_uniform'), trainable=True)
super(Attention_module, self).build(input_shape)
def call(self, inputs):
sentence_trans = tf.transpose(inputs, [0, 2, 1])
at = tf.matmul(self.Ws, sentence_trans)
at = tf.math.tanh(at)
at = K.exp(at - K.max(at, axis=-1, keepdims=True))
at = at / K.sum(at, axis=-1, keepdims=True)
v = K.batch_dot(at, inputs)
return v
I want to implement the same in the torch; I have already done the forward pass block but am confused about how to do the embedding and weight initialization the same as the above layer in PyTorch?
class Attention_module(torch.nn.Module):
def __init__(self, class_num):
# how to initialize weight with same as above keras layer?
def forward(self, inputs):
sentence_trans = inputs.permute(0, 2, 1)
at = torch.mm(self.Ws, sentence_trans)
at = torch.nn.Tanh(at)
at = torch.exp(at - torch.max(torch.Tensor(at), dim=-1, keepdims=True).values)
at = at / torch.sum(at, dim = -1, keepdims=True)
v = torch.einsum('ijk,ikl->ijl', at, inputs)
return v
Thank you!
class Attention_module(torch.nn.Module):
def __init__(self, class_num, input_shape):
super().__init__()
self.class_num = class_num
embedding_length = int(input_shape[2])
self.Ws = torch.nn.Embedding(num_embeddings=class_num,
embedding_dim=embedding_length) # Embedding layer
torch.nn.init.xavier_uniform_(self.Ws.weight) # Glorot initialization
Here's the reference for layer initialization methods. Xavier init is another name for Glorot init.
The _ at the end of torch.nn.init.xavier_uniform_ is a pytorch convention that signifies an inplace operation.
You can also use torch.nn.init at runtime. It doesn't have to be within __init__(). Like:
att = Attention_module(class_num, input_shape)
torch.nn.init.xavier_uniform_(att.Ws.weight)
or :
for param in att.parameters():
torch.nn.init.xavier_uniform_(param)
I have two neural networks running in parallel. Each gives a features map of same size say Nx1. Now I want weighted average of these embedding like this w1 * embed1 + w2 * embed2. I have tried these 1 2.But the weights are not updating. Any help would be appreciated. Here is how I am trying to do it:
class LinearWeightedAvg(nn.Module):
def __init__(self, n_inputs):
super(LinearWeightedAvg, self).__init__()
self.weight1 = Variable(torch.randn(1), requires_grad=True).cuda()
self.weight2 = Variable(torch.randn(1), requires_grad=True).cuda()
def forward(self, inp_embed):
return self.weight1 * inp_embed[0] + self.weight2 * inp_embed[1]
class EmbedBranch(nn.Module):
def __init__(self, feat_dim, embedding_dim):
super(EmbedBranch, self).__init__()
fc_layer1 = fc_layer
def forward(self, x):
x = self.fc_layer1(x)
return x
class EmbeddingNetwork(nn.Module):
def __init__(self, args, N):
super(EmbeddingNetwork, self).__init__()
embedding_dim = N
self.embed1 = EmbedBranch(N, N)
self.embed2 = EmbedBranch(N, N)
self.comb_branch = LinearWeightedAvg(metric_dim)
self.args = args
if args.cuda:
self.cuda()
def forward(self, emb1, emb2):
embeds1 = self.text_branch(emb1)
embeds2 = self.image_branch(emb2)
combined = self.comb_branch([embeds1, embeds2])
return combined
def train_forward(self, embed1, embed2):
combined = self(embed1, embed2)
embeds = model.train_forward(embed1, embed2)
loss = loss_func(embeds, labels)
running_loss.update(loss.data.item())
optimizer.zero_grad()
loss.backward()
Also I want the weight to be within 0-1 range.
Thanks,
You should use self.weightx = torch.nn.Parameter(your_inital_tensor) to register a tensor as a learnable parameter of the model.
So, I am used to use PyTorch and now decided to give Skorch a shot.
Here they define the network as
class ClassifierModule(nn.Module):
def __init__(
self,
num_units=10,
nonlin=F.relu,
dropout=0.5,
):
super(ClassifierModule, self).__init__()
self.num_units = num_units
self.nonlin = nonlin
self.dropout = dropout
self.dense0 = nn.Linear(20, num_units)
self.nonlin = nonlin
self.dropout = nn.Dropout(dropout)
self.dense1 = nn.Linear(num_units, 10)
self.output = nn.Linear(10, 2)
def forward(self, X, **kwargs):
X = self.nonlin(self.dense0(X))
X = self.dropout(X)
X = F.relu(self.dense1(X))
X = F.softmax(self.output(X), dim=-1)
return X
I prefer inputting lists of neurons in each layer i.e num_units=[30,15,5,2] would have 2 hidden layers with 15 and 5 neurons. Furthermore we have 30 features and 2 classes, thus re-writing it to something like this
class Net(nn.Module):
def __init__(
self,
num_units=[30,15,5,2],
nonlin=[F.relu,F.relu,F.relu],
dropout=[0.5,0.5,0.5],
):
super(Net, self).__init__()
self.layer_units = layer_units
self.nonlin = nonlin #Activation function
self.dropout = dropout #Drop-out rates in each layer
self.layers = [nn.Linear(i,p) for i,p in zip(layer_units,layer_units[1:])] #Dense layers
def forward(self, X, **kwargs):
print("Forwards")
for layer,func,drop in zip(self.layers[:-1],self.nonlin,self.dropout):
print(layer,func,drop)
X=drop(func(layer(X)))
X = F.softmax(X, dim=-1)
return X
should do the trick. The problem is that when calling
net = NeuralNetClassifier(Net,max_epochs=20,lr=0.1,device="cuda")
net.fit(X,y)
I get the error "ValueError: optimizer got an empty parameter list". I have narrowed it down to the removal of self.output = nn.Linear(10, 2) simply makes the net not enter forward i.e it seems like output is some kind of "trigger" variable. Is that really the case the network need a variable called output (being a layer) at the end, and that we are not free to define the variable-names ourself ?
Pytorch will look for subclasses of nn.Module, so changing
self.layers = [nn.Linear(i,p) for i,p in zip(layer_units,layer_units[1:])]
to
self.layers = nn.ModuleList([nn.Linear(i,p) for i,p in zip(layer_units,layer_units[1:])])
should work fine