When training a BERT-based model, one can set num_labels:
AutoConfig.from_pretrained(BERT_MODEL_NAME, num_labels=num_labels)
For example, if we want the model to predict 3 classes, we can use num_labels=3.
My question is: what does this do internally? Is it just connecting an nn.Linear to the last embedding layer?
Thanks
If num_labels is set, the model is presumably being used for classification. You can go to the BERT documentation on Hugging Face, search for the sequence-classification class, and look at the code; there you will find the following:
https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/bert/modeling_bert.py#L1572
if labels is not None:
    if self.config.problem_type is None:
        if self.num_labels == 1:
            self.config.problem_type = "regression"
        elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
            self.config.problem_type = "single_label_classification"
        else:
            self.config.problem_type = "multi_label_classification"

    if self.config.problem_type == "regression":
        loss_fct = MSELoss()
        if self.num_labels == 1:
            loss = loss_fct(logits.squeeze(), labels.squeeze())
        else:
            loss = loss_fct(logits, labels)
    elif self.config.problem_type == "single_label_classification":
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
    elif self.config.problem_type == "multi_label_classification":
        loss_fct = BCEWithLogitsLoss()
        loss = loss_fct(logits, labels)
So, as we can see, the number of labels affects which loss function is used.
Hope this answers your question.
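To the architecture part of the question: yes, in BertForSequenceClassification, num_labels simply sets the output size of a final linear head applied to the pooled [CLS] representation (self.classifier = nn.Linear(config.hidden_size, config.num_labels) in the Hugging Face source). A minimal standalone sketch of that head, with bert-base's hidden size assumed:

import torch
import torch.nn as nn

hidden_size, num_labels = 768, 3   # 768 for bert-base; num_labels as passed to AutoConfig

# Simplified version of the head BertForSequenceClassification puts on top of BERT
dropout = nn.Dropout(0.1)
classifier = nn.Linear(hidden_size, num_labels)

pooled_output = torch.randn(8, hidden_size)  # stand-in for BERT's pooled [CLS] output
logits = classifier(dropout(pooled_output))  # shape (8, 3); fed to the loss logic above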
I'm using AutoModelForQuestionAnswering from transformers for semantic search. I could not find a way to evaluate it, given that I do not have predicted values and I do not have train and test data. Here's the code:
import torch
from transformers import AutoModelForQuestionAnswering, CamembertTokenizer

def load_model():
    model_name = "camembert-base"
    tokenizer = CamembertTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return model, tokenizer

model, tokenizer = load_model()

def answer_question(search, text):  # wrapper reconstructed so the trailing return is valid; search = query, text = passage
    # input_ids are the indices corresponding to each token in the sentence.
    # input_ids = tokenizer.encode(search, text)
    encoded_output = tokenizer.encode_plus(text=search, text_pair=text, padding=True)
    input_ids = torch.tensor(encoded_output['input_ids'])
    attention_mask = torch.tensor(encoded_output['attention_mask'])
    token_type_ids = torch.zeros_like(input_ids)  # CamemBERT does not use segment ids; must match input shape
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Batch size of 1
    input_ids = input_ids.unsqueeze(0)
    token_type_ids = token_type_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    print("this is the encode output", encoded_output)
    print("this is token_type_ids", token_type_ids)
    print("this is attention mask", attention_mask)
    print("this is input_ids", input_ids)
    print("this is tokens", tokens)

    output = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    print(model.eval())  # prints the module structure; eval() also disables dropout

    # 3. GET THE ANSWER SPAN
    # Once we have the most likely start and end tokens, we grab all the tokens
    # between them and convert the tokens back to words!
    answer_start = torch.argmax(output.start_logits)  # most likely beginning of the answer
    answer_end = torch.argmax(output.end_logits)      # most likely end of the answer
    if answer_end >= answer_start:
        answer = " ".join(tokens[answer_start:answer_end + 1])
    else:
        answer = "No Answer"
    return answer
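If there is no labeled data at all, one unsupervised proxy is to rank candidate passages by the model's own span confidence. This is a heuristic, not a real evaluation metric, but it yields a comparable score per (query, passage) pair. A minimal sketch (score_span is a hypothetical helper built on the output above):

import torch

def score_span(output):
    # Heuristic confidence: product of the best start and end probabilities
    start_probs = torch.softmax(output.start_logits, dim=-1)
    end_probs = torch.softmax(output.end_logits, dim=-1)
    return (start_probs.max() * end_probs.max()).item()

# Usage: run the model on each candidate passage and keep the highest-scoring one.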
I'm trying to get started with reinforcement learning using OpenAI Gym. I tried to tackle the Hotter-Colder exercise (https://gym.openai.com/envs/HotterColder-v0/).
For the action space, I am passing a Box space to ensure it is a continuous space. Even though I specify the dtype as int32, when I train the model via model.learn it always receives the values as float32 between 0 and 2.5.
As you can see in the code below, the action_space is specified as an int32 Box, but during the training and prediction phases the action value is always array[float32]. Furthermore, rather than getting values between 1.0 and 100.0, the values seem to be stuck between 0.0 and 2.5. Does anyone know how to solve this?
Thank you very much.
Here is my code:
import numpy as np
from gym import Env
from gym.spaces import Box, Discrete

class HotterColder(Env):
    def __init__(self):
        self.range = 100
        self.guess_max = 100
        self.number = 0
        self.guess_count = 0
        self.action_space = Box(low=1, high=100, shape=(1,), dtype=np.int32)
        self.observation_space = Discrete(4)
        self.state = 0
        np.random.seed(0)

    def reset(self):
        self.number = np.random.randint(low=1, high=self.range)
        self.guess_count = 0
        self.observation = 0
        return self.state

    def render(self):
        pass

    def step(self, action):
        guess = int(action[0])
        if guess < self.number:
            self.state = 1
        elif guess > self.number:
            self.state = 3
        else:
            self.state = 2
        self.guess_count += 1
        done = self.guess_count >= self.guess_max
        reward = ((min(guess, self.number) + self.range) / (max(guess, self.number) + self.range)) ** 2
        info = {"guess": guess, "actual": self.number, "guesses": self.guess_count, "reward": reward, "state": self.state}
        if done:
            if guess == self.number:
                print("Correct guess." + str(info))
        return self.state, reward, done, info
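For what it's worth, most continuous-control implementations (stable-baselines3 included) sample float32 actions from a Gaussian regardless of the Box dtype, which would explain both the float values and the narrow range. A common workaround, sketched under that assumption, is to declare the Box as float and map the action to an integer guess inside step() (the [-1, 1] range below is a typical policy output convention, not something from the original code):

import numpy as np
from gym.spaces import Box

# Declare the space as float32; the agent will sample floats anyway
action_space = Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)

def action_to_guess(action, low=1, high=100):
    # Map a float action in [-1, 1] to an integer guess in [low, high]
    scaled = (float(action[0]) + 1.0) / 2.0 * (high - low) + low
    return int(np.clip(round(scaled), low, high))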
I'm using PyTorch Lightning to write a simple trainer, but when I try to run it, 9 times out of 10 it fails with "CUDA error: device-side assert". Simply printing a newline beforehand somehow seems to make it work. Any ideas?
My code:
class Elementwise(nn.ModuleList):
    """
    A simple network container.
    Parameters are a list of modules.
    Inputs are a 3d Tensor whose last dimension is the same length
    as the list.
    Outputs are the result of applying modules to inputs elementwise.
    An optional merge parameter allows the outputs to be reduced to a
    single Tensor.
    """

    def __init__(self, merge=None, *args):
        assert merge in [None, 'first', 'concat', 'sum', 'mlp']
        self.merge = merge
        super(Elementwise, self).__init__(*args)

    def forward(self, inputs):
        inputs_ = [feat.squeeze(1) for feat in inputs.split(1, dim=1)]
        for i, j in enumerate(inputs_):
            inp = torch.tensor(j).to(device).long()
            inputs_[i] = inp
        # this does not work
        outputs = [f(x) for i, (f, x) in enumerate(zip(self, inputs_))]
        if self.merge == 'first':
            return outputs[0]
        elif self.merge == 'concat' or self.merge == 'mlp':
            return torch.cat(outputs, 1)
        elif self.merge == 'sum':
            return sum(outputs)
        else:
            return outputs
but somehow magically this works:
class Elementwise(nn.ModuleList):
    """
    A simple network container.
    Parameters are a list of modules.
    Inputs are a 3d Tensor whose last dimension is the same length
    as the list.
    Outputs are the result of applying modules to inputs elementwise.
    An optional merge parameter allows the outputs to be reduced to a
    single Tensor.
    """

    def __init__(self, merge=None, *args):
        assert merge in [None, 'first', 'concat', 'sum', 'mlp']
        self.merge = merge
        super(Elementwise, self).__init__(*args)

    def forward(self, inputs):
        inputs_ = [feat.squeeze(1) for feat in inputs.split(1, dim=1)]
        for i, j in enumerate(inputs_):
            inp = torch.tensor(j).to(device).long()
            inputs_[i] = inp
        print("")
        outputs = [f(x) for i, (f, x) in enumerate(zip(self, inputs_))]
        if self.merge == 'first':
            return outputs[0]
        elif self.merge == 'concat' or self.merge == 'mlp':
            return torch.cat(outputs, 1)
        elif self.merge == 'sum':
            return sum(outputs)
        else:
            return outputs
Any idea how this error can be fixed by simply printing to the output?
Edit: this error is only raised when using PyTorch Lightning as the training abstraction; plain PyTorch works fine.
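One debugging note: CUDA device-side asserts are reported asynchronously, so the Python line in the traceback is often not the operation that failed, and unrelated host-side changes (such as an extra print) can shift where the error surfaces. Forcing synchronous kernel launches usually reveals the real culprit, which for errors like this is frequently an out-of-range index into an nn.Embedding:

import os

# Must be set before CUDA is initialized; makes kernel launches synchronous
# so the traceback points at the operation that actually failed
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"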
What is the proper approach to unsupervised comparison of semantic similarity between two short-text corpora? Comparing LDA topic distributions for the two doesn't seem to be a solution, since for short documents the generated topics do not really capture the semantics very well. Chunking didn't help either, because consecutive tweets don't have to be on the same topic. Is, for example, building a matrix of cosine similarities between the document TF-IDFs of the two corpora a good way to go?
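To make that last option concrete, here is a minimal sketch of the TF-IDF cosine-similarity matrix with scikit-learn (corpus_a and corpus_b are placeholder document lists), fitting one vectorizer on both corpora so the matrices share a vocabulary:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus_a = ["first tweet from corpus A", "another short text"]
corpus_b = ["a tweet from corpus B", "yet another document"]

vectorizer = TfidfVectorizer()
vectorizer.fit(corpus_a + corpus_b)  # shared vocabulary across both corpora

sims = cosine_similarity(vectorizer.transform(corpus_a),
                         vectorizer.transform(corpus_b))
# sims[i, j] is the cosine similarity between document i of A and document j of B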
Here is one approach, found here. The higher the similarity score, the closer the sentences are (semantically).
# Invoke libraries
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn

# Build functions to compute similarity
def ptb_to_wn(tag):
    if tag.startswith('N'):
        return 'n'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return None

def tagged_to_synset(word, tag):
    wn_tag = ptb_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except IndexError:
        return None

def sentence_similarity(s1, s2):
    s1 = pos_tag(word_tokenize(s1))
    s2 = pos_tag(word_tokenize(s2))
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in s1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in s2]
    # suppress None values
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
    score, count = 0.0, 0
    for synset in synsets1:
        # path_similarity can return None, so filter before taking the max
        scores = [synset.path_similarity(ss) for ss in synsets2]
        scores = [s for s in scores if s is not None]
        if scores:
            score += max(scores)
            count += 1
    # Average the values, guarding against an empty match set
    if count > 0:
        score /= count
    return score

# Compute the symmetric sentence similarity
def symSentSim(s1, s2):
    sss_score = (sentence_similarity(s1, s2) + sentence_similarity(s2, s1)) / 2
    return sss_score

s1 = 'We rented a vehicle to drive to New York'
s2 = 'The car broke down on our jouney'

s1tos2 = symSentSim(s1, s2)
print(s1tos2)
# 0.142509920635
I have been working on a business problem where I need to find the similarity of a new document to existing ones.
I have used various approaches, as below:
1. Bag of words + cosine similarity
2. TF-IDF + cosine similarity
3. Word2Vec + cosine similarity
None of them worked as expected.
But finally I found an approach which works better:
Word2Vec + soft cosine similarity
But the new challenge is that I ended up with multiple documents with the same similarity score.
Most of them are relevant, but a few of them, even though they contain some semantically similar words, are different.
Please suggest how to overcome this issue.
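For reference, one way to break such ties is to blend the soft cosine score with a secondary signal such as plain TF-IDF cosine, so that exact lexical overlap separates documents that soft cosine ranks identically. A sketch of the Word2Vec + soft cosine setup with gensim (API as of gensim 4.x; sentences is a placeholder tokenized corpus):

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, Word2Vec
from gensim.similarities import (SoftCosineSimilarity,
                                 SparseTermSimilarityMatrix,
                                 WordEmbeddingSimilarityIndex)

sentences = [["car", "broke", "down"], ["we", "rented", "a", "vehicle"]]  # placeholder

w2v = Word2Vec(sentences, vector_size=100, min_count=1)
dictionary = Dictionary(sentences)
bow_corpus = [dictionary.doc2bow(doc) for doc in sentences]
tfidf = TfidfModel(bow_corpus)

# Term-similarity matrix built from the word embeddings
termsim_index = WordEmbeddingSimilarityIndex(w2v.wv)
termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)

# Soft cosine similarity of a query against the corpus
index = SoftCosineSimilarity(tfidf[bow_corpus], termsim_matrix)
query = tfidf[dictionary.doc2bow(["rented", "car"])]
scores = index[query]  # one soft cosine score per document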
If the objective is to identify semantic similarity, the following code, sourced from here, helps.
# Invoke libraries
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn

# Build functions
def ptb_to_wn(tag):
    if tag.startswith('N'):
        return 'n'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('R'):
        return 'r'
    return None

def tagged_to_synset(word, tag):
    wn_tag = ptb_to_wn(tag)
    if wn_tag is None:
        return None
    try:
        return wn.synsets(word, wn_tag)[0]
    except IndexError:
        return None

def sentence_similarity(s1, s2):
    s1 = pos_tag(word_tokenize(s1))
    s2 = pos_tag(word_tokenize(s2))
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in s1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in s2]
    # suppress None values
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]
    score, count = 0.0, 0
    for synset in synsets1:
        # path_similarity can return None, so filter before taking the max
        scores = [synset.path_similarity(ss) for ss in synsets2]
        scores = [s for s in scores if s is not None]
        if scores:
            score += max(scores)
            count += 1
    # Average the values, guarding against an empty match set
    if count > 0:
        score /= count
    return score

# Build function to compute the symmetric sentence similarity
def symSentSim(s1, s2):
    sss_score = (sentence_similarity(s1, s2) + sentence_similarity(s2, s1)) / 2
    return sss_score

# Example
s1 = 'We rented a vehicle to drive to Goa'
s2 = 'The car broke down on our jouney'

s1tos2 = symSentSim(s1, s2)
print(s1tos2)
# 0.155753968254