Get the start and end position of found named entities - python-3.x

I am very new to ML and to spaCy in general. I am trying to show named entities from an input text.
This is my method:
import spacy
from collections import defaultdict
from pprint import pprint

def run():
    nlp = spacy.load('en_core_web_sm')
    sentence = "Hi my name is Oliver!"
    doc = nlp(sentence)

    # Threshold for the confidence scores.
    threshold = 0.2
    beams = nlp.entity.beam_parse(
        [doc], beam_width=16, beam_density=0.0001)

    entity_scores = defaultdict(float)
    for beam in beams:
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score

    # Create a dict to store the output.
    ners = defaultdict(list)
    ners['text'] = str(sentence)
    for key in entity_scores:
        start, end, label = key
        score = entity_scores[key]
        if score > threshold:
            ners['extractions'].append({
                "label": str(label),
                "text": str(doc[start:end]),
                "confidence": round(score, 2)
            })
    pprint(ners)
The above method works fine, and will print something like:
'extractions': [{'confidence': 1.0,
                 'label': 'PERSON',
                 'text': 'Oliver'}],
'text': 'Hi my name is Oliver'})
So far so good. Now I am trying to get the actual position of the found named entity, in this case "Oliver".
Looking at the documentation, ent.start_char and ent.end_char are available, but if I use them:
    "start_position": doc.start_char,
    "end_position": doc.end_char
I get the following error:
AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'start_char'
Can someone guide me in the right direction?

If someone has come here wanting a simple answer to the question, I believe the following should do it:
nlp = spacy.load('en_core_web_sm')
sentence = "Hi my name is Oliver!"
doc = nlp(sentence)

for ent in doc.ents:
    print(f"Entity {ent} found with start at {ent.start_char} and end at {ent.end_char}")

So I actually found an answer right after posting this question (typical).
I found that I didn't need to save the information into entity_scores; instead I could just iterate over the actual found entities.
I ended up adding for ent in doc.ents: instead, which gives me access to all the standard spaCy attributes. See below:
ners = defaultdict(list)
ners['text'] = str(sentence)
for beam in beams:
    for score, ents in nlp.entity.moves.get_beam_parses(beam):
        for ent in doc.ents:
            if score > threshold:
                ners['extractions'].append({
                    "label": str(ent.label_),
                    "text": str(ent.text),
                    "confidence": round(score, 2),
                    "start_position": ent.start_char,
                    "end_position": ent.end_char
                })
My entire method ends up looking like this:
def run():
    nlp = spacy.load('en_core_web_sm')
    sentence = "Hi my name is Oliver!"
    doc = nlp(sentence)

    threshold = 0.2
    beams = nlp.entity.beam_parse(
        [doc], beam_width=16, beam_density=0.0001)

    ners = defaultdict(list)
    ners['text'] = str(sentence)
    for beam in beams:
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for ent in doc.ents:
                if score > threshold:
                    ners['extractions'].append({
                        "label": str(ent.label_),
                        "text": str(ent.text),
                        "confidence": round(score, 2),
                        "start_position": ent.start_char,
                        "end_position": ent.end_char
                    })
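For completeness, if you want to keep the per-entity beam scores from the original entity_scores dictionary and still get character offsets, a minimal sketch (reusing doc, threshold and entity_scores from the question's code) can slice the token indices into a Span, which does expose start_char and end_char:
# The (start, end) keys from the beam parse are token indices;
# slicing the Doc with them yields a Span with character offsets.
for (start, end, label), score in entity_scores.items():
    if score > threshold:
        span = doc[start:end]
        print(label, span.text, span.start_char, span.end_char, round(score, 2))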

Related

Token indices sequence length is longer than the specified maximum sequence length for this model (28627 > 512)

I am using Hugging Face's DistilBERT model as the backend for a question-and-answer application. The text I am using to train the model is one very large single text field. Even though the text field is a single string, the punctuation was left in place as a clue for BERT. When I execute the application I am getting the "Token indices sequence length" error. I am using the tokenizer.encode_plus() method to pass the text into the model. I have tried various mechanisms to truncate the input ids to a length <= 512.
I am currently using Windows 10 but I will also be porting the code to a Raspberry Pi 4 platform.
The code is failing at this line:
start_scores, end_scores = model(torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
I am attempting to perform the truncation at this line:
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True).input_ids)
The entire code is here:
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

# globals - set once, used everywhere
tokenizer = None
model = None
context = ''

def establishSettings():
    global tokenizer, model, context
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                    return_token_type_ids=True,
                                                    model_max_length=512)
    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad',
                                                           return_dict=False)
    # context = "Some 1,500 volcanoes are still considered potentially active around the world today 161 of those over 10 percent sit within the boundaries of the United States."

    # get the volcano corpus
    with open('volcanic.corpus', encoding="utf8") as file:
        context = file.read().replace('\n', '')
    print(len(tokenizer(context, truncation=True).input_ids))

def askQuestion(question):
    global tokenizer, model, context
    print("\nQuestion ", question)
    encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True).input_ids)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     attention_mask=torch.tensor([attention_mask]))
    ans_tokens = input_ids[torch.argmax(start_scores): torch.argmax(end_scores) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
    # all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    return answer_tokens

def main():
    # set the global items once
    establishSettings()
    # ask a question
    question = "How many potentially active volcanoes are there in the world today?"
    answer_tokens = askQuestion(question)
    print("answer_tokens: ", answer_tokens)
    if len(answer_tokens) == 0:
        answer = "Sorry, I don't have an answer for that one. Ask me another question about New Mexico volcanoes."
        print(answer)
    else:
        answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
        print("\nFinal Answer : ")
        print(answer_tokens_to_string)

if __name__ == '__main__':
    main()
What is the best way to truncate input_ids to a length <= 512?
Edit this line:
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True).input_ids)
to
encoding = tokenizer.encode_plus(question, tokenizer(context, truncation=True, max_length=512).input_ids)
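As a side note, encode_plus can also be given the raw question and context strings and asked to truncate the pair itself; a minimal sketch of that variant, reusing the tokenizer and context globals from the question's code:
# Sketch: pass question and context as a pair and let the tokenizer
# truncate only the context down to the 512-token limit.
encoding = tokenizer.encode_plus(
    question,
    context,
    max_length=512,
    truncation="only_second",  # keep the question, truncate the context
)
input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]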

POS-tagging a sentence using NLTK

I would like to POS-tag a sentence using the NLTK library in Python.
I am using the following couple of lines of code and it works fine:
>>> text = word_tokenize("And now for something completely different")
>>> nltk.pos_tag(text)
[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
('completely', 'RB'), ('different', 'JJ')]
However, I would like to output the POS tags as properties of a sentence class instance. For instance, for a sentence like "james ate ...", I would like the output to be:
sentence.noun = "james"
sentence.verb = "ate"
sentence.adjective = " … "
Any idea on how my code should change?
In order to do that, you need to create a Sentence class which has those attributes:
import nltk
from nltk import word_tokenize

class Sentence:
    def __init__(self, text):
        self.text = text
        self.noun = None
        self.verb = None
        self.adjective = None

text = "And now for something completely different"
tokens = word_tokenize(text)

s = Sentence(text)
for w, t in nltk.pos_tag(tokens):
    if t == 'NN':
        s.noun = w
    elif t == 'VB':
        s.verb = w
    # etc ...
With this approach you cannot have multiple verbs in your sentence (see the sketch below for one way to keep them all).
Depending on your goal, you can check out spaCy, which offers high-level processing of strings (you can access named entities and noun_chunks, for example), or look into dependency parsing (example here), from which you can extract phrases and determine which verb relates to which subject, etc.
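As a minimal sketch of the multiple-verbs point above (assuming plain NLTK and the Penn Treebank tag set), you could collect every word per tag instead of overwriting a single attribute:
import nltk
from collections import defaultdict
from nltk import word_tokenize

text = "James ate the cake and baked a pie"
tagged = nltk.pos_tag(word_tokenize(text))

# Group words by their POS tag so repeated tags are all kept.
words_by_tag = defaultdict(list)
for word, tag in tagged:
    words_by_tag[tag].append(word)

print(words_by_tag['NN'])   # all singular nouns
print(words_by_tag['VBD'])  # all past-tense verbs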

Spacy ent.label_ cannot define organization

I am using spaCy to analyze terrorism articles, and it is weird that spaCy cannot find organizations such as Fatah. The code is below:
import spacy
from collections import defaultdict, Counter

nlp = spacy.load('en')

def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()

terrorism_articles = read_file_to_list('data/rand-terrorism-dataset.txt')
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]

common_terrorist_groups = [
    'taliban',
    'al - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad al - rafidayn'
]

common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afghanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan'
]

location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    # person or organization
    article_terrorist_groups = [ent.lemma_ for ent in article.ents
                                if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
    article_locations = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE']
    terrorist_common = [ent for ent in article_terrorist_groups if ent in common_terrorist_groups]
    locations_common = [ent for ent in article_locations if ent in common_locations]
    for found_entity in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1

location_entity_dict
I simply get nothing from the file.
Here is the text data link.
Thank you!
I reproduced your example and it looks like you will get empty lists for article_terrorist_groups and terrorist_common, so you won't get the output (that I assume) you require. I changed the model (for my machine) to en_core_web_sm and observed that the ent.label values are different from the ones you are specifying in the if statement in your list comprehensions. I am almost certain this is the case whether you use spacy.load('en') or spacy.load('en_core_web_sm').
You are using if ent.label_ == 'PERSON' or ent.label_ == 'ORG', which is leading to empty lists. You would need to change this in order for it to work. Basically, in your list comprehensions for article_terrorist_groups and terrorist_common, the for loop is iterating over an empty list.
If you look at the output that I posted, you will see that ent.label is not 'PERSON' or 'ORG'.
Note: I would recommend adding print statements (or using a debugger) in your code to check from time to time.
My Code
import spacy
from collections import defaultdict, Counter

nlp = spacy.load('en_core_web_sm')  # I changed this

def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()

terrorism_articles = read_file_to_list('rand-terrorism-dataset.txt')
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]

common_terrorist_groups = [
    'taliban',
    'al - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad al - rafidayn'
]

common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afghanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan'
]

location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    print([(ent.lemma_, ent.label) for ent in article.ents])
Output
[('CHILE', 383), ('the Santiago Binational Center', 383), ('21,000', 394)]
[('ISRAEL', 384), ('palestinian', 381), ('five', 397), ('Masada', 384)]
[('GUATEMALA', 383), ('U.S. Marines', 381), ('Guatemala City', 384)]
(output truncated in the interest of length)
This is because the groups and locations in common_terrorist_groups and common_locations are lowercase, while the matched entity texts in terrorist_common and locations_common are not. So just change if ent in common_terrorist_groups to if ent.lower() in common_terrorist_groups:
common_terrorist_groups = [
    'taliban',
    'al - qaeda',
    'hamas',
    'fatah',
    'plo',
    'bilad al - rafidayn'
]

common_locations = [
    'iraq',
    'baghdad',
    'kirkuk',
    'mosul',
    'afghanistan',
    'kabul',
    'basra',
    'palestine',
    'gaza',
    'israel',
    'istanbul',
    'beirut',
    'pakistan'
]

location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    article_terrorist_cands = [ent.lemma_ for ent in article.ents
                               if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
    article_location_cands = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE']
    terrorist_candidates = [ent for ent in article_terrorist_cands if ent.lower() in common_terrorist_groups]
    location_candidates = [loc for loc in article_location_cands if loc.lower() in common_locations]
    for found_entity in terrorist_candidates:
        for found_location in location_candidates:
            location_entity_dict[found_entity][found_location] += 1
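To inspect the result, a small addition (assuming the loop above has populated location_entity_dict) could print the co-occurrence counts:
# Print how often each group co-occurs with each location.
for group, location_counts in location_entity_dict.items():
    for location, count in location_counts.most_common():
        print(f"{group} - {location}: {count}")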

Python Error : CountVectorizer.fit - AttributeError: 'list' object has no attribute 'lower'

I'm getting an error message from this line: cv.fit(bigdf['Description']).
I noticed that this error started happening after I created the Tokenize and RemoveStopWords functions; the rows returned in the pandas column now look like this:
['PS4', 'SpiderMan'], ['XBOX', 'SpiderMan'], ['XBOX', 'Blackops 4']
whereas before they were whole sentences (and the fit command worked before I created Tokenize(sentence) and RemoveStopWords(sentence)):
['PS4 SpiderMan'], ['XBOX SpiderMan'], ['XBOX Blackops 4']
Is there any way to make fit work with tokenized values, or some way of converting these tokens back into a sentence? I ask because I am using stemming and stopword libraries in Portuguese.
import csv
import os

import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def StemmingPortuguese(sentence):
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word))
    return phrase

def RemoveStopWords(sentence):
    return [word for word in sentence if word not in stopwords]

def TreatPortuguese(sentence):
    return StemmingPortuguese(RemoveStopWords(Tokenize(remove_accents(sentence))))

def Tokenize(sentence):
    sentence = sentence.lower()
    sentence = nltk.word_tokenize(sentence)
    return sentence

trainData = []
for name in files:
    if name.endswith(".txt"):
        # print(os.path.basename(name))
        trainData.append(pd.read_csv(os.path.basename(name),
                                     converters={'Description': TreatPortuguese},
                                     quoting=csv.QUOTE_NONE, delimiter=";",
                                     error_bad_lines=False,
                                     names=["Product", "Description", "Brand", "CategoryID"]))

bigdf = pd.concat(trainData)
print(bigdf['CategoryID'].value_counts())
print(bigdf[:2])

cv = CountVectorizer(analyzer="word")
cv.fit(bigdf['Description'])
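For what it's worth, two common ways to make CountVectorizer accept a column of pre-tokenized values are to join the tokens back into strings, or to pass the token lists through unchanged with a callable analyzer; a minimal sketch, assuming bigdf['Description'] holds lists of tokens:
from sklearn.feature_extraction.text import CountVectorizer

# Option 1: join the token lists back into plain strings.
descriptions_as_text = bigdf['Description'].apply(' '.join)
cv_text = CountVectorizer(analyzer="word")
cv_text.fit(descriptions_as_text)

# Option 2: hand the token lists to the vectorizer as-is.
cv_tokens = CountVectorizer(analyzer=lambda tokens: tokens)
cv_tokens.fit(bigdf['Description'])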

How to get frequencies of topics of NMF in sklearn

I am using NMF to generate topics. My code is shown below. However, I do not know how to get the frequency of each topic. Can anyone help me? Thank you!
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

def fit_tfidf(documents):
    tfidf = TfidfVectorizer(input='content', stop_words='english',
                            use_idf=True, ngram_range=NGRAM_RANGE,
                            lowercase=True, max_features=MAX_FEATURES, min_df=1)
    tfidf_matrix = tfidf.fit_transform(documents.values).toarray()
    tfidf_feature_names = np.array(tfidf.get_feature_names())
    tfidf_reverse_lookup = {word: idx for idx, word in enumerate(tfidf_feature_names)}
    return tfidf_matrix, tfidf_reverse_lookup, tfidf_feature_names

def vectorization(documents):
    if VECTORIZER == 'tfidf':
        vec_matrix, vec_reverse_lookup, vec_feature_names = fit_tfidf(documents)
    if VECTORIZER == 'bow':
        vec_matrix, vec_reverse_lookup, vec_feature_names = fit_bow(documents)
    return vec_matrix, vec_reverse_lookup, vec_feature_names

def nmf_model(vec_matrix, vec_reverse_lookup, vec_feature_names, NUM_TOPICS):
    topic_words = []
    nmf = NMF(n_components=NUM_TOPICS, random_state=3).fit(vec_matrix)
    for topic in nmf.components_:
        word_idx = np.argsort(topic)[::-1][0:N_TOPIC_WORDS]
        topic_words.append([vec_feature_names[i] for i in word_idx])
    return topic_words
If you mean the frequency of each topic inside each document, then:
H = nmf.fit_transform(vec_matrix)
H is a matrix of shape (n_documents, n_topics). Each row represents a document vector in the topic space; in this vector you find the weight that each topic has, which can be read as that topic's importance for the document.
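If instead you want a corpus-level frequency per topic, a minimal sketch (reusing the fitted nmf model and vec_matrix from the question's nmf_model function) could either count how many documents are dominated by each topic or sum the normalized document-topic weights:
import numpy as np

doc_topic = nmf.transform(vec_matrix)        # shape: (n_documents, n_topics)

# Option 1: count documents whose strongest topic is each topic.
dominant = doc_topic.argmax(axis=1)
doc_counts = np.bincount(dominant, minlength=doc_topic.shape[1])

# Option 2: sum normalized weights for a "soft" frequency per topic.
row_sums = np.maximum(doc_topic.sum(axis=1, keepdims=True), 1e-12)
soft_freq = (doc_topic / row_sums).sum(axis=0)

print(doc_counts)
print(soft_freq)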
