How can I keep multi-word names in tokenization together? - scikit-learn

I want to classify documents using TF-IDF features. One way to do it:
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
import nltk
def tokenize(document):
    document = document.lower()
    for punct_char in string.punctuation:
        document = document.replace(punct_char, " ")
    document = re.sub(r'\s+', ' ', document).strip()
    tokens = document.split(" ")
    # Contains more than I want:
    # from spacy.lang.de.stop_words import STOP_WORDS
    stopwords = nltk.corpus.stopwords.words('german')
    tokens = [token for token in tokens if token not in stopwords]
    return tokens
# How I intend to use it
transformer = TfidfVectorizer(tokenizer=tokenize)
example = "Jochen Schweizer ist eines der interessantesten Unternehmen der Welt, hat den Sitz allerdings nicht in der Schweizerischen Eidgenossenschaft."
transformer.fit([example])
# Example of the tokenizer
print(tokenize(example))
One flaw of this tokenizer is that it splits words that belong together, such as "Jochen Schweizer" and "schweizerische Eidgenossenschaft". Lemmatization (word stemming) is also missing. I would like to get the following tokens:
["Jochen Schweizer", "interessantesten", "unternehmen", "Welt", "Sitz", "allerdings", "nicht", "Schweizerische Eidgenossenschaft"]
I know that Spacy can identify those named entities (NER):
import en_core_web_sm # python -m spacy download en_core_web_sm --user
parser = en_core_web_sm.load()
doc = parser(example)
print(doc.ents) # (Jochen Schweizer, Welt, Sitz)
Is there a good way to use spacy to tokenize in a way that keeps the named entity words together?

How about this:
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        retokenizer.merge(doc[ent.start:ent.end])
In fact, you can use spacy to remove punctuation and stop words, and to perform lemmatization too!
import spacy
parser = spacy.load('de_core_news_sm')

def tokenize(text):
    doc = parser(text)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": ent.text})
    return [x.lemma_ for x in doc if not x.is_punct and not x.is_stop]
Example:
>>> text = "Jochen Schweizer ist eines der interessantesten Unternehmen der Welt, hat den Sitz allerdings nicht in der Schweizerischen Eidgenossenschaft."
>>> print(tokenize(text))
[u'Jochen Schweizer', u'interessant', u'Unternehmen', u'Welt', u'Sitz', u'Schweizerischen Eidgenossenschaft']
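To feed this spacy-based tokenizer back into the original TfidfVectorizer, a minimal sketch (assuming the tokenize function above and the example string from the question are in scope) could look like this; lowercasing is switched off so the merged entities keep their capitalization:
from sklearn.feature_extraction.text import TfidfVectorizer

transformer = TfidfVectorizer(tokenizer=tokenize, lowercase=False)
transformer.fit([example])
# Depending on your scikit-learn version this may be get_feature_names() instead
print(transformer.get_feature_names_out())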

Related

Using NLTK, how to search for concepts in a text

I'm a novice to both Python and NLTK. I'm trying to see how certain concepts are represented in a text using NLTK. I have a CSV file which looks like this image.
I want to see how frequent concepts such as Freedom, Courage, and all the others are. I also want to know how to make sure the code looks for bigrams and trigrams. However, the code I have below only allows me to look for a single list of words in a text (Preps.txt like this ).
The output I expect is something like:
Concept = Frequency in text, i.e., Freedom = 10, Courage = 20
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = '/Users/Muhsa/Myfolder/Concepts' #this is where the texts I want to study are located
Concepts = PlaintextCorpusReader(corpus_root, '.*')
Concepts.fileids()
for fileid in Concepts.fileids():
    text3 = Concepts.words(fileid)
from nltk import word_tokenize
from nltk import FreqDist
text3 = Concepts.words(fileid)
preps = open('preps.txt', encoding="utf-8")
rawpreps = preps.read() #preps refers to the file that has the list of words
tokens = word_tokenize(rawpreps)
texty = nltk.Text(tokens)
fdist = nltk.FreqDist(w.lower() for w in text3)
for m in texty:
    print(m + ':', fdist[m], end=' ')
I reorganised your code a little bit. I assumed you had one file per concept's words, and that 'preps.txt' only contained the courage words but not the others.
I hope it is easy to understand.
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk import word_tokenize
from nltk import FreqDist

# Load the courage vocabulary
with open('preps.txt', encoding="utf-8") as file:
    content = file.read()  # preps refers to the file that has the list of words
courage_words = content.split('\n')  # This is a list of words
# load freedom and development words in the same fashion

# Load the corpus
corpus_root = '/Users/Muhsa/Myfolder/Concepts'  # this is where the texts I want to study are located
corpus = PlaintextCorpusReader(corpus_root, '.*')

# Count the number of words in the whole corpus that are also in the courage vocabulary
courage_freq = len([w for w in corpus.words() if w in courage_words])
print('Corpus contains {} courage words'.format(courage_freq))

# For each file in the corpus
for file_id in corpus.fileids():
    # Count the number of words in the file that are also in the courage vocabulary
    file_freq = len([w for w in corpus.words(file_id) if w in courage_words])
    print(file_id, file_freq)
Or better:
# Load the concept vocabularies from different files, into a python dictionary
concept_voc = {}
for file_path in ['courage.txt', 'freedom.txt', 'development.txt']:
    concept_name = file_path.replace('.txt', '')
    with open(file_path) as f:
        voc = f.read().split('\n')
    concept_voc[concept_name] = voc

# Or load the concept vocabularies from a csv file, where each column is one vocabulary and the first line is the concept "name"
import pandas as pd
df = pd.read_csv('to_dict.csv')
concept_voc = df.to_dict('list')
# concept_voc['courage'] returns the list of courage words

# And then for each concept compute the frequency as before
for concept in concept_voc:
    voc = concept_voc[concept]
    corpus_freq = len([w for w in corpus.words() if w in voc])
    print(concept, '=', corpus_freq)
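The question also asks about bigrams and trigrams; the word-by-word counting above only matches single tokens, so multi-word concept entries would never be found. A minimal sketch, assuming each line of a vocabulary file may contain a multi-word phrase and reusing corpus and courage_words from above:
from nltk import ngrams

def concept_frequency(corpus_words, phrases):
    # Turn each vocabulary entry into a tuple of tokens and note which lengths occur
    phrase_tuples = {tuple(p.split()) for p in phrases if p.strip()}
    lengths = {len(t) for t in phrase_tuples}
    count = 0
    for n in lengths:
        # Count every n-gram of the corpus that matches a vocabulary phrase of length n
        count += sum(1 for gram in ngrams(corpus_words, n) if gram in phrase_tuples)
    return count

print('courage =', concept_frequency(list(corpus.words()), courage_words))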

why is token.pos_ not working while others like token.lemma_ etc. are working?

I am trying to segregate the sentences from a text file which have a NOUN and a VERB in them, but token.pos_ is not working, whereas token.lemma_, token.shape_ etc. are working.
Hoping to get some help on this one. Below is the relevant part of the code. Thank you in advance.
import spacy
from spacy.lang.en import English

nlp = spacy.load('en_core_web_sm')
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

doc = nlp(out_sent)
lis = []
new_lis = []
for d in doc.sents:
    lis.append(d)
print(lis)

for sent in lis:
    flag = 0
    for token in sent:
        # token.pos_ doesn't work. token.lemma_, token.shape_ etc. work!
        #print(token.pos_)
        if(token.pos_ == 'NOUN' or token.pos_ == 'VERB'):
            flag = 1
    if(flag == 1):
        new_lis.append(sent)
You should remove the following line from your code snippet:
nlp = English()
because it overwrites the line
nlp = spacy.load('en_core_web_sm')
The latter, en_core_web_sm, has a pretrained POS tagger, but English() is just a "blank" model that doesn't have such a POS tagger built in. The en_core_web_sm model can also split sentences using the dependency parse, so there's no need to add the sentencizer to it.
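A minimal corrected version of the snippet (assuming out_sent holds the input text) might look like this; with the pretrained pipeline, token.pos_ returns real tags instead of empty strings:
import spacy

nlp = spacy.load('en_core_web_sm')  # pretrained tagger and parser; no blank English() override
doc = nlp(out_sent)

# Keep only the sentences that contain at least one noun or verb
new_lis = [sent for sent in doc.sents
           if any(token.pos_ in ('NOUN', 'VERB') for token in sent)]
print(new_lis)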

Python (NLTK) - more efficient way to extract noun phrases?

I've got a machine learning task involving a large amount of text data. I want to identify and extract noun phrases from the training text so I can use them for feature construction later in the pipeline.
I've extracted the kind of noun phrases I wanted from the text, but I'm fairly new to NLTK, so I approached the problem in a way where I can break down each step into the list comprehensions you can see below.
But my real question is: am I reinventing the wheel here? Is there a faster way to do this that I'm not seeing?
import nltk
import pandas as pd

myData = pd.read_excel(r"\User\train_.xlsx")
texts = myData['message']

# Defining a grammar & Parser
NP = r"NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"
chunkr = nltk.RegexpParser(NP)

tokens = [nltk.word_tokenize(i) for i in texts]
tag_list = [nltk.pos_tag(w) for w in tokens]
phrases = [chunkr.parse(sublist) for sublist in tag_list]
leaves = [[subtree.leaves() for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')] for tree in phrases]

# flatten the list of lists of lists of tuples that we've ended up with, into
# just a list of lists of tuples
leaves = [tupls for sublists in leaves for tupls in sublists]

# Join the first two extracted terms into one bigram
nounphrases = [unigram[0][0] + ' ' + unigram[1][0] for unigram in leaves]
Take a look at Why is my NLTK function slow when processing the DataFrame?, there's no need to iterate through all rows multiple times if you don't need intermediate steps.
Using ne_chunk and the solutions from
NLTK Named Entity recognition to a Python list and
How can I extract GPE(location) using NLTK ne_chunk?
[code]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd

def get_continuous_chunks(text, chunk_func=ne_chunk):
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

df = pd.DataFrame({'text':['This is a foo, bar sentence with New York city.',
                           'Another bar foo Washington DC thingy with Bruce Wayne.']})

df['text'].apply(lambda sent: get_continuous_chunks(sent))
[out]:
0 [New York]
1 [Washington, Bruce Wayne]
Name: text, dtype: object
To use the custom RegexpParser:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import pandas as pd

# Defining a grammar & Parser
NP = r"NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"
chunker = RegexpParser(NP)

def get_continuous_chunks(text, chunk_func=ne_chunk):
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for subtree in chunked:
        if type(subtree) == Tree:
            current_chunk.append(" ".join([token for token, pos in subtree.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk

df = pd.DataFrame({'text':['This is a foo, bar sentence with New York city.',
                           'Another bar foo Washington DC thingy with Bruce Wayne.']})

df['text'].apply(lambda sent: get_continuous_chunks(sent, chunker.parse))
[out]:
0 [bar sentence, New York city]
1 [bar foo Washington DC thingy, Bruce Wayne]
Name: text, dtype: object
I suggest referring to this prior thread:
Extracting all Nouns from a text file using nltk
They suggest using TextBlob as the easiest way to achieve this (if not the most efficient in terms of processing), and the discussion there addresses your question.
from textblob import TextBlob
txt = """Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages."""
blob = TextBlob(txt)
print(blob.noun_phrases)
The above methods didn't give me the required results. Following is the function I would suggest:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import RegexpParser
from nltk import Tree
import re

def get_noun_phrases(text):
    pos = pos_tag(word_tokenize(text))
    count = 0
    half_chunk = ""
    for word, tag in pos:
        if re.match(r"NN.*", tag):
            count += 1
            if count >= 1:
                half_chunk = half_chunk + word + " "
        else:
            half_chunk = half_chunk + "---"
            count = 0
    half_chunk = re.sub(r"-+", "?", half_chunk).split("?")
    half_chunk = [x.strip() for x in half_chunk if x != ""]
    return half_chunk
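As a quick usage sketch (the exact phrases you get back depend on the POS tagger, since the function simply collects consecutive NN* tokens):
sentence = "I've got a machine learning task involving a large amount of text data."
# Prints the runs of consecutive noun tokens found by the tagger
print(get_noun_phrases(sentence))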
The Constituent-Treelib library, which can be installed via pip install constituent-treelib, does exactly what you are looking for in a few lines of code. In order to extract noun (or any other) phrases, perform the following steps.
from constituent_treelib import ConstituentTree
# First, we have to provide a sentence that should be parsed
sentence = "I've got a machine learning task involving a large amount of text data."
# Then, we define the language that should be considered with respect to the underlying models
language = ConstituentTree.Language.English
# You can also specify the desired model for the language ("Small" is selected by default)
spacy_model_size = ConstituentTree.SpacyModelSize.Medium
# Next, we must create the necessary NLP pipeline.
# If you wish, you can instruct the library to download and install the models automatically
nlp = ConstituentTree.create_pipeline(language, spacy_model_size) # , download_models=True
# Now, we can instantiate a ConstituentTree object and pass it the sentence and the NLP pipeline
tree = ConstituentTree(sentence, nlp)
# Finally, we can extract the phrases
tree.extract_all_phrases()
Result...
{'S': ["I 've got a machine learning task involving a large amount of text data ."],
'PP': ['of text data'],
'VP': ["'ve got a machine learning task involving a large amount of text data",
'got a machine learning task involving a large amount of text data',
'involving a large amount of text data'],
'NML': ['machine learning'],
'NP': ['a machine learning task involving a large amount of text data',
'a machine learning task',
'a large amount of text data',
'a large amount',
'text data']}
If you only want the noun phrases, just pick them out with tree.extract_all_phrases()['NP']:
['a machine learning task involving a large amount of text data',
'a machine learning task',
'a large amount of text data',
'a large amount',
'text data']

Print only topic name using LDA with python

I need to print only the topic word (just one word), but the output contains some numbers, and I cannot get just the topic word like "Happy". My string is "Happy"; why does it show "happi"?
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import string
tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
p_stemmer = PorterStemmer()
fr = open('Happy DespicableMe.txt','r')
doc_a = fr.read()
fr.close()
doc_set = [doc_a]
texts = []
for i in doc_set:
raw = i.lower()
tokens = tokenizer.tokenize(raw)
stopped_tokens = [i for i in tokens if not i in en_stop]
stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
texts.append(stemmed_tokens)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1, id2word = dictionary, passes=20)
rafa = ldamodel.show_topics(num_topics=1, num_words=1, log=False , formatted=False)
print(rafa)
It only shows [(0, '0.142*"happi"')]. But I want to print only the word.
You are running into a misunderstanding:
Stemming extracts the stem of a word through a series of transformation rules that strip off common suffixes and prefixes. The resulting stem is not necessarily an actual English word. The purpose of stemming is to normalize words for comparison, e.g.
stem_word('happy') == stem_word('happier')
What you need is a lemmatizer (e.g. nltk.stem.wordnet) to look up lemmas. Lemmas differ from stems in that a lemma is a canonical form of the word, while a stem may not be a real word.
After you have installed the wordnet corpus you can use it like this:
from nltk.corpus import wordnet
syns = wordnet.synsets("happier")
print(syns[0].lemmas()[0].name())
Output:
happy
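Applied to the LDA pipeline from the question, a minimal sketch is to lemmatize instead of stem in the preprocessing loop (this assumes the wordnet corpus is downloaded and that tokenizer, en_stop and doc_set from the question are in scope):
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
texts = []
for i in doc_set:
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # Lemmatize instead of stem, so "happy" stays "happy" rather than becoming "happi"
    lemmatized_tokens = [lemmatizer.lemmatize(t) for t in stopped_tokens]
    texts.append(lemmatized_tokens)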

document clustering in python

I am new to both Python and scikit-learn. I am going to cluster a bunch of text files (bodies of news articles), and I am using the following code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import nltk, sklearn, string, os
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
# Preprocessing text with NLTK package
token_dict = {}
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
###########################################################################
# Loading and preprocessing text data
print("\n Loading text dataset:")
path = 'n'
for subdir, dirs, files in os.walk(path):
    for i, f in enumerate(files):
        if f != '.DS_Store':
            file_path = subdir + os.path.sep + f
            shakes = open(file_path, 'r')
            text = shakes.read()
            lowers = text.lower()
            # strip punctuation
            no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))
            token_dict[f] = no_punctuation
###########################################################################
true_k = 3 # *
print("\n Performing stemming and tokenization...")
vectorizer = TfidfVectorizer(tokenizer=tokenize, encoding='latin-1',
stop_words='english')
X = vectorizer.fit_transform(token_dict.values())
print("n_samples: %d, n_features: %d" % X.shape)
print()
###############################################################################
# Do the actual clustering
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
y=km.fit(X)
print(km)
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()
This code gets the top words per cluster. But how can I know which original text files belong to cluster0, cluster1 or cluster2?
To explain a bit more: you can store the cluster allocations using the following:
clusters = km.labels_.tolist()
This list will be ordered the same as the dict you passed to your vectorizer.
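For example, a minimal sketch (reusing token_dict from the question and the clusters list above) to see which file went to which cluster:
# token_dict.keys() and token_dict.values() iterate in the same order,
# so each file name lines up with the row that was vectorized from it
for filename, cluster_id in zip(token_dict.keys(), clusters):
    print(filename, '->', cluster_id)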
I just put together a guide to document clustering you might find helpful. Let me know if I can explain anything in more detail: http://brandonrose.org/clustering
