Lemmatize df column - python-3.x

I am trying to lemmatize the content in a df, but the function I wrote isn't working. Before lemmatizing, the data in the column looked like this.
Then I ran the following code:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Init the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

df['content'] = df["content"].apply(lemmatize_text)
print(df.content)
Now the content column looks like this:
I'm not sure what I did wrong; I am just trying to lemmatize the data in the content column. Any help would be greatly appreciated.

You are lemmatizing each character instead of each word. Your function should look like this instead:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split(' ')])
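For reference, a minimal usage sketch with the corrected function; the sample DataFrame below is illustrative, not from the question:

import pandas as pd
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    # Split on spaces, lemmatize each word, and join back into a single string
    return ' '.join(lemmatizer.lemmatize(w) for w in text.split(' '))

df = pd.DataFrame({'content': ['the cats were running', 'dogs bark loudly']})
df['content'] = df['content'].apply(lemmatize_text)
print(df['content'].tolist())
# ['the cat were running', 'dog bark loudly']

Note that lemmatize defaults to the noun part of speech, so verb forms such as "running" are only reduced if you also pass a POS tag.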

Related

How to Apply pos_tag to All Words in a Column in Pandas Dataframe

I am trying to apply PoS tagging to a column in a pandas DataFrame that contains multiple words in each row.
data = {
    "col1": ['apartments expectations better', 'amazing amazingly resolver better',
             'been says amazingly', 'places was', 'airports really beautiful!', 'stopped you']
}
df = pd.DataFrame(data)
My current code only applies PoS tagging to the first word in each row.
import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        #if tag.startswith("NN"):
        if tag.startswith("N"):
            return wnl.lemmatize(word, pos='n')
        #elif tag.startswith('VB'):
        elif tag.startswith('V'):
            return wnl.lemmatize(word, pos='v')
        #elif tag.startswith('JJ'):
        elif tag.startswith('J'):
            return wnl.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            return wnl.lemmatize(word, pos='r')
        else:
            return word
df['col1'] = df['col1'].apply(str).map(lemmatize_all)
print(df)
Current output:
col1
0 apartment
1 amaze
2 be
3 place
4 airport
5 stop
But I want to apply it to every word in each row, not just the first one.
Any help would be appreciated.
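Not from the original thread, but a minimal sketch of one way to lemmatize every word: map each Penn Treebank tag to a WordNet POS, lemmatize token by token, and join the results back into a sentence (the helper name penn_to_wordnet is illustrative):

import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

wnl = WordNetLemmatizer()

def penn_to_wordnet(tag):
    # Map a Penn Treebank tag prefix to a WordNet POS constant
    if tag.startswith('N'):
        return 'n'
    elif tag.startswith('V'):
        return 'v'
    elif tag.startswith('J'):
        return 'a'
    elif tag.startswith('R'):
        return 'r'
    return None

def lemmatize_all(sentence):
    # Lemmatize every token instead of returning after the first one
    lemmas = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        pos = penn_to_wordnet(tag)
        lemmas.append(wnl.lemmatize(word, pos=pos) if pos else word)
    return ' '.join(lemmas)

df = pd.DataFrame({"col1": ['apartments expectations better', 'stopped you']})
df['col1'] = df['col1'].apply(lemmatize_all)
print(df)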

PhraseMatcher to match in a different token attribute

We would like to match a set of phrases using PhraseMatcher. However, we would like to match not only on the verbatim text but also on a normalized version of the input, for instance lower-cased and with the accents removed.
We have tried to add a custom attribute to the Token and use it in the init of the PhraseMatcher, but it did not work.
We could transform the text using a custom pipeline, but we want to keep the original text so we can still use other spaCy components.
def deaccent(text):
    ...
    return modified_text

def get_normalization(doc):
    return deaccent(doc.text)

Token.set_extension('get_norm', getter=get_normalization)

patterns_ = [{"label": "TECH", "pattern": "java"}]
ruler = EntityRuler(nlp, phrase_matcher_attr="get_norm")
ruler.add_patterns(patterns_)
nlp.add_pipe(ruler)
What is the way to do this?
Since EntityRuler is based on PhraseMatcher, I copy here a working example with spaCy v2.2.0. Follow the comments to see how to work with the "NORM" attribute of the tokens.
At the end, you can see how the word "FÁCIL" matches the pattern "facil", since it has been normalized.
import re
import spacy
from unicodedata import normalize
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.lang.es import Spanish

# Define a custom pipeline component that overwrites the "norm" attribute of tokens
class Deaccentuate(object):
    def __init__(self, nlp):
        self._nlp = nlp

    def __call__(self, doc):
        for token in doc:
            token.norm_ = self.deaccent(token.lower_)  # write the norm_ attribute!
        return doc

    @staticmethod
    def deaccent(text):
        """Remove accentuation from the given string"""
        text = re.sub(
            r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
            normalize("NFD", text), 0, re.I
        )
        return normalize("NFC", text)

nlp = Spanish()

# Add the component to the pipeline
custom_component = Deaccentuate(nlp)
nlp.add_pipe(custom_component, first=True, name='normalizer')

# Initialize the matcher with the patterns to be matched
matcher = PhraseMatcher(nlp.vocab, attr="NORM")  # match on the NORM attribute of tokens
patterns_ = nlp.pipe(['facil', 'dificil'])
matcher.add('MY_ENTITY', None, *patterns_)

# Run an example and print the results
doc = nlp("esto es un ejemplo FÁCIL")
matches = matcher(doc)
for match_id, start, end in matches:
    span = Span(doc, start, end, label=match_id)
    print("MATCHED: " + span.text)
This bug was fixed in release v2.1.8
https://github.com/explosion/spaCy/issues/4002
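To get the same behaviour through EntityRuler, as the question asks, a sketch along the same lines (assuming spaCy v2.2; the simple normalizer below also strips the tilde from ñ, unlike the regex above) is to point phrase_matcher_attr at the built-in NORM attribute:

import unicodedata
from spacy.lang.es import Spanish
from spacy.pipeline import EntityRuler

def deaccent_lower(text):
    # Lowercase and drop combining accent marks (also affects the tilde in ñ)
    nfd = unicodedata.normalize("NFD", text.lower())
    return unicodedata.normalize("NFC", "".join(c for c in nfd if not unicodedata.combining(c)))

def normalizer(doc):
    # Overwrite each token's NORM with its lowercased, de-accented form
    for token in doc:
        token.norm_ = deaccent_lower(token.text)
    return doc

nlp = Spanish()
nlp.add_pipe(normalizer, first=True, name="normalizer")

# The EntityRuler's internal PhraseMatcher compares patterns against NORM
ruler = EntityRuler(nlp, phrase_matcher_attr="NORM")
ruler.add_patterns([{"label": "TECH", "pattern": "java"}])
nlp.add_pipe(ruler)

doc = nlp("me encanta programar en JAVA")
print([(ent.text, ent.label_) for ent in doc.ents])  # expected: [('JAVA', 'TECH')]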

Morphological analysis of words with MRJob and Pymorphy2

Can anyone help with MRJob and Pymorphy2? I am new to Python and Hadoop. I roughly understand how to tokenize the text, but I cannot figure out how to morphologically analyze the resulting tokens with Pymorphy2. Maybe I am doing something obviously wrong, but I do not see it.
This is my code:
from mrjob.job import MRJob
import re, pymorphy2

morph = pymorphy2.MorphAnalyzer()
WORD_RE = re.compile(r"[\w']+")

class MRMorphWord(MRJob):
    def mapper(self, _, line):
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def reducer(self, _, word):
        for i in word:
            p = morph.parse(word)[0]
            yield p

if __name__ == '__main__':
    MRMorphWord.run()
Here is the error message:
parse
word_lower = word.lower()
AttributeError: 'generator' object has no attribute 'lower'
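There is no answer recorded here, but the traceback points at the reducer's arguments: in MRJob the first argument after self is the key (the lowercased word) and the second is a generator of the counts emitted by the mapper, so morph.parse is being called on that generator. A minimal sketch of a corrected job (an assumption, not from the thread):

from mrjob.job import MRJob
import re
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
WORD_RE = re.compile(r"[\w']+")

class MRMorphWord(MRJob):
    def mapper(self, _, line):
        for word in WORD_RE.findall(line):
            yield word.lower(), 1

    def reducer(self, word, counts):
        # word is the key; counts is a generator of 1s from the mapper
        lemma = morph.parse(word)[0].normal_form
        yield word, (lemma, sum(counts))

if __name__ == '__main__':
    MRMorphWord.run()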

How to extract the last names from text file using nltk in python

import re
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
from nltk.corpus import wordnet

inputfile = open('file.txt', 'r')
String = inputfile.read()

def last_name(resume_text):
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    a_list = []
    for sentence in tokenized_sentences:
        a_list = (sentence.split())
    s1 = a_list[1:]
    sentence1 = ''.join(s1)
    tokenized_sentences = nltk.sent_tokenize(sentence1)
    for sentence in tokenized_sentences:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                chunk = chunk[0]
                (name, tag) = chunk
                if tag == 'NOUN':
                    return name

if __name__ == '__main__':
    lastname = last_name(String)
    print(lastname)
I want to extract the last name from a resume. The code returns the first name correctly, but the last name it returns is wrong.
How can I solve this issue?
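Not part of the original thread, but one hedged approach: instead of returning on the first PERSON token, collect every token inside PERSON chunks and take the last one. The helper name person_tokens and the sample text are illustrative:

import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def person_tokens(text):
    # Collect every token that falls inside a PERSON named-entity chunk
    names = []
    for sentence in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence))):
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                names.extend(token for token, tag in chunk.leaves())
    return names

text = "John Smith is a software engineer with five years of experience."
names = person_tokens(text)
print(names[0] if names else None)   # first name, e.g. 'John'
print(names[-1] if names else None)  # last name, e.g. 'Smith'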

Getting <generator object <genexpr> at 0x1193417d8> as output

#Reading files with txt extension
def get_sentences():
    for root, dirs, files in os.walk("/Users/Documents/test1"):
        for file in files:
            if file.endswith(".txt"):
                x_ = codecs.open(os.path.join(root, file), "r", "utf-8-sig")
                for lines in x_.readlines():
                    yield lines

formoreprocessing = get_sentences()

#Tokenizing sentences of the text files
from nltk.tokenize import sent_tokenize
for i in formoreprocessing:
    raw_docs = sent_tokenize(i)
    tokenized_docs = [sent_tokenize(i) for sent in raw_docs]

'''Removing Stop Words'''
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = set(stopwords.words("English"))

def strip_stopwords(sentence):
    return ' '.join(word for word in sentence.split() if word not in stopset)

stopword_removed_sentences = (strip_stopwords(sentence) for sentence in raw_docs)
print(stopword_removed_sentences)
The above code is not printing what it is supposed to. Instead, it prints:
<generator object <genexpr> at 0x1193417d8>
What is the mistake here? I am using Python 3.5.
Try print(list(stopword_removed_sentences)). This turns the generator into a list before printing it.
This is the final answer; it provides the best result for the problem I mentioned in my previous comment.
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(''.join(formoreprocessing))
#print(raw_docs)
tokenized_docs = [sent_tokenize(''.join(formoreprocessing)) for sent in raw_docs]

#Removing Stop Words
stopword_removed_sentences = []
from nltk.corpus import stopwords
stopset = set(stopwords.words("English"))

def strip_stopwords(sentence):
    return ' '.join(word for word in sentence.split() if word not in stopset)

stopword_removed_sentences = (strip_stopwords(sentence) for sentence in raw_docs)
print(list(stopword_removed_sentences))
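One caveat with this version: formoreprocessing is a generator, so the second ''.join(formoreprocessing) on the tokenized_docs line consumes an already-exhausted generator and yields empty documents. A minimal sketch of a tighter flow, assuming the get_sentences() generator from the question, reads the text once:

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

stopset = set(stopwords.words("english"))

def strip_stopwords(sentence):
    # Drop any whitespace-delimited token that is a stopword
    return ' '.join(word for word in sentence.split() if word not in stopset)

# Consume the generator exactly once and keep the text as a string
text = ''.join(get_sentences())
raw_docs = sent_tokenize(text)
stopword_removed_sentences = [strip_stopwords(sentence) for sentence in raw_docs]
print(stopword_removed_sentences)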
