I am working on an NLP assignment and loaded the GloVe vectors provided by Gensim:
import gensim.downloader
glove_vectors = gensim.downloader.load('glove-twitter-25')
I am trying to get the word embedding for each word in a sentence, but some of them are not in the vocabulary.
What is the best way to handle this with the Gensim API?
Thanks!
Load the model:
import gensim.downloader as api
model = api.load("glove-twitter-25") # load glove vectors
# model.most_similar("cat") # show words similar to 'cat'
There is a very simple way to find out whether a word exists in the model's vocabulary:
print('Word exists' if word in model else 'Word does not exist')  # membership check works directly on the loaded KeyedVectors
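For example, a minimal sketch that collects vectors only for the in-vocabulary words of a sentence (the sentence and its OOV token here are made up):
sentence = "hello world frabjous"   # hypothetical sentence; 'frabjous' is likely out of vocabulary
vectors = [model[w] for w in sentence.split() if w in model]   # skip OOV tokens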
Apart from that, I used the following logic to create a 25-dimensional sentence embedding from N tokens:
import numpy as np
from Levenshtein import ratio as lev_ratio

def vocab_check(model, word):
    """For an out-of-vocabulary word, find the closest in-vocabulary word
    by edit distance and return (match_word, match_ratio)."""
    match_word, match_ratio = '', 0.
    # Note: scanning the full vocabulary is slow for large models such as glove-twitter-25.
    for vocab_word in model.key_to_index:   # gensim >= 4.0; on 3.x iterate model.vocab instead
        ratio = lev_ratio(word, vocab_word)
        if ratio > match_ratio:
            match_word, match_ratio = vocab_word, ratio
    return match_word, match_ratio

def sentence2vector(model, sent, dim=25):
    """Weighted sum of word vectors: in-vocabulary words get weight 1.0,
    OOV words are replaced by their closest match and down-weighted by the match ratio."""
    emb, weights = [], []
    for w in (t.strip() for t in sent.split(' ') if t.strip()):
        if w in model:
            emb.append(model[w])
            weights.append(1.)
        else:
            match_word, match_ratio = vocab_check(model, w)
            if match_word:
                emb.append(model[match_word])
                weights.append(match_ratio)
    if len(emb) == 0:
        return np.zeros(dim, dtype=np.float16)
    sent_vec = np.dot(weights, emb)
    return sent_vec.astype("float16")
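A quick usage check with the model loaded above (the sentence is arbitrary):
vec = sentence2vector(model, "i love nlp")
print(vec.shape)   # (25,)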
I'm working with TensorFlow 2, but I'm facing an error when importing tokenization, as shown below. I have already tried pip3-installing several TensorFlow versions (1.x through 2.x) as well as the tokenizer and tokenization packages, but it still doesn't work. Could you give me advice on how to solve this error?
Error message:
Traceback (most recent call last):
File "_count_tokenization.py", line 26, in <module>
my_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path)
AttributeError: module 'tokenization' has no attribute 'FullTokenizer'
import tokenization
import codecs
import numpy as np
vocab_path = "./model_ch/vocab.txt"
max_seq_length = 128
file0 = "./task/message.tsv"
f0 = codecs.open(file0, "r", "utf-8")
lines = f0.readlines()
f0.close()
len_file = len(lines)
count = np.zeros([len_file])
count0 = np.zeros([len_file])
my_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path)
#file1 = "./task_data_ch/%s_count.tsv" % filename
file1 = "./task/message_count.tsv"
f1 = codecs.open(file1, "w", "utf-8")
f1.write("%s\t%s\t%s\r\n" % ("label","count","count_truncated"))
for i in range(1, len_file):
    a = lines[i]
    a = a.split("\t")
    text = a[1]
    token = my_tokenizer.tokenize(text)
    print(token)
    count[i] = len(token) + 2  # for [CLS] and [SEP]
    if count[i] > max_seq_length:
        count0[i] = max_seq_length
    else:
        count0[i] = count[i]
    f1.write("%s\t%s\t%s\n" % (i-1, int(count[i]), int(count0[i])))
sum0 = int(np.sum(count0))
sum1 = int(np.sum(count))
print(sum0, sum1)
print(int(len_file-1))
f1.write("Total: %s, %s" % (sum1,sum0))
f1.close()
The result of pip3 list:
tensorboard 2.2.1
tensorboard-plugin-wit 1.6.0.post3
tensorflow-estimator 2.2.0
tokenization 1.0.7
tokenizer 2.0.5
The piece of code below will enable TF 2.x for you.
# Colab has two versions of TensorFlow installed: a 1.x version and a 2.x version.
# Colab currently uses TF 1.x by default.
# To enable TF 2.x, execute the following code:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)
Then import NLTK and download the resources you need (stopwords, tokenizers, etc.):
import nltk
nltk.download("popular")
from nltk.tokenize import sent_tokenize, word_tokenize
If you want to split a paragraph into multiple sentences, that can be done with sent_tokenize:
tokens = sent_tokenize("Your paragraphs or multiple sentences")
And the code below will tokenize a sentence into words:
text = "I love NLP and I will learn NLP in 2 months"
words = nltk.word_tokenize(text)
words
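Putting the two together, a small made-up example that splits a paragraph into sentences and then into word tokens:
paragraph = "I love NLP. I will learn NLP in 2 months."
for sent in sent_tokenize(paragraph):
    print(word_tokenize(sent))   # one token list per sentence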
Multilabel classification
I am trying to predict a multilabel classification using scikit-learn/pandas/OneVsRestClassifier/logistic regression. Building and evaluating the model works but attempting to classify new sample text does not.
Scenario 1:
I built and evaluated a model and saved it as sample.pkl. After restarting my kernel, I loaded the saved model (sample.pkl), but making a prediction on sample text gives this error:
NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.
Inference code:
import os
import json
import re
import csv
import pickle
import collections
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import f1_score                      # performance metric
from sklearn.multiclass import OneVsRestClassifier        # binary relevance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
stop_words = set(stopwords.words('english'))
def cleanHtml(sentence):
    ''' remove HTML tags '''
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

def cleanPunc(sentence):
    ''' clean the text of punctuation and special characters '''
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]', r' ', cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n", " ")
    return cleaned

def keepAlpha(sentence):
    """ keep only alphabetic characters """
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def remove_stopwords(text):
    """ remove stop words """
    no_stopword_text = [w for w in text.split() if w not in stop_words]
    return ' '.join(no_stopword_text)
test1 = pd.read_csv("C:\\Users\\abc\\Downloads\\test1.csv")
test1.columns
test1.head()
siNo  plot                               movie_name        genre_new
1     The story begins with Hannah...    sing              [drama, teen]
2     Debbie's favorite band is Dream..  the bigeest fan   [drama]
3     This story of a Zulu family is ..  come back,africa  [drama, Documentary]
Getting the error
This is where I get the error, when I run inference on sample text:
def infer_tags(q):
    q = cleanHtml(q)
    q = cleanPunc(q)
    q = keepAlpha(q)
    q = remove_stopwords(q)
    multilabel_binarizer = MultiLabelBinarizer()
    tfidf_vectorizer = TfidfVectorizer()
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = clf.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)

for i in range(5):
    print(i)
    k = test1.sample(1).index[0]
    print("Movie: ", test1['movie_name'][k], "\nPredicted genre: ", infer_tags(test1['plot'][k]))
    print("Actual genre: ", test1['genre_new'][k], "\n")
Solved
I solved it by also saving the TF-IDF vectorizer and the MultiLabelBinarizer to pickle files:
import pickle

pickle.dump(tfidf_vectorizer, open("tfidf_vectorizer.pickle", "wb"))
pickle.dump(multilabel_binarizer, open("multibinirizer_vectorizer.pickle", "wb"))

vectorizer = pickle.load(open('/abc/downloads/tfidf_vectorizer.pickle', 'rb'))
multilabel_binarizer = pickle.load(open('/abc/downloads/multibinirizer_vectorizer.pickle', 'rb'))

def infer_tags(q):
    q = cleanHtml(q)
    q = cleanPunc(q)
    q = keepAlpha(q)
    q = remove_stopwords(q)
    q_vec = vectorizer.transform([q])
    q_pred = rf_model.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)
I went through the link below and got the solution:
How do I store a TfidfVectorizer for future use in scikit-learn?
This happens because you are only dumping the classifier into the pickle and not the vectorizer.
During inference, when you call tfidf_vectorizer = TfidfVectorizer(), you create a new vectorizer that has not been fitted on the training vocabulary, which is what raises the error.
What you should do is dump both the classifier and the vectorizer to pickle, and load them both during inference.
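A minimal sketch of that pattern (the file names are placeholders; tfidf_vectorizer and clf refer to the objects fitted during training):
import pickle

# after training: persist the fitted vectorizer and classifier
with open("tfidf_vectorizer.pickle", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)
with open("classifier.pickle", "wb") as f:
    pickle.dump(clf, f)

# at inference time: load them back instead of creating new, unfitted objects
with open("tfidf_vectorizer.pickle", "rb") as f:
    tfidf_vectorizer = pickle.load(f)
with open("classifier.pickle", "rb") as f:
    clf = pickle.load(f)

q_vec = tfidf_vectorizer.transform(["some new plot text"])   # hypothetical input
q_pred = clf.predict(q_vec)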
I have been trying to remove stopwords using Python 3, but my code does not seem to work. I want to know how to remove stop words from the list below. The example structure is as follows:
from nltk.corpus import stopwords
word_split1 = [['amazon', 'brand', '- ', 'solimo', 'premium', 'almonds', ',', '250g', 'by', 'solimo'],
               ['hersheys', 'cocoa', 'powder', ',', '225g', 'by', 'hersheys'],
               ['jbl', 't450bt', 'extra', 'bass', 'wireless', 'on-ear', 'headphones', 'with', 'mic', 'white', 'by', 'jbl', 'and']]
I tried the code below to remove the stop words, and I would appreciate it if anyone could help me rectify the issue:
stop_words = set(stopwords.words('english'))
filtered_words=[]
for i in word_split1:
    if i not in stop_words:
        filtered_words.append(i)
I get this error:
Traceback (most recent call last):
File "<ipython-input-451-747407cf6734>", line 3, in <module>
if i not in stop_words:
TypeError: unhashable type: 'list'
You have a list of lists.
Try:
word_split1=[['amazon','brand','- ','solimo','premium','almonds',',','250g','by','solimo'],['hersheys','cocoa', 'powder', ',', '225g', 'by', 'hersheys'],['jbl','t450bt','extra','bass','wireless','on-ear','headphones','with','mic','white','by','jbl','and']]
stop_words = set(stopwords.words('english'))
filtered_words=[]
for i in word_split1:
    for j in i:
        if j not in stop_words:
            filtered_words.append(j)
or flatten your list.
Ex:
from itertools import chain
word_split1=[['amazon','brand','- ','solimo','premium','almonds',',','250g','by','solimo'],['hersheys','cocoa', 'powder', ',', '225g', 'by', 'hersheys'],['jbl','t450bt','extra','bass','wireless','on-ear','headphones','with','mic','white','by','jbl','and']]
stop_words = set(stopwords.words('english'))
filtered_words=[]
for i in chain.from_iterable(word_split1):
    if i not in stop_words:
        filtered_words.append(i)
or
filtered_words = [i for i in chain.from_iterable(word_split1) if i not in stop_words]
Your list is a list of lists (a 2D structure), and checking a list against a set requires hashing it, which fails. Convert it to a flat 1D list first and your code will work fine:
word_split1 = [j for x in word_split1 for j in x]
stop_words = set(stopwords.words('english'))
filtered_words=[]
for i in word_split1:
    if i not in stop_words:
        filtered_words.append(i)
I have parsed 30 Excel files and created a pandas dataframe. I have tokenized the words, taken out stop words, and made bigrams. However, when I try to lemmatize, it gives me this error: TypeError: unhashable type: 'list'
Here's my code:
import gensim
from gensim.utils import simple_preprocess

# Use simple_preprocess to clean up data and tokenize
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

data_words = list(sent_to_words(data))

# Define function for removing stopwords
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# Define function for bigrams
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Define function for lemmatizing
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma(word):
    return WordNetLemmatizer().lemmatize(word)

# Lemmatize words
data_lemmatized = get_lemma(data_words_bigrams)
This is exactly where I get the error. How should I adjust my code to resolve this issue? Thank you in advance.
As suggested, here are the first few lines of the dataframe (df.head()):
[dataframe snapshot]
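WordNetLemmatizer().lemmatize() expects a single string, but data_words_bigrams is a list of token lists, which is why the call fails. A minimal sketch of applying the lemmatizer token by token (reusing the names from the code above) would be:
lemmatizer = WordNetLemmatizer()
data_lemmatized = [[lemmatizer.lemmatize(word) for word in doc] for doc in data_words_bigrams]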