How to use the same vocabulary to vectorize a new comment? - scikit-learn

Hello, I am working with sklearn. I have a list that looks as follows:
list = ["comment1","comment2",...,"commentN"]
Then I built a vectorizer to build a matrix:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features, stop_words=stpw)
I used fit_transform to vectorize this list
tf = tf_vectorizer.fit_transform(list)
I built 8 clusters from the data:
kmeans = KMeans(n_clusters=8, random_state=0).fit(tf)
and finally I used the predict method to produce a label for every vector:
y_pred = kmeans.predict(tf)
Now I have a new comment that I would like to associate with one of the clusters from my previous data:
comment = ["newComment"]
I first tried vectorizing the comment and then using predict, as follows:
newVec = CountVectorizer(vocabulary=tf.vocabulary_)
testComment = newVec.fit_transform(comment)
y_pred_Comment = kmeans.predict(comment)
print(y_pred_Comment)
The problem is that I am getting errors, since this new vectorizer called newVec is not taking all of my previous vocabulary.
I would appreciate help vectorizing my new comment using the same model produced previously by tf_vectorizer.fit_transform(list).
Error associated:
<ipython-input-32-69c8879d551a> in <module>()
129
130
--> 131 newVec = CountVectorizer(vocabulary=tf.vocabulary_)
132
133 comment = ["newComment"]
C:\Program Files\Anaconda3\lib\site-packages\scipy\sparse\base.py in __getattr__(self, attr)
557 return self.getnnz()
558 else:
--> 559 raise AttributeError(attr + " not found")
560
561 def transpose(self, axes=None, copy=False):
AttributeError: vocabulary_ not found

I think you've run into a slight misunderstanding of how models are used in scikit-learn. You train a model on your training set and then apply that same fitted model to your testing set. So in your example (but using the newsgroups data instead):
from sklearn import datasets, feature_extraction, neighbors, cluster
newsgroups_train = datasets.fetch_20newsgroups(subset='train').data[:200]
newsgroups_test = datasets.fetch_20newsgroups(subset='test').data[:100]
tf_vectorizer = feature_extraction.text.CountVectorizer()
tf_train = tf_vectorizer.fit_transform(newsgroups_train)
kmeans = cluster.KMeans(n_clusters=8, random_state=0).fit(tf_train)
y_pred = kmeans.predict(tf_train)
Now that we have a vectorizer and a clustering model, we can apply them to new data:
tf_test = tf_vectorizer.transform(newsgroups_test)
y_pred_test = kmeans.predict(tf_test)
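In terms of the original question, the key point is to reuse the already-fitted tf_vectorizer and call transform (not fit_transform) on the new comment. A minimal sketch, assuming tf_vectorizer and kmeans are the fitted objects from the question:
comment = ["newComment"]
# transform() reuses the vocabulary learned during fit_transform,
# so the new comment lands in the same feature space as the training data
testComment = tf_vectorizer.transform(comment)
y_pred_comment = kmeans.predict(testComment)
print(y_pred_comment)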

Related

Cannot interpret SVM model using Shapash

Currently, I'm exploring machine learning interpretability tools for one of my projects. I found Shapash, which is quite a new tool, and many people suggest using it to create a few easily interpretable charts for an ML model. When I tried it with RandomForestClassifier it worked fine and generated a webpage full of different charts, but I cannot achieve the same while using SVM (I'm just exploring this library, not focusing on the perfect ML model for a problem).
Note - using Shapash link here
# Fit blackbox model
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score

svc = svm.SVC()
svc.fit(X_train_smote, y_train_smote)
y_pred = svc.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")
from shapash import SmartExplainer
xpl = SmartExplainer(model=svc)
The error I'm getting:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
/tmp/ipykernel_13648/1233939729.py in <module>
----> 1 xpl = SmartExplainer(model=svc)
~/Python_AI/ai_env/lib/python3.8/site-packages/shapash/explainer/smart_explainer.py in __init__(self, model, backend, preprocessing, postprocessing, features_groups, features_dict, label_dict, title_story, palette_name, colors_dict, **kwargs)
194 if isinstance(backend, str):
195 backend_cls = get_backend_cls_from_name(backend)
--> 196 self.backend = backend_cls(
197 model=self.model, preprocessing=preprocessing, **kwargs)
198 elif isinstance(backend, BaseBackend):
~/Python_AI/ai_env/lib/python3.8/site-packages/shapash/backend/shap_backend.py in __init__(self, model, preprocessing, explainer_args, explainer_compute_args)
16 self.explainer_args = explainer_args if explainer_args else {}
17 self.explainer_compute_args = explainer_compute_args if explainer_compute_args else {}
---> 18 self.explainer = shap.Explainer(model=model, **self.explainer_args)
19
20 def run_explainer(self, x: pd.DataFrame) -> dict:
~/Python_AI/ai_env/lib/python3.8/site-packages/shap/explainers/_explainer.py in __init__(self, model, masker, link, algorithm, output_names, feature_names, **kwargs)
166 # if we get here then we don't know how to handle what was given to us
167 else:
--> 168 raise Exception("The passed model is not callable and cannot be analyzed directly with the given masker! Model: " + str(model))
169
170 # build the right subclass
Exception: The passed model is not callable and cannot be analyzed directly with the given masker! Model: SVC()
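For context, the exception is raised by shap itself: shap.Explainer does not recognise a bare SVC instance as a model it can analyse directly. At the SHAP level (independent of Shapash) a common workaround is to pass a prediction callable together with background data, which makes shap fall back to a model-agnostic explainer. A rough sketch, assuming the X_train_smote, y_train_smote and X_test variables from the question; whether Shapash's SmartExplainer accepts such a backend is a separate question:
import shap
from sklearn import svm

# probability=True exposes predict_proba, which shap can call as a black box
svc = svm.SVC(probability=True)
svc.fit(X_train_smote, y_train_smote)

# a callable plus background data selects a model-agnostic explainer
explainer = shap.Explainer(svc.predict_proba, X_train_smote)
shap_values = explainer(X_test[:100])  # model-agnostic explainers are slow; start with a subset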

How to calculate perplexity of a sentence using huggingface masked language models?

I have several masked language models (mainly Bert, Roberta, Albert, Electra). I also have a dataset of sentences. How can I get the perplexity of each sentence?
From the huggingface documentation here they mentioned that perplexity "is not well defined for masked language models like BERT", though I still see people somehow calculate it.
For example in this SO question they calculated it using the function
def score(model, tokenizer, sentence, mask_token_id=103):
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    repeat_input = tensor_input.repeat(tensor_input.size(-1)-2, 1)
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, 103)
    labels = repeat_input.masked_fill(masked_input != 103, -100)
    loss, _ = model(masked_input, masked_lm_labels=labels)
    result = np.exp(loss.item())
    return result
score(model, tokenizer, '我爱你') # returns 45.63794545581973
However, when I try to use the code I get TypeError: forward() got an unexpected keyword argument 'masked_lm_labels'.
I tried it with a couple of my models:
from transformers import pipeline, BertForMaskedLM, AutoTokenizer, RobertaForMaskedLM, AlbertForMaskedLM, ElectraForMaskedLM
import torch
1)
tokenizer = AutoTokenizer.from_pretrained("bioformers/bioformer-cased-v1.0")
model = BertForMaskedLM.from_pretrained("bioformers/bioformer-cased-v1.0")
2)
tokenizer = AutoTokenizer.from_pretrained("sultan/BioM-ELECTRA-Large-Generator")
model = ElectraForMaskedLM.from_pretrained("sultan/BioM-ELECTRA-Large-Generator")
This SO question also used the masked_lm_labels as an input and it seemed to work somehow.
There is a paper Masked Language Model Scoring that explores pseudo-perplexity from masked language models and shows that pseudo-perplexity, while not being theoretically well justified, still performs well for comparing "naturalness" of texts.
As for the code, your snippet is perfectly correct but for one detail: in recent implementations of Huggingface BERT, masked_lm_labels are renamed to simply labels, to make interfaces of various models more compatible. I have also replaced the hard-coded 103 with the generic tokenizer.mask_token_id. So the snippet below should work:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np
model_name = 'cointegrated/rubert-tiny'
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def score(model, tokenizer, sentence):
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    repeat_input = tensor_input.repeat(tensor_input.size(-1)-2, 1)
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    labels = repeat_input.masked_fill(masked_input != tokenizer.mask_token_id, -100)
    with torch.inference_mode():
        loss = model(masked_input, labels=labels).loss
    return np.exp(loss.item())
print(score(sentence='London is the capital of Great Britain.', model=model, tokenizer=tokenizer))
# 4.541251105675365
print(score(sentence='London is the capital of South America.', model=model, tokenizer=tokenizer))
# 6.162017238332462
You can try this code in Google Colab by running this gist.
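Since the question mentions a whole dataset of sentences, the same score function can simply be mapped over the list. A small usage sketch, reusing the example sentences above:
sentences = [
    'London is the capital of Great Britain.',
    'London is the capital of South America.',
]
# pseudo-perplexity per sentence; lower means more "natural" under the model
for s in sentences:
    print(score(model=model, tokenizer=tokenizer, sentence=s), s)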

Huggingface Transformers NER - Offset Mapping Causing ValueError in NumPy boolean array indexing assignment

I was trying out the NER tutorial Token Classification with W-NUT Emerging Entities (https://huggingface.co/transformers/custom_datasets.html#tok-ner) in google colab using the Annotated Corpus for Named Entity Recognition data on Kaggle (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus?select=ner_dataset.csv).
I will outline my process in detail to facilitate an understanding of what I was doing and to let the community help me figure out the source of the indexing assignment error.
To load the data from google drive where I have saved it, I used the following code
# import pandas library
import pandas as pd
# columns to select
cols_to_select = ["Sentence #", "Word", "Tag"]
# google drive data path
data_path = '/content/drive/MyDrive/Colab Notebooks/ner/ner_dataset.csv'
# load the data from google colab
dataset = pd.read_csv(data_path, encoding="latin-1")[cols_to_select].fillna(method = 'ffill')
I ran the following code to parse the sentences and tags:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def retrieve(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
# get full data
getter = SentenceGetter(dataset)
# get sentences
sentences = [[s[0] for s in sent] for sent in getter.sentences]
# get tags/labels
tags = [[s[1] for s in sent] for sent in getter.sentences]
# take a look at the data
print(sentences[0][0:5], tags[0][0:5], sep='\n')
I then split the data into train, val, and test sets
# import the sklearn module
from sklearn.model_selection import train_test_split
# split data into temp and test sets
temp_texts, test_texts, temp_tags, test_tags = train_test_split(sentences,
                                                                tags,
                                                                test_size=0.20,
                                                                random_state=15)
# split data into train and validation sets
train_texts, val_texts, train_tags, val_tags = train_test_split(temp_texts,
                                                                temp_tags,
                                                                test_size=0.20,
                                                                random_state=15)
After splitting the data, I created encodings for tags and the tokens
unique_tags=dataset.Tag.unique()
# create tags to id
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
# create id to tags
id2tag = {id: tag for tag, id in tag2id.items()}
I then installed the transformer library in colab
# install the transformers library
! pip install transformers
Next I imported the small bert model
# import the transformers module
from transformers import BertTokenizerFast
# import the small bert model
model_name = "google/bert_uncased_L-4_H-512_A-8"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
I then created the encodings for the tokens
# create train set encodings
train_encodings = tokenizer(train_texts,
                            is_split_into_words=True,
                            return_offsets_mapping=True,
                            padding=True,
                            max_length=128,
                            truncation=True)
# create validation set encodings
val_encodings = tokenizer(val_texts,
                          is_split_into_words=True,
                          return_offsets_mapping=True,
                          padding=True,
                          max_length=128,
                          truncation=True)
# create test set encodings
test_encodings = tokenizer(test_texts,
                           is_split_into_words=True,
                           return_offsets_mapping=True,
                           padding=True,
                           max_length=128,
                           truncation=True)
The tutorial uses offset mapping to handle the problems that arise with word-piece tokenization, specifically the mismatch between tokens and labels. It is when running the offset-mapping code from the tutorial that I get the error. Below is the offset-mapping function used in the tutorial:
# the offset function
import numpy as np
def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels
# return the encoded labels
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
test_labels = encode_tags(test_tags, test_encodings)
After running the above code, it gives me the following error, and I can't figure out where the source of the error lies. Any help and pointers would be appreciated.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-19-afdff0186eb3> in <module>()
17
18 # return the encoded labels
---> 19 train_labels = encode_tags(train_tags, train_encodings)
20 val_labels = encode_tags(val_tags, val_encodings)
21 test_labels = encode_tags(test_tags, test_encodings)
<ipython-input-19-afdff0186eb3> in encode_tags(tags, encodings)
11
12 # set labels whose first offset position is 0 and the second is not 0
---> 13 doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
14 encoded_labels.append(doc_enc_labels.tolist())
15
ValueError: NumPy boolean array indexing assignment cannot assign 38 input values to the 37 output values where the mask is true
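One way to narrow down where the assignment fails (a diagnostic sketch, not a fix; it assumes the mismatch comes from documents whose word-start token count no longer equals the number of word-level labels, for example because truncation at max_length=128 drops tokens from long sentences) is to compare the two counts per document:
import numpy as np

# hypothetical diagnostic: for each training document, compare the number of
# word-start sub-tokens (offset starting at 0 with a non-zero end) to the
# number of original word-level tags
for i, (doc_tags, doc_offset) in enumerate(zip(train_tags, train_encodings.offset_mapping)):
    arr_offset = np.array(doc_offset)
    n_word_starts = int(((arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)).sum())
    if n_word_starts != len(doc_tags):
        print(f"doc {i}: {len(doc_tags)} labels vs {n_word_starts} word-start tokens")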

How to get feature importances/feature ranking from summary plot in SHAP without crashing?

I am attempting to get shap values out of an array which was created by
explainer = shap.Explainer(xg_clf, X_train)
shap_values2 = explainer(X_train)
using my XGBoost data, to make a dataframe of feature_names and their SHAP importance, as they would appear in a SHAP bar or summary plot.
Following advice from how to extract the most important feature names? and How to get feature names of shap_values from TreeExplainer? specifically the comment by user Thoo, which shows how the values can be extracted to make a dataframe:
vals= np.abs(shap_values).mean(0)
feature_importance = pd.DataFrame(list(zip(X_train.columns,vals)),columns=['col_name','feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
feature_importance.head()
shap_values has 11595 persons with 595 features each, which I understand is large, but creating the vals variable runs very slowly, about 58 minutes on my laptop. It uses almost all of the RAM on the computer.
After 58 minutes I get an error:
Command terminated by signal 9
which as far as I understand, means that the computer ran out of RAM.
I've tried converting the 2nd line in Thoo's code to
feature_importance = pd.DataFrame(list(zip(X_train.columns,np.abs(shap_values2).mean(0))),columns=['col_name','feature_importance_vals'])
so that vals isn't stored but this change doesn't reduce RAM at all.
I've also tried a different comment from the same GitHub issue (user "ba1mn"):
def global_shap_importance(model, X):
    """ Return a dataframe containing the features sorted by Shap importance

    Parameters
    ----------
    model : The tree-based model
    X : pd.Dataframe
        training set/test set/the whole dataset ... (without the label)

    Returns
    -------
    pd.Dataframe
        A dataframe containing the features sorted by Shap importance
    """
    explainer = shap.Explainer(model)
    shap_values = explainer(X)
    cohorts = {"": shap_values}
    cohort_labels = list(cohorts.keys())
    cohort_exps = list(cohorts.values())
    for i in range(len(cohort_exps)):
        if len(cohort_exps[i].shape) == 2:
            cohort_exps[i] = cohort_exps[i].abs.mean(0)
    features = cohort_exps[0].data
    feature_names = cohort_exps[0].feature_names
    values = np.array([cohort_exps[i].values for i in range(len(cohort_exps))])
    feature_importance = pd.DataFrame(
        list(zip(feature_names, sum(values))), columns=['features', 'importance'])
    feature_importance.sort_values(
        by=['importance'], ascending=False, inplace=True)
    return feature_importance
but global_shap_importance returns the feature importances in the wrong order, and I don't see how I can alter global_shap_importance so that the features are returned in the same order as summary_plot (beeswarm plot).
How can I get the feature importance ranking into a dataframe?
I pulled this straight from the source code. Confirmed identical to the summary_plot.
def shapley_feature_ranking(shap_values, X):
    feature_order = np.argsort(np.mean(np.abs(shap_values), axis=0))
    return pd.DataFrame(
        {
            "features": [X.columns[i] for i in feature_order][::-1],
            "importance": [
                np.mean(np.abs(shap_values), axis=0)[i] for i in feature_order
            ][::-1],
        }
    )

shapley_feature_ranking(shap_values[0], X)
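If you are working with the Explanation object returned by explainer(X_train) rather than a raw array, the same ranking can be computed from its .values attribute. A short usage sketch, assuming shap_values2 and X_train from the question and a single-output model (for multi-class output, index the class axis first):
# .values holds the underlying (n_samples, n_features) array of SHAP values
ranking = shapley_feature_ranking(shap_values2.values, X_train)
print(ranking.head())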

Retraining pre-trained word embeddings in Python using Gensim

I want to retrain pre-trained word embeddings in Python using Gensim. The pre-trained embeddings I want to use is Google's Word2Vec in the file GoogleNews-vectors-negative300.bin.
According to Gensim's word2vec tutorial, "it’s not possible to resume training with models generated by the C tool, load_word2vec_format(). You can still use them for querying/similarity, but information vital for training (the vocab tree) is missing there."
Therefore I can't use the KeyedVectors, and to train a model the tutorial suggests using:
model = gensim.models.Word2Vec.load('/tmp/mymodel')
model.train(more_sentences)
(https://rare-technologies.com/word2vec-tutorial/)
However, when I try this:
from gensim.models import Word2Vec
model = Word2Vec.load('data/GoogleNews-vectors-negative300.bin')
I get an error message:
1330 # Because of loading from S3 load can't be used (missing readline in smart_open)
1331 if sys.version_info > (3, 0):
-> 1332 return _pickle.load(f, encoding='latin1')
1333 else:
1334 return _pickle.loads(f.read())
UnpicklingError: invalid load key, '3'.
I didn't find a way to convert the binary Google News file into a text file properly, and even if I did I'm not sure whether that would solve my problem.
Does anyone have a solution to this problem or knows about a different way to retrain pre-trained word embeddings?
The Word2Vec.load() method can only load full models in gensim's native format (based on Python object-pickling) – not any other binary/text formats.
And, as per the documentation's note that "it’s not possible to resume training with models generated by the C tool", there's simply not enough information in the GoogleNews raw-vectors files to reconstruct the full working model that was used to train them. (That would require both some internal model-weights, not saved in that file, and word-frequency-information for controlling sampling, also not saved in that file.)
The best you could do is create a new Word2Vec model, then patch some/all of the GoogleNews vectors into it before doing your own training. This is an error-prone process with no real best-practices and many caveats about the interpretation of final results. (For example, if you bring in all the vectors, but then only re-train a subset using only your own corpus & word-frequencies, the more training you do – making the word-vectors better fit your corpus – the less such re-trained words will have any useful comparability to retained untrained words.)
Essentially, if you can look at the gensim Word2Vec source and work out how to patch together such a frankenstein model, it may be appropriate. But there's no built-in support or handy off-the-shelf recipe that makes it easy, because it's an inherently murky process.
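For what it's worth, one building block that some gensim 3.x releases expose for this kind of patching is intersect_word2vec_format, which overwrites the vectors of words already present in a freshly built vocabulary. A rough sketch, assuming gensim 3.x and a tokenized corpus my_sentences, and still subject to all the caveats above (no word frequencies or output-layer weights are imported):
from gensim.models import Word2Vec

# hypothetical sketch (gensim 3.x): build the vocab from your own corpus first,
# then copy in GoogleNews vectors for the words the two vocabularies share.
# lockf=1.0 lets the imported vectors keep training; 0.0 would freeze them.
model = Word2Vec(size=300, min_count=5)
model.build_vocab(my_sentences)
model.intersect_word2vec_format('data/GoogleNews-vectors-negative300.bin',
                                binary=True, lockf=1.0)
model.train(my_sentences, total_examples=model.corpus_count, epochs=model.epochs)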
I have already answered it here.
Save the Google News model as a text file in word2vec format using gensim.
Refer to this answer to save it as a text file.
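For reference, the binary-to-text conversion itself can be done with gensim's KeyedVectors. A short sketch, assuming the GoogleNews binary file from the question:
from gensim.models import KeyedVectors

# load the binary GoogleNews vectors and re-save them in plain-text word2vec format
kv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
kv.save_word2vec_format('word2vec.txt', binary=False)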
Then try this code:
import os
import pickle
import numpy as np
import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
import operator
os.mkdir("model_dir")
# class EpochSaver(CallbackAny2Vec):
#     '''Callback to save model after each epoch.'''
#     def __init__(self, path_prefix):
#         self.path_prefix = path_prefix
#         self.epoch = 0
#
#     def on_epoch_end(self, model):
#         list_of_existing_files = os.listdir(".")
#         output_path = 'model_dir/{}_epoch{}.model'.format(self.path_prefix, self.epoch)
#         try:
#             model.save(output_path)
#         except:
#             model.wv.save_word2vec_format('model_dir/model_{}.bin'.format(self.epoch), binary=True)
#         print("number of epochs completed = {}".format(self.epoch))
#         self.epoch += 1
#         list_of_total_files = os.listdir(".")

# saver = EpochSaver("my_finetuned")
# function to load vectors from existing model.
# I am loading glove vectors from a text file; the benefit of doing this is that I get the complete vocab of glove as well.
# If you are using a previous word2vec model I would recommend saving it in txt format.
# In case you decide not to do it, you can tweak the function to get vectors for words in your vocab only.
def load_vectors(token2id, path, limit=None):
    embed_shape = (len(token2id), 300)
    freqs = np.zeros((len(token2id)), dtype='f')
    vectors = np.zeros(embed_shape, dtype='f')
    i = 0
    with open(path, encoding="utf8", errors='ignore') as f:
        for o in f:
            token, *vector = o.split(' ')
            token = str.lower(token)
            if len(o) <= 100:
                continue
            if limit is not None and i > limit:
                break
            vectors[token2id[token]] = np.array(vector, 'f')
            i += 1
    return vectors
# path of text file of your word vectors.
embedding_name = "word2vec.txt"
data = "<training data (newline-separated text file)>"
# Dictionary to store a unique id for each token in vocab( in my case vocab contains both my vocab and glove vocab)
token2id = {}
# This dictionary will contain all the words and their frequencies.
vocab_freq_dict = {}
# Populating vocab_freq_dict and token2id from my data.
id_ = 0
training_examples = []
file = open("{}".format(data), 'r', encoding="utf-8")
for line in file.readlines():
    words = line.strip().split(" ")
    training_examples.append(words)
    for word in words:
        if word not in vocab_freq_dict:
            vocab_freq_dict.update({word: 0})
        vocab_freq_dict[word] += 1
        if word not in token2id:
            token2id.update({word: id_})
            id_ += 1
# Populating vocab_freq_dict and token2id from glove vocab.
max_id = max(token2id.items(), key=operator.itemgetter(1))[0]
max_token_id = token2id[max_id]
with open(embedding_name, encoding="utf8", errors='ignore') as f:
    for o in f:
        token, *vector = o.split(' ')
        token = str.lower(token)
        if len(o) <= 100:
            continue
        if token not in token2id:
            max_token_id += 1
            token2id.update({token: max_token_id})
            vocab_freq_dict.update({token: 1})

with open("vocab_freq_dict", "wb") as vocab_file:
    pickle.dump(vocab_freq_dict, vocab_file)
with open("token2id", "wb") as token2id_file:
    pickle.dump(token2id, token2id_file)
# converting vectors to keyedvectors format for gensim
vectors = load_vectors(token2id, embedding_name)
vec = KeyedVectors(300)
vec.add(list(token2id.keys()), vectors, replace=True)
# setting vectors(numpy_array) to None to release memory
vectors = None
params = dict(min_count=1,workers=14,iter=6,size=300)
model = Word2Vec(**params)
# using build from vocab to build the vocab
model.build_vocab_from_freq(vocab_freq_dict)
# using token2id to create idxmap
idxmap = np.array([token2id[w] for w in model.wv.index2entity])
# Setting hidden weights (syn0 = between input layer and hidden layer) = your vectors arranged according to ids
model.wv.vectors[:] = vec.vectors[idxmap]
# Setting output weights (syn1neg = between hidden layer and output layer) = your vectors arranged according to ids
model.trainables.syn1neg[:] = vec.vectors[idxmap]
model.train(training_examples, total_examples=len(training_examples), epochs=model.epochs)
output_path = 'model_dir/final_model.model'
model.save(output_path)
