Python scikit svm "Vocabulary not fitted or provided" - scikit-learn

Playing around with Python's scikit SVM Linear Support Vector Classification and I'm running into an error when I attempt to make predictions:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import string
# Function to pass the list to the Tf-idf vectorizer
def returnPhrase(inputList):
return inputList
# Pre-processing the sentence which we input to predict the emotion
def transformSentence(sentence):
s = []
sentence = sentence.replace('\n', '')
sentTokenized = word_tokenize(sentence)
s.append(sentTokenized)
sWithoutPunct = []
punctList = list(string.punctuation)
curSentList = s[0]
newSentList = []
for word in curSentList:
if word.lower() not in punctList:
newSentList.append(word.lower())
sWithoutPunct.append(newSentList)
mystemmer = PorterStemmer()
tokenziedStemmed = []
for i in range(0, len(sWithoutPunct)):
curList = sWithoutPunct[i]
newList = []
for word in curList:
newList.append(mystemmer.stem(word))
tokenziedStemmed.append(newList)
return tokenziedStemmed
# Extracting the features for SVM
myVectorizer = TfidfVectorizer(analyzer='word', tokenizer=returnPhrase, preprocessor=returnPhrase,
token_pattern=None,
ngram_range=(1, 3))
# The SVM Model
curC = 2 # cost factor in SVM
SVMClassifier = svm.LinearSVC(C=curC)
filename = 'finalized_model.sav'
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
# Input sentence
with open('trial_truth_001.txt', 'r') as file:
sent = file.read().replace('\n', '')
transformedTest = transformSentence(sent)
X_test = myVectorizer.transform(transformedTest).toarray()
Prediction = loaded_model.predict(X_test)
# Printing the predicted emotion
print(Prediction)
It's when I attempt to use the LinearSVC to predict that I'm informed:
sklearn.exceptions.NotFittedError: Vocabulary not fitted or provided
What am I missing here? Obviously it is the way I fit and transform the data.

I think you just have to change the line
X_test = myVectorizer.transform(transformedTest).toarray()
to
X_test = myVectorizer.fit_transform(transformedTest).toarray()

Related

Multi-Class Text Classification with BERT null prediction error

I am new to multi-class text classification with BERT. I have been following a tutorial (https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a) for leaning purposes.
I am able to get the script below running up to calculating the confusion matrix. The classification report also does not work. I would be grateful if someone can help me. My apologies if this question has already been asked. I searched everywhere and could not find an answer.
The error is here: y_predicted = numpy.argmax(predicted_raw, axis = 1). The error message says "axis 1 is out of bounds for array of dimension 1" When I change axis to zero. The new error message is "Singleton array 0 cannot be considered a valid collection." I think what the axis=0 error says is that y_predicted is null. I double checked it with an if statement.
import pandas
import numpy
import re
import nltk
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
input_dataframe = pandas.read_csv('tutorial6.csv')
fig, ax = plt.subplots()
fig.suptitle("Product", fontsize=12)
input_dataframe["Product"].reset_index().groupby("Product").count().sort_values(by=
"index").plot(kind="barh", legend=False,
ax=ax).grid(axis='x')
plt.show()
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
## clean (convert to lowercase and remove punctuations and characters and then strip)
text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
## Tokenize (convert from string to list)
lst_text = text.split()
## remove Stopwords
if lst_stopwords is not None:
lst_text = [word for word in lst_text if word not in
lst_stopwords]
## Stemming (remove -ing, -ly, ...)
if flg_stemm == True:
ps = nltk.stem.porter.PorterStemmer()
lst_text = [ps.stem(word) for word in lst_text]
## Lemmatisation (convert the word into root word)
if flg_lemm == True:
lem = nltk.stem.wordnet.WordNetLemmatizer()
lst_text = [lem.lemmatize(word) for word in lst_text]
## back to string from list
text = " ".join(lst_text)
return text
lst_stopwords = nltk.corpus.stopwords.words("english")
input_dataframe["text_clean"] = input_dataframe ["Consumer_Complaint"].apply(lambda x:
utils_preprocess_text(x, flg_stemm=False, flg_lemm=True,
lst_stopwords=lst_stopwords))
from tensorflow.keras.utils import to_categorical
possible_labels = input_dataframe.Product.unique()
label_dict = {}
for index, possible_label in enumerate(possible_labels):
label_dict[possible_label] = index
print(label_dict)
input_dataframe['label'] = input_dataframe.Product.replace(label_dict)
# Split into train and test - stratify over Issue
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(input_dataframe, test_size = 0.2,stratify = input_dataframe[["label"]])
# Load Huggingface transformers
from transformers import TFBertModel, BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Then build your model output
product = Dense(8, kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
outputs = {'product': product}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()
# Set an optimizer
optimizer = Adam()
# Set loss and metrics
loss = {'product': CategoricalCrossentropy(from_logits = True)}
metric = {'product': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
optimizer = optimizer,
loss = loss,
metrics = metric)
# Ready output data for the model
y_train = to_categorical(data_train['label'],8)
y_test = to_categorical(data_test['label'],8)
x_train = tokenizer(
text=data_train['Consumer_Complaint'].to_list(),
add_special_tokens=True,
max_length=max_length,
truncation=True,
padding=True,
return_tensors='tf',
return_token_type_ids = False,
return_attention_mask = False,
verbose = True)
x_test = tokenizer(
text=data_test['Consumer_Complaint'].to_list(),
add_special_tokens=True,
max_length=max_length,
truncation=True,
padding=True,
return_tensors='tf',
return_token_type_ids = False,
return_attention_mask = False,
verbose = True)
# Fit the model
history = model.fit(
x={'input_ids': x_train['input_ids']},
y={'product': y_train},
validation_split=0.2,
batch_size=64,
epochs=1)
### ----- Evaluate the model ------ ###
model_eval = model.evaluate(
x={'input_ids': x_test['input_ids']},
y={'product': y_test}
)
print("This is evaluation: ", model_eval)
accr = model.evaluate(x_test['input_ids'],y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
from matplotlib import pyplot as plt
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();
# plot loss and accuracy
metrics = [k for k in history.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(history.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in metrics:
ax11.plot(history.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(history.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in metrics:
ax22.plot(history.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
plt.show()
#Testing our model on the test data.
predicted_raw = model.predict({'input_ids':x_test['input_ids']})
print(type(predicted_raw))
predicted_raw=list(predicted_raw)
predicted_raw=numpy.array(predicted_raw)
y_predicted = numpy.argmax(predicted_raw, axis = 1)
y_true = data_test.label
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
confusionmatrix = confusion_matrix(y_predicted,y_true)
I am trying to get the confusion matrix and classification report working.

How to calculate Precision,Recall and F1 score using sklearn

I am trying to calculate the Precision, Recall and F1 in this sample code. I have calculated the accuracy of the model on train and test dataset. Kindly help to calculate these matrices.
Please look at the code I have comment every important line for an explanation.
# develop a classifier for the Faces Dataset
from numpy import load
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
import pickle
# load dataset
data = load('faces-embeddings.npz')
trainX, trainy, testX, testy = data['arr_0'], data['arr_1'], data['arr_2'], data['arr_3']
print('Dataset: train=%d, test=%d' % (trainX.shape[0], testX.shape[0]))
# normalize input vectors
in_encoder = Normalizer(norm='l2')
trainX = in_encoder.transform(trainX)
testX = in_encoder.transform(testX)
# label encode targets
out_encoder = LabelEncoder()
out_encoder.fit(trainy)
trainy = out_encoder.transform(trainy)
testy = out_encoder.transform(testy)
# fit model
model = SVC(kernel='linear', probability=True)
model.fit(trainX, trainy)
#Saving Model
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
# predict
yhat_train = model.predict(trainX)
yhat_test = model.predict(testX)
# score
score_train = accuracy_score(trainy, yhat_train)
score_test = accuracy_score(testy, yhat_test)
# summarize
print('Accuracy: train=%.3f, test=%.3f' % (score_train*100, score_test*100))
knowing the true value of Y (trainy here) and the predicted value of Y (yhat_train here) you can directly compute the precision, recall and F1 score, exactly as you did for the accuracy (thanks to sklearn.metrics):
sklearn.metrics.precision_score(trainy,yhat_train)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
sklearn.metrics.recall_score(trainy,yhat_train)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score
sklearn.metrics.f1_score(trainy,yhat_train)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score

How do I set up a custom input-pipeline for sequence classification for the huggingface transformer models?

I want to use one of the models for sequence classification provided by huggingface.It seems they are providing a function called
glue_convert_examples_to_features()
for preparing the data so that it can be input into the models.
However, it seems this conversion function only applies to the glue dataset. I can't find an easy solution to apply the conversion to my costum data. Am I overseen a prebuilt function like above ? What would be an easy way to convert my custom data with one sequence and two labels into the format the model expects ?
Huggingface added a fine-tuning with custom datasets guide that contains a lot of useful information. I was able to use the information in the IMDB sequence classification section to successfully adapt a notebook using a glue dataset with my own pandas dataframe.
from transformers import (
AutoConfig,
AutoTokenizer,
TFAutoModelForSequenceClassification,
AdamW
)
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
df = pd.read_pickle('data.pkl')
train_texts = df.text.values # an array of strings
train_labels = df.label.values # an array of integers
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
train_encodings = tokenizer(train_texts.tolist(), truncation=True, max_length=96, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, max_length=96, padding=True)
train_dataset = tf.data.Dataset.from_tensor_slices((
dict(train_encodings),
train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
dict(val_encodings),
val_labels
))
num_labels = 3
num_train_examples = len(train_dataset)
num_dev_examples = len(val_dataset)
train_dataset = train_dataset.shuffle(100).batch(train_batch_size)
val_dataset = val_dataset.shuffle(100).batch(eval_batch_size)
learning_rate = 2e-5
train_batch_size = 8
eval_batch_size = 8
num_epochs = 1
train_steps_per_epoch = int(num_train_examples / train_batch_size)
dev_steps_per_epoch = int(num_dev_examples / eval_batch_size)
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, config=config)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
history = model.fit(train_dataset,
epochs=num_epochs,
steps_per_epoch=train_steps_per_epoch,
validation_data=val_dataset,
validation_steps=dev_steps_per_epoch)
Notebook credits: digitalepidemiologylab covid-twitter-bert colab

Python3: Multi-label text classification with reuters 21578 data set

I am using the following code to classify a document in to three categories Sports, Politics and money. I can see that this code calculates Precision recall and F1. But I am not able to find a way to use this code to test against custom document a predict its label.
from nltk.corpus import stopwords, reuters
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
cachedStopWords = stopwords.words("english")
def tokenize(text):
min_length = 3
words = map(lambda word: word.lower(), word_tokenize(text))
words = [word for word in words if word not in cachedStopWords]
tokens = (list(map(lambda token: PorterStemmer().stem(token),words)))
p = re.compile('[a-zA-Z]+');
filtered_tokens = list(filter (lambda token: p.match(token) and len(token) >= min_length,tokens))
return filtered_tokens
def represent(documents, representer):
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
# Learn and transform train documents
vectorised_train_documents = representer.fit_transform(train_docs)
vectorised_test_documents = representer.transform(test_docs)
# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])
return (vectorised_train_documents, train_labels, vectorised_test_documents, test_labels)
def evaluate(test_labels, predictions):
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
documents = reuters.fileids()
candidate = {'representer': TfidfVectorizer(tokenizer=tokenize),
'estimator': OneVsRestClassifier(LinearSVC(random_state=42))}
train_docs, train_labels, test_docs, test_labels = represent(documents, candidate['representer'])
candidate['estimator'].fit(train_docs, train_labels)
predictions = candidate['estimator'].predict(test_docs)
evaluate(test_labels, predictions)
Credits:
https://github.com/miguelmalvarez/reuters-tc/blob/master/notebook/Classification_Reuters.ipynb
You can store your custom documents as text files in a folder, lets say yourfolder. After that you can use the below code to train on reuters data and predict labels for your text documents. all_labels will contain the list of predicted labels (as tuples) for each document
import os
classifier=OneVsRestClassifier(LinearSVC(random_state=42))
vectorizer=TfidfVectorizer(tokenizer=tokenize)
#LOAD AND TRANSFORM TRAINING DOCS
documents = reuters.fileids()
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
vectorised_train_documents = vectorizer.fit_transform(train_docs)
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
#LEARN CLASSIFICATION MODEL
classifier=classifier.fit(vectorised_train_documents, train_labels)
#LOAD AND TRANSFORM TEST DOCS
documents_yours=os.listdir('yourfoldername')
test_docs_yours = [open('yourfoldername/'+doc_id).read() for doc_id in documents_yours]
vectorised_test_documents_yours = vectorizer.transform(test_docs_yours)
#MAKEPREIDCTIONS
predictions_yours=classifier.predict(vectorised_test_documents_yours)
all_labels = mlb.inverse_transform(predictions_yours)
all_labels

Confusion Matrix - Testing Sentiment Analysis Model

I am testing a Sentiment Analysis model using NLTK. I need to add a Confusion Matrix to the classifier results and if possible also Precision, Recall and F-Measure values. I have only accuracy so far. Movie_reviews data has pos and neg labels. However to train the classifier I am using "featuresets" that has a different format from the usual (sentence, label) structure. I am not sure if I can use confusion_matrix from sklearn, after training the classifier by "featuresets"
import nltk
import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = []
for w in movie_reviews.words():
all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:3000]
def find_features(document):
words = set(document)
features = {}
for w in word_features:
features[w] = (w in words)
return features
featuresets = [(find_features(rev), category) for (rev, category) in documents]
training_set = featuresets[:1900]
testing_set = featuresets[1900:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
First you can classify all test values and store predicted outcomes and gold results in a list.
Then, you can use nltk.ConfusionMatrix.
test_result = []
gold_result = []
for i in range(len(testing_set)):
test_result.append(classifier.classify(testing_set[i][0]))
gold_result.append(testing_set[i][1])
Now, You can calculate different metrics.
CM = nltk.ConfusionMatrix(gold_result, test_result)
print(CM)
print("Naive Bayes Algo accuracy percent:"+str((nltk.classify.accuracy(classifier, testing_set))*100)+"\n")
labels = {'pos', 'neg'}
from collections import Counter
TP, FN, FP = Counter(), Counter(), Counter()
for i in labels:
for j in labels:
if i == j:
TP[i] += int(CM[i,j])
else:
FN[i] += int(CM[i,j])
FP[j] += int(CM[i,j])
print("label\tprecision\trecall\tf_measure")
for label in sorted(labels):
precision, recall = 0, 0
if TP[label] == 0:
f_measure = 0
else:
precision = float(TP[label]) / (TP[label]+FP[label])
recall = float(TP[label]) / (TP[label]+FN[label])
f_measure = float(2) * (precision * recall) / (precision + recall)
print(label+"\t"+str(precision)+"\t"+str(recall)+"\t"+str(f_measure))
You can check - how to calculate precision and recall here.
You can also use : sklearn.metrics for these calculations using gold_result and test_result values.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print '\nClasification report:\n', classification_report(gold_result, test_result)
print '\nConfussion matrix:\n',confusion_matrix(gold_result, test_result)

Resources