Calculate recall metric when StratifiedShuffleSplit is used - python-3.x

The following method uses the KNN classifier with StratifiedShuffleSplit since I have an imbalanced dataset:
def KNN(train_x, train_y):
skf = StratifiedShuffleSplit()
scores = []
for train, test in skf.split(train_x, train_y):
clf = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
clf.fit(train_x.loc[train], train_y.loc[train])
score = clf.score(train_x.loc[test], train_y.loc[test])
scores.append(score)
res = np.asarray(scores).mean()
print(res)
How can I modify the scores to calculate the recall and precision metrics instead of the default accuracy?
Thank you,

You need:
sklearn.metrics.recall_score(y_true, y_pred)
sklearn.metrics.precision_score(y_true, y_pred)
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
def KNN(train_x, train_y):
skf = StratifiedShuffleSplit()
scores = []
scores2 = []
for train, test in skf.split(train_x, train_y):
clf = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
clf.fit(train_x.loc[train], train_y.loc[train])
y_pred = clf.predict(train_x.loc[test]) # predict the labels of the test set
y_true = train_y.loc[test] # get the true labels of the test test
score = recall_score(y_true, y_pred) # recall estimation
score2 = precision_score(y_true, y_pred) # precision estimation
scores.append(score)
scores2.append(score2)

Related

How can the coefficients of the multiclass logistic regression model be used to predict the probabilities of class membership of observations?

I am trying to solve one problem that resembles that of Fisher's irises classification. The problem is that I can train the model on my computer, but the given model has to predict class membership on a computer where it is impossible to install python and scikit learn. I want to understand how, having received the coefficients of the logistic regression model, I can predict the belonging to a certain class without using the predict method of the model.
Using the Fisher problem as an example, I do the following.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score
# data preparation
iris = load_iris()
data = pd.DataFrame(data=np.hstack([iris.data, iris.target[:, np.newaxis]]),
columns=iris.feature_names + ['target'])
names = data.columns
# split data
X_train, X_test, y_train, y_test = train_test_split(data[names[:-1]], data[names[-1]], random_state=42)
# train model
cls = make_pipeline(
StandardScaler(),
LogisticRegression(C=2, random_state=42)
)
cls = cls.fit(X_train.to_numpy(), y_train)
preds_train = cls.predict(X_train)
# prediction
preds_test = cls.predict(X_test)
# scores
train_score = accuracy_score(preds_train, y_train), f1_score(preds_train, y_train, average='macro') # on train data
# train_score = (0.9642857142857143, 0.9653621232568601)
test_score = accuracy_score(preds_test, y_test), f1_score(preds_test, y_test, average='macro') # on test data
# test_score = (1.0, 1.0)
# model coefficients
cls[1].coef_, cls[1].intercept_
>>> (array([[-1.13948079, 1.30623841, -2.21496793, -2.05617771],
[ 0.66515676, -0.2541143 , -0.55819748, -0.86441227],
[ 0.47432404, -1.05212411, 2.77316541, 2.92058998]]),
array([-0.35860337, 2.43929019, -2.08068682]))
Now I have the coefficients of the model. And I want to use them to make predictions.
First, I make a prediction using the predict method for the first five observations on the test sample.
preds_test = cls.predict_proba(X_test)
preds_test[0:5]
>>>array([[5.66019001e-03, 9.18455687e-01, 7.58841233e-02],
[9.75854479e-01, 2.41455095e-02, 1.10881450e-08],
[1.18780156e-09, 6.53295166e-04, 9.99346704e-01],
[6.71574900e-03, 8.14174200e-01, 1.79110051e-01],
[6.98756622e-04, 8.09096425e-01, 1.90204818e-01]])
Then I manually calculate the predictions of the class probabilities for the observations using the coefficients of the model.
# define two functions for making predictions
def logit(x, w):
return np.dot(x, w)
# from here: https://stackoverflow.com/questions/34968722/how-to-implement-the-softmax-function-in-python
def softmax(z):
assert len(z.shape) == 2
s = np.max(z, axis=1)
s = s[:, np.newaxis] # necessary step to do broadcasting
e_x = np.exp(z - s)
div = np.sum(e_x, axis=1)
div = div[:, np.newaxis] # dito
return e_x / div
n, k = X_test.shape
X_ = np.hstack((np.ones((n, 1)), X_test)) # add column with 1 for intercept
weights = np.hstack((cls[1].intercept_[:, np.newaxis], cls[1].coef_)) # create weights matrix
results = softmax(logit(X_, weights.T)) # calculate probabilities
results[0:5]
>>>array([[3.67343725e-14, 4.63938438e-06, 9.99995361e-01],
[2.81976786e-05, 8.63083152e-01, 1.36888650e-01],
[1.24572182e-22, 5.47800683e-11, 1.00000000e+00],
[3.32990060e-14, 3.08352323e-06, 9.99996916e-01],
[2.66415118e-15, 1.78252465e-06, 9.99998217e-01]])
If you compare the two results obtained (preds_test[0:5] and results[0:5]), you can see that they do not coincide at all. Please explain me what I am doing wrong and how I can use the model's coefficients to calculate predictions without using the predict method.
I forgot that a scaler was applied. If you change the code a little, then the results are the same.
scaler = StandardScaler()
scaler.fit(X_train)
X_test_transf = scaler.transform(X_test)
def logit(x, w):
return np.dot(x, w)
def softmax(z):
assert len(z.shape) == 2
s = np.max(z, axis=1)
s = s[:, np.newaxis] # necessary step to do broadcasting
e_x = np.exp(z - s)
div = np.sum(e_x, axis=1)
div = div[:, np.newaxis] # dito
return e_x / div
n, k = X_test_transf.shape
X_ = np.hstack((np.ones((n, 1)), X_test_transf))
weights = np.hstack((cls[1].intercept_[:, np.newaxis], cls[1].coef_))
results = softmax(logit(X_, weights.T))
np.allclose(preds_test, results)
>>>True
There are two values for every predict_proba. The first value is the probability of the event not occurring and the probability of the event occurring. predict_proba(X)[:,1] to get the probability of the event occurring.

How can i calculate all recall accuracy precision and f1 measure for multi class classification in BERT?

from sklearn.metrics import f1_score
def f1_score_func(preds, labels):
preds_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return f1_score(labels_flat, preds_flat, average='weighted')
def accuracy_per_class(preds, labels):
label_dict_inverse = {v: k for k, v in label_dict.items()}
preds_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
for label in np.unique(labels_flat):
y_preds = preds_flat[labels_flat==label]
y_true = labels_flat[labels_flat==label]
print(f'Class: {label_dict_inverse[label]}')
print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')
Need to calculate classification report for multi class model but it gives accuracy and f1 score only
I suppose you are using the Pytorch environment. Here is the correct code to print the F1, recall and precision for each class in the dataset. If you have a trained model, load it and also the dataset to test on.
from sklearn.metrics import classification_report, confusion_matrix
val_dataset = LoadDataset('/content/val.csv')
val_loader = torch.utils.data.DataLoader(val_dataset,batch_size=51) # Load the data
model.load_state_dict(torch.load('vit-base.bin')) # Load the trained model
model.cuda() # For putting model on GPUs
with torch.no_grad():
image,target = next(iter(val_loader))
image = image.to(device)
target = target.flatten().to(device)
prediction = model(image)
prediction = prediction.argmax(dim=1).view(target.size()).cpu().numpy()
target = target.cpu().numpy()
print(classification_report(target,prediction,target_names=val_dataset.LE.classes_)) # LE is the label encoder

How to calculate Precision,Recall and F1 score using sklearn

I am trying to calculate the Precision, Recall and F1 in this sample code. I have calculated the accuracy of the model on train and test dataset. Kindly help to calculate these matrices.
Please look at the code I have comment every important line for an explanation.
# develop a classifier for the Faces Dataset
from numpy import load
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
import pickle
# load dataset
data = load('faces-embeddings.npz')
trainX, trainy, testX, testy = data['arr_0'], data['arr_1'], data['arr_2'], data['arr_3']
print('Dataset: train=%d, test=%d' % (trainX.shape[0], testX.shape[0]))
# normalize input vectors
in_encoder = Normalizer(norm='l2')
trainX = in_encoder.transform(trainX)
testX = in_encoder.transform(testX)
# label encode targets
out_encoder = LabelEncoder()
out_encoder.fit(trainy)
trainy = out_encoder.transform(trainy)
testy = out_encoder.transform(testy)
# fit model
model = SVC(kernel='linear', probability=True)
model.fit(trainX, trainy)
#Saving Model
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
# predict
yhat_train = model.predict(trainX)
yhat_test = model.predict(testX)
# score
score_train = accuracy_score(trainy, yhat_train)
score_test = accuracy_score(testy, yhat_test)
# summarize
print('Accuracy: train=%.3f, test=%.3f' % (score_train*100, score_test*100))
knowing the true value of Y (trainy here) and the predicted value of Y (yhat_train here) you can directly compute the precision, recall and F1 score, exactly as you did for the accuracy (thanks to sklearn.metrics):
sklearn.metrics.precision_score(trainy,yhat_train)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
sklearn.metrics.recall_score(trainy,yhat_train)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score
sklearn.metrics.f1_score(trainy,yhat_train)
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score

Logistic Regression Cost = nan

I am trying to implement logistic regression model but keep getting 'nan' values as cost. I tried it with multiple data set but it gives the same result. Different sources gives slightly different implementation of gradient descent so I am not sure if the implementation of gradient is correct here. Here is full code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
class LogisticRegression:
def __init__(self, lr=0.001, n_iter=8000):
self.lr = lr
self.n_iter = n_iter
self.weights = None
"""
z is dot product of features and weights, which is then mapped to discrete values, such as between 0 and 1
"""
def sigmoid(self, z):
return 1.0/(1+np.exp(-z))
def predict(self, x_features, weights):
"""Returns 1d array of probabilities that the class label == 1"""
z = np.dot(x_features, weights)
return self.sigmoid(z)
def cost(self, x_features, labels, weights):
"""
Using Mean Absolute Error
Cost = (labels*log(predictions) + (1-labels)*log(1-predictions) ) / len(labels)
"""
observation = len(labels)
predictions = self.predict(x_features, weights)
#take the error when label = 1
class1_cost = -labels*np.log(predictions)
#take the error when label = 0
class2_cost = (1-labels)*np.log(1-predictions)
#take sum of both the cost
cost = class1_cost+class2_cost
#take the average cost
cost = cost.sum()/observation
return cost
def update_weight(self, x_features, labels, weights):
"""
Vectorized Gradient Descent
"""
N = len(x_features)
#get predictions (approximation of y)
predictions = self.predict(x_features, weights)
gradient = np.dot(x_features.T, predictions-labels)
#take the average cost of derivative for each feature
gradient /= N
#multiply gradients by our learning rate
gradient *= self.lr
#subtract from our weights to minimize cost
weights -= gradient
return weights
def give_predictions(self, x_features, weights):
linear_model_prediction = self.predict(x_features, weights)
y_predicted_cls = [1 if i>0.5 else 0 for i in linear_model_prediction]
return y_predicted_cls
def train(self, features, labels):
n_samples, n_features = features.shape
self.weights = np.zeros((n_features,1)) #initialize the weight matrix
cost_history = []
for i in range(self.n_iter):
self.weights = self.update_weight(features, labels, self.weights)
#calculate error for auditing purposes
cost = self.cost(features, labels, self.weights)
cost_history.append(cost)
#Log process
if i%1000 == 0:
print("iter: {}, cost: {}".format(str(i),str(cost)))
return self.weights, cost_history
def generate_data():
bc = datasets.load_breast_cancer()
x_features, labels = bc.data, bc.target
x_train, x_test, y_train, y_test = train_test_split(x_features, labels, test_size=0.2, random_state=1234)
return x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test = generate_data()
model = LogisticRegression()
model.train(x_train, y_train)
I had to apply feature scaling to x_train before training the model. I used sklearn StandardScaler library
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
Your cost function seems correct, but you need to have 'y' as a vector of zeros and a one (one_hot_encoding).

ValueError: Can only tuple-index with a MultiIndex

For a Multilabel Classification problem i am trying to plot precission and recall curve.
The sample code is taken from "https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py" under section Create multi-label data, fit, and predict.
I am trying to fit it in my code but i get below error as "ValueError: Can only tuple-index with a MultiIndex" when i try below code.
train_df.columns.values
array(['DefId', 'DefectCount', 'SprintNo', 'ReqName', 'AreaChange',
'CodeChange', 'TestSuite'], dtype=object)
Test Suite is the value to be predicted
X_train = train_df.drop("TestSuite", axis=1)
Y_train = train_df["TestSuite"]
X_test = test_df.drop("DefId", axis=1).copy()
classes --> i have hardcorded with the testsuite values
from sklearn.preprocessing import label_binarize
# Use label_binarize to be multi-label like settings
Y = label_binarize(Y_train, classes=np.array([0, 1, 2,3,4])
n_classes = Y.shape[1]
# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier
# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=3))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import pandas as pd
# For each class
precision = dict()
recall = dict()
average_precision = dict()
#n_classes = Y.shape[1]
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(Y_train[:, i], y_score[:, i])
average_precision[i] = average_precision_score(Y_train[:, i], y_score[:, i])
Input Data -> Values has been categorised

Resources