I use automl for a classification problem. I obtained the following precision recall curve:
Is it possible to find the best threshold that maximizes the f-score from this curve? and how?
Using any kind of optimization mechanisms, we can improve the F1 score. In the below example I tried to reproduce with Standard Scalar optimization mechanism.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression as lrs
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
pipeline = make_pipeline(StandardScaler(), lrs(random_state=1))
# Create training test splits using two features
#
pipeline.fit(X_train[:,[2, 13]],y_train)
probs = pipeline.predict_proba(X_test[:,[2, 13]])
fpr1, tpr1, thresholds = roc_curve(y_test, probs[:, 1], pos_label=1)
roc_auc1 = auc(fpr1, tpr1)
#
# Create training test splits using two different features
#
pipeline.fit(X_train[:,[4, 14]],y_train)
probs2 = pipeline.predict_proba(X_test[:,[4, 14]])
fpr2, tpr2, thresholds = roc_curve(y_test, probs2[:, 1], pos_label=1)
roc_auc2 = auc(fpr2, tpr2)
#
# Create training test splits using all features
#
pipeline.fit(X_train,y_train)
probs3 = pipeline.predict_proba(X_test)
fpr3, tpr3, thresholds = roc_curve(y_test, probs3[:, 1], pos_label=1)
roc_auc3 = auc(fpr3, tpr3)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
plt.plot(fpr1, tpr1, label='ROC Curve 1 (AUC = %0.2f)' % (roc_auc1))
plt.plot(fpr2, tpr2, label='ROC Curve 2 (AUC = %0.2f)' % (roc_auc2))
plt.plot(fpr3, tpr3, label='ROC Curve 3 (AUC = %0.2f)' % (roc_auc3))
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')
plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='green', label='Perfect Classifier')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc="lower right")
plt.show()
With Standard Scaler we will get the maximum in F1 score with the max of AUC.
Related
scikit-learn has a quantile regression based confidence interval implementation for GBM (example form the docs).
Is there a reason why it doesn't provide a similar quantile based loss implementation for RandomForestRegressor?
There is an scikit-learn compatible/compliant Quantile Regression Forest implementation that can be used to generate confidence intervals here: https://github.com/zillow/quantile-forest
Setup should be as easy as:
pip install quantile-forest
Then, as an example, to generate CIs on a full dataset:
import matplotlib.pyplot as plt
import numpy as np
from quantile_forest import RandomForestQuantileRegressor
from sklearn import datasets
from sklearn.model_selection import KFold
X, y = datasets.fetch_california_housing(return_X_y=True)
qrf = RandomForestQuantileRegressor(n_estimators=100, random_state=0)
kf = KFold(n_splits=5)
kf.get_n_splits(X)
y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []
for train_index, test_index in kf.split(X):
X_train, X_test, y_train, y_test = (
X[train_index], X[test_index], y[train_index], y[test_index]
)
qrf.set_params(max_features=X_train.shape[1] // 3)
qrf.fit(X_train, y_train)
# Get predictions at 95% prediction intervals and median.
y_pred_i = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])
y_true = np.concatenate((y_true, y_test))
y_pred = np.concatenate((y_pred, y_pred_i[:, 1]))
y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[:, 0]))
y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[:, 2]))
fig = plt.figure(figsize=(10, 4))
y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true = y_true[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]
# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
y_true -= mean
y_pred_lower -= mean
y_pred_upper -= mean
plt.plot(y_true, marker=".", ms=5, c="r", lw=0)
plt.fill_between(
np.arange(len(y_pred_upper)),
y_pred_lower,
y_pred_upper,
alpha=0.2,
color="gray",
)
plt.plot(np.arange(len(y)), y_pred_lower, marker="_", c="0.2", lw=0)
plt.plot(np.arange(len(y)), y_pred_upper, marker="_", c="0.2", lw=0)
plt.xlim([0, len(y)])
plt.xlabel("Ordered Samples")
plt.ylabel("Observed Values and Prediction Intervals (Centered)")
plt.show()
There seems to be contributed scikit-learn package (example copy pasted from there for RandomForestRegressor)
I had to install development version in order to have correct path to current scikit-learn by:
pip install git+git://github.com/scikit-learn-contrib/forest-confidence-interval.git
https://github.com/scikit-learn-contrib/forest-confidence-interval
Example (copy pasted from the link above):
# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets import fetch_openml
import forestci as fci
# retreive mpg data from machine learning library
mpg_data = fetch_openml('autompg')
# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]
# remove rows where the data is nan
not_null_sel = np.invert(
np.sum(np.isnan(mpg_data["data"]), axis=1).astype(bool))
mpg_X = mpg_X[not_null_sel]
mpg_y = mpg_y[not_null_sel]
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(mpg_X, mpg_y,
test_size=0.25,
random_state=42)
# Create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
mpg_X_test)
# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
I am trying to get an ROC curve but my code seem to be plotting a triangle instead. I think I am doing something wrong. I will appreciate any help.
model = MLPClassifier()
model.fit(X_train, Y_train)
prediction=model.predict(X_test)
fpr, tpr, thresholds = roc_curve(Y_test, prediction)
roc_auc = auc(fpr, tpr)
fpr=array([0. , 0.2473072, 1. ])
tpr=array([0. , 0.70320656, 1. ])
thresholds=array([2, 1, 0])
# image drawing
plt.figure()
#plt.title('Receiver Operating Characteristic %d iter' %iter)
plt.plot(fpr, tpr, label = 'MLP AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Can you please review and suggest what I can do. Even though I am not surprised I have a triangle because if I take a look at my fpr and tpr, there are just 3 values and I do not understand why. I expect to have more values that will result into a curve.
I saw someone has had same challenge as this but the solution did not seem to work for me as I expect the fpr and tpr to return more than 3 values.
I have used back propagation on a dataset which has 3 classes: L, B, R. I have also made a confusion matrix after making the neural network.
Actual class array:
sample_test = array([0, 1, 0, 2, 0, 2, 1, 1, 0, 1, 1, 1], dtype=int64)
Predicted class array:
yp = array([0, 1, 0, 2, 0, 2, 0, 1, 0, 1, 1, 1], dtype=int64)
Code for confusion matrix:
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
class_names = ['B','R','L']
def plot_confusion_matrix(y_true, y_pred, classes,
normalize=False,
title=None,
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if not title:
if normalize:
title = 'Normalized confusion matrix'
else:
title = 'Confusion matrix, without normalization'
# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred)
# Only use the labels that appear in the data
classes = [0, 1, 2]
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes, yticklabels=classes,
title=title,
ylabel='True label',
xlabel='Predicted label')
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(j, i, format(cm[i, j], fmt),
ha="center", va="center",
color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
return ax
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plot_confusion_matrix(sample_test, yp, classes=class_names,
title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plot_confusion_matrix(sample_test, yp, classes=class_names , normalize=True,
title='Normalized confusion matrix')
plt.show()
Output:
Now I want to plot ROC curve for this and calculate MAUC. I saw the documentation but cant understand properly what to do.
I will be very grateful, if anyone can help me by giving some suggestions how to do that. Thanks in advance.
The ROC is calculated per class - treat each class as the "positive" class and the other classes as the "negative" classes. Note - first you will have to use predict_proba() - to get the predicted probability per class. Something like this:
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
iris = sns.load_dataset('iris')
X = iris.drop('species',axis=1)
y = iris['species']
X_train, X_test, y_train, y_test = train_test_split(X,y)
le = preprocessing.LabelEncoder()
le.fit(y_train)
le.transform(y_train)
model = DecisionTreeClassifier(max_depth=1)
model.fit(X_train,le.transform(y_train))
predictions =pd.DataFrame(model.predict_proba(X_test),columns=list(le.inverse_transform(model.classes_)))
print(roc_auc_score((y_test == 'versicolor').astype(float), predictions['versicolor']))
Regarding your second question to calculate the Multiclass AUC, there are open source implementations for it. Specifically, such solutions implement equations 3 and 7 of Hand and Till 2001 original paper.
Take a look at this solution:
import numpy as np
import itertools
# pairwise class AUC
def a_value(y_true, y_pred_prob, zero_label=0, one_label=1):
"""
Approximates the AUC by the method described in Hand and Till 2001,
equation 3.
NB: The class labels should be in the set [0,n-1] where n = # of classes.
The class probability should be at the index of its label in the predicted
probability list.
Args:
y_true: actual labels of test data
y_pred_prob: predicted class probability
zero_label: label for positive class
one_label: label for negative class
Returns:
The A-value as a floating point.
"""
idx = np.isin(y_true, [zero_label, one_label])
labels = y_true[idx]
prob = y_pred_prob[idx, zero_label]
sorted_ranks = labels[np.argsort(prob)]
n0, n1, sum_ranks = 0, 0, 0
n0 = np.count_nonzero(sorted_ranks==zero_label)
n1 = np.count_nonzero(sorted_ranks==one_label)
sum_ranks = np.sum(np.where(sorted_ranks==zero_label)) + n0
return (sum_ranks - (n0*(n0+1)/2.0)) / float(n0 * n1) # Eqn 3 of the original paper
def MAUC(y_true, y_pred_prob, num_classes):
"""
Calculates the MAUC over a set of multi-class probabilities and
their labels. This is equation 7 in Hand and Till's 2001 paper.
NB: The class labels should be in the set [0,n-1] where n = # of classes.
The class probability should be at the index of its label in the
probability list.
Args:
y_true: actual labels of test data
y_pred_prob: predicted class probability
zero_label: label for positive class
one_label: label for negative class
num_classes (int): The number of classes in the dataset.
Returns:
The MAUC as a floating point value.
"""
# Find all pairwise comparisons of labels
class_pairs = [x for x in itertools.combinations(range(num_classes), 2)]
# Have to take average of A value with both classes acting as label 0 as this
# gives different outputs for more than 2 classes
sum_avals = 0
for pairing in class_pairs:
sum_avals += (a_value(y_true, y_pred_prob, zero_label=pairing[0], one_label=pairing[1]) +
a_value(y_true, y_pred_prob, zero_label=pairing[1], one_label=pairing[0])) / 2.0
return sum_avals * (2 / float(num_classes * (num_classes-1))) # Eqn 7 of the original paper
Credit: Pritom Saha Akash
I have gone through the documents about n_features and centers parameters in make_blobs function in SciKit. However, every explanation I've seen doesn't sound so clear to me since I am new to SciKit and Mathematics. I am wondering what do these two parameters: n_features, centers do in make_blobs function as below.
make_blobs(n_samples=50, n_features=2, centers=2, random_state=75)
Thank you in advance.
The make_blobs function is a part of sklearn.datasets.samples_generator. All methods in the package, help us to generate data samples or datasets. In machine learning, which scikit-learn all about, datasets are used to evaluate performance of machine learning models. This is an example on how to evaluate a KNN classifier:
from sklearn.datasets.samples_generator import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X, y = make_blobs(n_features=2, centers=3)
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred) * 100
print('accuracy: {}%'.format(acc))
Now, as you mentioned, n_features determined how many columns or features the generated datasets will have. In machine learning, features correspond to numerical characteristics data. For example, in Iris Dataset, there are 4 features (Sepal Length, Sepal Width, Petal Length and Petal Width) so there are 4 numerical columns in the dataset. So by increasing n_features in make_blobs, we are adding more features hence increase the complexity of generated dataset.
As for the centers, it is easier to understand by visualizing the generated dataset. I use matplotlib to help us on that:
from sklearn.datasets.samples_generator import make_blobs
import matplot
# plot 1
X, y = make_blobs(n_features=2, centers=1)
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.savefig('centers_1.png')
plt.title('centers = 1')
# plot 2
X, y = make_blobs(n_features=2, centers=2)
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.title('centers = 2')
# plot 3
X, y = make_blobs(n_features=2, centers=3)
plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.title('centers = 3')
plt.show()
If you run the code above you can easily see that centers corresponds to number of classes generated in the data. It uses centers as a term because samples that belong to same class, tend to gather close to a center (coordinate).
I have been trying to evaluate the performance of my one-class SVM. I have tried plotting an ROC curve using scikit-learn, and the results have been a bit bizarre.
X_train, X_test = train_test_split(compressed_dataset,test_size = 0.5,random_state = 42)
clf = OneClassSVM(nu=0.1,kernel = "rbf", gamma =0.1)
y_score = clf.fit(X_train).decision_function(X_test)
pred = clf.predict(X_train)
fpr,tpr,thresholds = roc_curve(pred,y_score)
# Plotting roc curve
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
The ROC curve I get:
Can somebody help me out with this?
What is bizzare about this plot? You fixed a single set of nu and gamma, thus your model is neither over- nor underfitting. Moving threshold (which is a ROC variable) does not lead to 100% TPR. Try out high gamma and very small nu (which upper bounds the training errors) and you will get more "typical" plots.
In my opinon, get the scores:
pred_scores = clf.score_samples(X_train)
Then, the pred_scores need to be min-max normalized before min-max normalize