Confidence Interval from RandomForestRegressor in scikit-learn - python-3.x

scikit-learn has a quantile-regression-based confidence interval implementation for GBM (example from the docs).
Is there a reason why it doesn't provide a similar quantile based loss implementation for RandomForestRegressor?

There is a scikit-learn-compatible Quantile Regression Forest implementation that can be used to generate confidence intervals here: https://github.com/zillow/quantile-forest
Setup should be as easy as:
pip install quantile-forest
Then, as an example, to generate CIs on a full dataset:
import matplotlib.pyplot as plt
import numpy as np
from quantile_forest import RandomForestQuantileRegressor
from sklearn import datasets
from sklearn.model_selection import KFold
X, y = datasets.fetch_california_housing(return_X_y=True)
qrf = RandomForestQuantileRegressor(n_estimators=100, random_state=0)
kf = KFold(n_splits=5)
kf.get_n_splits(X)
y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []
for train_index, test_index in kf.split(X):
X_train, X_test, y_train, y_test = (
X[train_index], X[test_index], y[train_index], y[test_index]
)
qrf.set_params(max_features=X_train.shape[1] // 3)
qrf.fit(X_train, y_train)
# Get predictions at 95% prediction intervals and median.
y_pred_i = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])
y_true = np.concatenate((y_true, y_test))
y_pred = np.concatenate((y_pred, y_pred_i[:, 1]))
y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[:, 0]))
y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[:, 2]))
fig = plt.figure(figsize=(10, 4))
y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true = y_true[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]
# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
y_true -= mean
y_pred_lower -= mean
y_pred_upper -= mean
plt.plot(y_true, marker=".", ms=5, c="r", lw=0)
plt.fill_between(
np.arange(len(y_pred_upper)),
y_pred_lower,
y_pred_upper,
alpha=0.2,
color="gray",
)
plt.plot(np.arange(len(y)), y_pred_lower, marker="_", c="0.2", lw=0)
plt.plot(np.arange(len(y)), y_pred_upper, marker="_", c="0.2", lw=0)
plt.xlim([0, len(y)])
plt.xlabel("Ordered Samples")
plt.ylabel("Observed Values and Prediction Intervals (Centered)")
plt.show()

There seems to be a contributed scikit-learn package for this (the example below is copied from there, for RandomForestRegressor).
I had to install the development version in order to have the correct import path for the current scikit-learn:
pip install git+git://github.com/scikit-learn-contrib/forest-confidence-interval.git
https://github.com/scikit-learn-contrib/forest-confidence-interval
Example (copy pasted from the link above):
# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets import fetch_openml
import forestci as fci
# retrieve mpg data from machine learning library
mpg_data = fetch_openml('autompg')
# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]
# remove rows where the data is nan
not_null_sel = np.invert(
np.sum(np.isnan(mpg_data["data"]), axis=1).astype(bool))
mpg_X = mpg_X[not_null_sel]
mpg_y = mpg_y[not_null_sel]
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(mpg_X, mpg_y,
test_size=0.25,
random_state=42)
# Create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
mpg_X_test)
# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()

Related

KNN Python implementation

This is what is shown when I try running my code:
FutureWarning: Unlike other reduction functions (e.g. skew, kurtosis), the default behavior of mode typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of keepdims will become False, the axis over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set keepdims to True or False to avoid this warning.
lab = mode(labels)
This is my Python code; I am having difficulty finding a suitable solution:
# Importing the required modules
import numpy as np
from scipy.stats import mode
# Euclidean Distance
def eucledian(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    p1, p2 : array-like
        Coordinates of the two points. They are converted with
        ``np.asarray`` so plain lists/tuples work as well as arrays, and
        they must be broadcastable against each other.

    Returns
    -------
    numpy scalar
        ``sqrt(sum((p1 - p2) ** 2))``.
    """
    diff = np.asarray(p1) - np.asarray(p2)
    dist = np.sqrt(np.sum(diff ** 2))
    return dist
# Function to calculate KNN
def predict(x_train, y, x_input, k):
    """Classify each query point by majority vote of its k nearest neighbours.

    Parameters
    ----------
    x_train : array-like, shape (n_train, n_features)
        Training feature matrix.
    y : array-like, shape (n_train,)
        Label of each training row.
    x_input : iterable of array-like, each of shape (n_features,)
        Points to classify.
    k : int
        Number of nearest neighbours that vote; must be at least 1.

    Returns
    -------
    list
        One predicted label per item of ``x_input``, in the same order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    x_train = np.asarray(x_train)
    y = np.asarray(y)
    op_labels = []

    # Loop through the datapoints to be classified
    for item in x_input:
        # Euclidean distance from this point to every training row at once
        point_dist = np.sqrt(np.sum((x_train - np.asarray(item)) ** 2, axis=1))

        # Indices of the k closest training rows
        nearest = np.argsort(point_dist)[:k]

        # Labels of the k datapoints from above
        labels = y[nearest]

        # Majority voting. np.unique is used instead of scipy.stats.mode:
        # from SciPy 1.11 on, mode() defaults to keepdims=False, so the old
        # ``mode(labels).mode[0]`` indexing no longer works. np.unique
        # returns sorted values, so ties resolve to the smallest label,
        # which is the same tie-break scipy.stats.mode uses.
        values, counts = np.unique(labels, return_counts=True)
        op_labels.append(values[np.argmax(counts)])

    return op_labels
# Importing the required modules
# Importing required modules
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from numpy.random import randint
# Loading the Data
iris= load_iris()
# Store features matrix in X
X= iris.data
# Store target vector in
y = iris.target
# Creating the training Data
train_idx = xxx = randint(0, 150, 100)
X_train = X[train_idx]
y_train = y[train_idx]
# Creating the testing Data
test_idx = xxx = randint(0, 150, 50) # taking 50 random samples
X_test = X[test_idx]
y_test = y[test_idx]
# Applying our function
y_pred = predict(X_train, y_train, X_test, 7)
# Checking the accuracy
accuracy_score(y_test, y_pred)
I am expecting the prediction accuracy to be shown as the output.
KNN can be done like this.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Assign colum names to the dataset
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
# Read dataset to pandas dataframe
dataset = pd.read_csv(url, names=names)
dataset.head()
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Result:
precision recall f1-score support
Iris-setosa 1.00 1.00 1.00 13
Iris-versicolor 1.00 0.89 0.94 9
Iris-virginica 0.89 1.00 0.94 8
accuracy 0.97 30
macro avg 0.96 0.96 0.96 30
weighted avg 0.97 0.97 0.97 30
error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(X_train, y_train)
pred_i = knn.predict(X_test)
error.append(np.mean(pred_i != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

Residual plot for MultiOutputRegressor with yellowbrick

I am dealing with a multi-output regression problem and applied "MultiOutputRegressor" accompanied by "XGBRegressor" algorithms on the corresponding data.
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
# Create a random dataset
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
y = np.array([np.pi * np.sin(X).ravel(), np.pi *
np.cos(X).ravel()]).T
y += 0.5 - rng.rand(*y.shape)
X_train, X_test, y_train, y_test = train_test_split(
X, y, train_size=400, test_size=200, random_state=4)
regr_multi = MultiOutputRegressor(XGBRegressor())
regr_multi.fit(X_train, y_train)
y_pred = regr_multi.predict(X_test)
What I would like to visualize is the residuals of the model predictions, using ResidualsPlot from the yellowbrick package.
When I use the following code
from yellowbrick.regressor import ResidualsPlot
vis = ResidualsPlot(regr_multi)
vis.fit(X_train, y_train)
vis.score(X_test, y_test)
vis.show()
I got the following error: The 'color' keyword argument must have one color per dataset, but 2 datasets and 1 colors were provided.
I was wondering whether a multi-output residual plot is supported by yellowbrick, or whether this is just an error that can be solved easily.

Is it normal for the r2 score to drop after the crossvalidation process? (Linear regression model)

Car price prediction code:
# importing pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score,mean_squared_error,mean_squared_log_error,make_scorer
import warnings
import os
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
is_file=os.path.isfile('C:/Users/Desktop/car_price_prediction/audi-bmw-toyota.csv')
if is_file==False:
# merging two csv files
df = pd.concat(map(pd.read_csv, ['audi.csv', 'bmw.csv','toyota.csv']), ignore_index=True)
df.to_csv("audi-bmw-toyota.csv", index=False)
print('csv files have merged...')
else:
print("The file already exists.")
# loading the dataset to a pandas DataFrame
dataset = pd.read_csv('C:/Users/Desktop/car_price_prediction/audi-bmw-toyota.csv')
print('transmission_value-counts:\n',dataset['transmission'].value_counts()) #4 type
print('fueltype_value-counts:\n',dataset['fuelType'].value_counts()) #5 type
print("Manual transmissioned cars:\n", dataset.loc[dataset['transmission'] == 'Manual'])
print("Hybrid fuel typed cars:\n", dataset.loc[dataset['fuelType'] == 'Hybrid'])
print("Other fuel typed cars:\n", dataset.loc[dataset['fuelType'] == 'Other'])
dataset.info()
# correlation
df_corr = dataset.corr()
df_corr.sort_values('price',inplace=True)
fig=df_corr[['price']].plot(kind='barh',color="r",figsize=(5, 5))
fig.set_xlabel('correlation')
fig.set_title("Price and Variables's Correlation")
X=dataset.iloc[:,[0,1,3,4,5,6,7,8]]
Y=dataset.iloc[:,2]
print("X:\n",X)
print("Y:\n",Y)
#data preprocessing
X["model"]=X["model"].str.replace(' ','')
print(X["model"])
le1=LabelEncoder() #model column has so many different groups. That's why LabelEncoder is useful.
X_0=le1.fit_transform(X.iloc[:,0])
X.loc[:, 0] =X_0
X["model"] = pd.DataFrame(X_0, columns=['model'])
#categorical variables
df_transmission = pd.get_dummies(dataset["transmission"]
,prefix = "transmission"
,drop_first = True) # Preventing Multicollinearity
X1 = pd.concat([X, df_transmission[['transmission_Manual', 'transmission_Other', 'transmission_Semi-Auto']]], axis=1)
df_fuelType = pd.get_dummies(dataset["fuelType"]
,prefix = "fuelType"
,drop_first = True) # Preventing Multicollinearity
X2 = pd.concat([X1, df_fuelType[['fuelType_Electric', 'fuelType_Hybrid', 'fuelType_Other','fuelType_Petrol']]], axis=1)
X3 = X2.drop(['transmission', 'fuelType',0], axis=1)
print("X3:\n",X3)
#Feature Scaling
sclr=StandardScaler()
X3=sclr.fit_transform(X3)
print("X3:\n",X3)
#training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X3,
Y,
test_size=0.2,
random_state=42)
rf=RandomForestRegressor(random_state=42)
model = rf.fit(X_train, Y_train)
y_pred=model.predict(X_test)
print("r2_score_test:",r2_score(Y_test,y_pred))
RMSE_test=np.sqrt(mean_squared_error(Y_test,model.predict(X_test)))
print("RMSE:",RMSE_test)
Cross validation and hyperparameter optimization
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, n_jobs = -1)
rf_random.fit(X_train, Y_train)
print("best_params",rf_random.best_params_)
base_model = RandomForestRegressor(n_estimators = 2000,min_samples_split=5,
min_samples_leaf=1,
max_features='sqrt',
max_depth=30,
bootstrap=True,
random_state = 42).fit(X_train, Y_train)
y_pred_base=base_model.predict(X_test)
print("r2_score_test:",r2_score(Y_test,y_pred_base))
RMSE_test_based_model=np.sqrt(mean_squared_error(Y_test,base_model.predict(X_test)))
print("RMSE_based:",RMSE_test_based_model)
the link of dataset: https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes
Hello friends, I have combined the audi.csv, bmw.csv, and toyota.csv files to obtain a new CSV file. In the car price estimation algorithm, the cross-validated test RMSE value is greater than the (non-validated) test RMSE value. Also, the r2_score drops a bit after cross-validation. Is this normal, or what exactly am I doing wrong?
Is the problem related to the regression model?
Before cross validation:
r2_score_test: 0.961865129046153
RMSE: 2293.040184587231
After cross validation:
r2_score_test: 0.9604039571043385
RMSE_based: 2336.5572047970254
fixes:
X=dataset.iloc[:,[1,3,4,5,6,7,8]] #removing of car model column
Y=dataset.iloc[:,2]
#data preprocessing
X['no_year'] = (2022) -X['year']
X.drop(['year'],axis = 1,inplace=True)
print('X:\n',X)
Before cross validation:
r2_score_test: 0.941560662538529
RMSE_test: 2838.5932576738546
After cross validation:
r2_score_based: 0.9603626850597086
RMSE_based: 2337.7746165658878

SVR hyperparameter selection and visualisation

I am just a beginner in data analysis. I want to use the "cross-validation grid search" method to determine the parameters gamma and C of the Radial Basis Function (RBF) kernel SVM. I don't know where I should put my data in this code, or which data I should use (training or target data).
For SVR
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error,explained_variance_score
from TwoStageTrAdaBoostR2 import TwoStageTrAdaBoostR2 # import the two-stage algorithm
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from matplotlib.colors import Normalize
from sklearn.svm import SVC
# Data import (source)
source= pd.read_csv(sourcedata)
# Data import (target)
data= pd.read_csv(targetdata)
# Sample Size
datatrain = data.sample(n=60, random_state=1)
datatest = data[~dataL.index.isin(data.index)]
# Merge training set data (source and target)
train = pd.concat([source, datatrain], sort=False)
train.reset_index(inplace=True, drop=True)
datatest.reset_index(inplace=True, drop=True)
# Variable input
X_train, y_train = train[['x1', 'x2']].values, train['y'].values
X_test, y_test = FL[['x1', 'x2']].values, FL['y'].values
# Parameter setting
#sample_size = [n_source1+n_source2+n_source3+n_source4+n_source5, n_target_train]
n_estimators = 100
steps = 8
fold = 5
random_state = np.random.RandomState(1)
sample_size = [350, 60]
#1 twostage tradaboost.r2
regr_1 = TwoStageTrAdaBoostR2(SVR(C=50, gamma='auto'),
n_estimators = n_estimators, sample_size = sample_size,
steps = steps, fold = fold,
random_state = random_state)
regr_1.fit(X_train, y_train)
y_pred1 = regr_1.predict(X_test)
print("MSE of regular two stage trAdaboostR2--model1:",sqrt(mean_squared_error(y_test, y_pred1)))
#Plot the results
plt.figure()
plt.scatter(y_test, y_test-y_pred1, c="black", label="TwoStageTrAdaBoostR2_model1", s=10)
plt.xlabel("CAR")
plt.ylabel("Err")
plt.title("Two-stage Transfer Learning Boosted Decision Tree Regression", loc='left', fontsize=12, fontweight=0, color="orange")
plt.legend()
plt.show()
for cross-validation grid search methods(best parameters):
# Cross validation grid search (best parameters)
parameter_candidates = [
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear']},
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters, c=5 ,n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
For visualization of parameter effects
c_range = np.logspace(-2, 2, 4)
gamma_range = np.logspace(-2, 2, 5)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=2,n_jobs=-1,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for linear kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
class MidpointNormalize(Normalize):
    """Normalize subclass that maps a chosen midpoint to 0.5.

    Values are mapped piecewise-linearly: ``vmin`` -> 0, ``midpoint`` -> 0.5
    and ``vmax`` -> 1. Values outside ``[vmin, vmax]`` are clamped to 0 or 1,
    because ``np.interp`` clamps to its end points.

    Parameters
    ----------
    vmin, vmax : float, optional
        Data values mapped to 0 and 1; passed on to ``Normalize``.
    midpoint : float, optional
        Data value mapped to 0.5. It is handed straight to ``np.interp``,
        so it should be set (and lie between ``vmin`` and ``vmax``) before
        the instance is called.
    clip : bool, optional
        Passed on to ``Normalize``.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # ``clip`` is accepted to match the Normalize call signature but is
        # not used here.
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(c_range)), c_range)
plt.title('Validation accuracy')
plt.show()
When I used this code, I found the following output Heatmap plot!
But I am trying to get a Heatmap like this one
The following code with some typical regression data should work all the way through:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV,train_test_split
from matplotlib.colors import Normalize
class MidpointNormalize(Normalize):
    """Colour normalisation with an explicit midpoint.

    Interpolates linearly through the three anchor points
    ``(vmin, 0)``, ``(midpoint, 0.5)`` and ``(vmax, 1)``. Inputs below
    ``vmin`` or above ``vmax`` come out as 0 or 1 respectively, since
    ``np.interp`` clamps at its end points.

    Parameters
    ----------
    vmin, vmax : float, optional
        Lower and upper data limits, forwarded to ``Normalize``.
    midpoint : float, optional
        Data value that should land at 0.5. It is used directly as an
        ``np.interp`` anchor, so set it before calling the instance.
    clip : bool, optional
        Forwarded to ``Normalize``.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # The ``clip`` argument exists only for signature compatibility with
        # Normalize.__call__; this implementation does not read it.
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, anchors, targets))
X, y = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
# Cross validation grid search (best parameters)
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=20,n_jobs=-4,cv=4,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for linear kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=-.2, midpoint=0.5))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)),
[np.format_float_scientific(i,1) for i in gamma_range],rotation=45)
plt.yticks(np.arange(len(c_range)),
[np.format_float_scientific(i,) for i in c_range])
plt.title('Validation accuracy')
plt.show()
The granularity of the grid is very low, but it takes some time to run otherwise. Also, the limits of the grid will need to be more educated than the ones I chose.
I'm not sure why you get the error you get, but I kept things simple and instantiated the SVR once in my snippet so you can see how it works. I've also used different lengths for the C and gamma arrays; that's just to show how these parameters are carried through. Sometimes I find that if everything has the same length, it is difficult to see which parameter is responsible for what.
The final plot looks like that but this depends heavily on the range of the grid, its granularity and the dataset that you are working with. Also note that I change the parameters of the MidpointNormalize class you provided.

ValueError: Can only tuple-index with a MultiIndex

For a multilabel classification problem I am trying to plot the precision-recall curve.
The sample code is taken from "https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py" under section Create multi-label data, fit, and predict.
I am trying to fit it into my code, but I get the error "ValueError: Can only tuple-index with a MultiIndex" when I try the code below.
train_df.columns.values
array(['DefId', 'DefectCount', 'SprintNo', 'ReqName', 'AreaChange',
'CodeChange', 'TestSuite'], dtype=object)
Test Suite is the value to be predicted
X_train = train_df.drop("TestSuite", axis=1)
Y_train = train_df["TestSuite"]
X_test = test_df.drop("DefId", axis=1).copy()
classes --> I have hardcoded these with the TestSuite values
from sklearn.preprocessing import label_binarize
# Use label_binarize to be multi-label like settings
Y = label_binarize(Y_train, classes=np.array([0, 1, 2,3,4])
n_classes = Y.shape[1]
# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier
# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=3))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import pandas as pd
# For each class
precision = dict()
recall = dict()
average_precision = dict()
#n_classes = Y.shape[1]
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(Y_train[:, i], y_score[:, i])
average_precision[i] = average_precision_score(Y_train[:, i], y_score[:, i])
Input Data -> Values has been categorised

Resources