SVR hyperparameter selection and visualisation - scikit-learn

I am just a beginner in data analysis. I want to use 'Cross-validation Grid Search method" to determine the parameters gamma and C of the Radial Basis Function (RBF) kernel SVM. I don't know where I should put my data on this code, and what data type I should use (training or target data)?
For SVR
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error,explained_variance_score
from TwoStageTrAdaBoostR2 import TwoStageTrAdaBoostR2 # import the two-stage algorithm
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from matplotlib.colors import Normalize
from sklearn.svm import SVC
# Data import (source)
source= pd.read_csv(sourcedata)
# Data import (target)
data= pd.read_csv(targetdata)
# Sample Size
datatrain = data.sample(n=60, random_state=1)
datatest = data[~dataL.index.isin(data.index)]
# Merge training set data (source and target)
train = pd.concat([source, datatrain], sort=False)
train.reset_index(inplace=True, drop=True)
datatest.reset_index(inplace=True, drop=True)
# Variable input
X_train, y_train = train[['x1', 'x2']].values, train['y'].values
X_test, y_test = FL[['x1', 'x2']].values, FL['y'].values
# Parameter setting
#sample_size = [n_source1+n_source2+n_source3+n_source4+n_source5, n_target_train]
n_estimators = 100
steps = 8
fold = 5
random_state = np.random.RandomState(1)
sample_size = [350, 60]
#1 twostage tradaboost.r2
regr_1 = TwoStageTrAdaBoostR2(SVR(C=50, gamma='auto'),
n_estimators = n_estimators, sample_size = sample_size,
steps = steps, fold = fold,
random_state = random_state)
regr_1.fit(X_train, y_train)
y_pred1 = regr_1.predict(X_test)
print("MSE of regular two stage trAdaboostR2--model1:",sqrt(mean_squared_error(y_test, y_pred1)))
#Plot the results
plt.figure()
plt.scatter(y_test, y_test-y_pred1, c="black", label="TwoStageTrAdaBoostR2_model1", s=10)
plt.xlabel("CAR")
plt.ylabel("Err")
plt.title("Two-stage Transfer Learning Boosted Decision Tree Regression", loc='left', fontsize=12, fontweight=0, color="orange")
plt.legend()
plt.show()
for cross-validation grid search methods(best parameters):
# Cross validation grid search (best parameters)
parameter_candidates = [
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear']},
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters, c=5 ,n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
For visualization of parameter effects
c_range = np.logspace(-2, 2, 4)
gamma_range = np.logspace(-2, 2, 5)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=2,n_jobs=-1,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for rbf kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
class MidpointNormalize(Normalize):
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return np.ma.masked_array(np.interp(value, x, y))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(c_range)), c_range)
plt.title('Validation accuracy')
plt.show()
When I used this code, I found the following output Heatmap plot!
But I am trying to get a Heatmap like this one

The following code with some typical regression data should work all the way through:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV,train_test_split
from matplotlib.colors import Normalize
class MidpointNormalize(Normalize):
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return np.ma.masked_array(np.interp(value, x, y))
X, y = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
# Cross validation grid search (best parameters)
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=20,n_jobs=-4,cv=4,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for rbf kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=-.2, midpoint=0.5))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)),
[np.format_float_scientific(i,1) for i in gamma_range],rotation=45)
plt.yticks(np.arange(len(c_range)),
[np.format_float_scientific(i,) for i in c_range])
plt.title('Validation accuracy')
plt.show()
The granularity of the grid is very low but it takes some time run otherwise. Also the limits of the grid will need to be more educated that the ones I chose.
I'm not sure why you get the error you get but I kept things simple and initiated the SVR once in my snippet so you can see how it works. I've also used different lengths for the C and gamma arrays that's just to show how these parameters are carried through. Sometimes I find that if everything has the same length is difficult to see which parameter is responsible for what.
The final plot looks like that but this depends heavily on the range of the grid, its granularity and the dataset that you are working with. Also note that I change the parameters of the MidpointNormalize class you provided.

Related

Is it normal for the r2 score to drop after the crossvalidation process? (Linear regression model)

Car price prediction code:
# importing pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score,mean_squared_error,mean_squared_log_error,make_scorer
import warnings
import os
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
is_file=os.path.isfile('C:/Users/Desktop/car_price_prediction/audi-bmw-toyota.csv')
if is_file==False:
# merging two csv files
df = pd.concat(map(pd.read_csv, ['audi.csv', 'bmw.csv','toyota.csv']), ignore_index=True)
df.to_csv("audi-bmw-toyota.csv", index=False)
print('csv files have merged...')
else:
print("The file already exists.")
# loading the dataset to a pandas DataFrame
dataset = pd.read_csv('C:/Users/Desktop/car_price_prediction/audi-bmw-toyota.csv')
print('transmission_value-counts:\n',dataset['transmission'].value_counts()) #4 type
print('fueltype_value-counts:\n',dataset['fuelType'].value_counts()) #5 type
print("Manual transmissioned cars:\n", dataset.loc[dataset['transmission'] == 'Manual'])
print("Hybrid fuel typed cars:\n", dataset.loc[dataset['fuelType'] == 'Hybrid'])
print("Other fuel typed cars:\n", dataset.loc[dataset['fuelType'] == 'Other'])
dataset.info()
# correlation
df_corr = dataset.corr()
df_corr.sort_values('price',inplace=True)
fig=df_corr[['price']].plot(kind='barh',color="r",figsize=(5, 5))
fig.set_xlabel('correlation')
fig.set_title("Price and Variables's Correlation")
X=dataset.iloc[:,[0,1,3,4,5,6,7,8]]
Y=dataset.iloc[:,2]
print("X:\n",X)
print("Y:\n",Y)
#data preprocessing
X["model"]=X["model"].str.replace(' ','')
print(X["model"])
le1=LabelEncoder() #model column has so much diffrent group. That's wyh LabelEncoder is useful.
X_0=le1.fit_transform(X.iloc[:,0])
X.loc[:, 0] =X_0
X["model"] = pd.DataFrame(X_0, columns=['model'])
#categorical variables
df_transmission = pd.get_dummies(dataset["transmission"]
,prefix = "transmission"
,drop_first = True) # Preventing Multicollinearity
X1 = pd.concat([X, df_transmission[['transmission_Manual', 'transmission_Other', 'transmission_Semi-Auto']]], axis=1)
df_fuelType = pd.get_dummies(dataset["fuelType"]
,prefix = "fuelType"
,drop_first = True) # Preventing Multicollinearity
X2 = pd.concat([X1, df_fuelType[['fuelType_Electric', 'fuelType_Hybrid', 'fuelType_Other','fuelType_Petrol']]], axis=1)
X3 = X2.drop(['transmission', 'fuelType',0], axis=1)
print("X3:\n",X3)
#Feature Scaling
sclr=StandardScaler()
X3=sclr.fit_transform(X3)
print("X3:\n",X3)
#training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X3,
Y,
test_size=0.2,
random_state=42)
rf=RandomForestRegressor(random_state=42)
model = rf.fit(X_train, Y_train)
y_pred=model.predict(X_test)
print("r2_score_test:",r2_score(Y_test,y_pred))
RMSE_test=np.sqrt(mean_squared_error(Y_test,model.predict(X_test)))
print("RMSE:",RMSE_test)
Cross validation and hyperparameter optimization
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, n_jobs = -1)
rf_random.fit(X_train, Y_train)
print("best_params",rf_random.best_params_)
base_model = RandomForestRegressor(n_estimators = 2000,min_samples_split=5,
min_samples_leaf=1,
max_features='sqrt',
max_depth=30,
bootstrap=True,
random_state = 42).fit(X_train, Y_train)
y_pred_base=base_model.predict(X_test)
print("r2_score_test:",r2_score(Y_test,y_pred_base))
RMSE_test_based_model=np.sqrt(mean_squared_error(Y_test,base_model.predict(X_test)))
print("RMSE_based:",RMSE_test_based_model)
the link of dataset: https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes
Hello friends, I have combined audi.csv, bmw.csv,toyota.csv files to obtain a new CSV file. In the car price estimation algorithm, the test cross-validated RMSE value is greater than the test RMSE (not validated) value. Also, r2_score drops a bit after cross-validation Is this process normal, or what exactly am I doing wrong?
Is the problem related to the regression model?
Before cross validation:
r2_score_test: 0.961865129046153
RMSE: 2293.040184587231
After cross validation:
r2_score_test: 0.9604039571043385
RMSE_based: 2336.5572047970254
fixes:
X=dataset.iloc[:,[1,3,4,5,6,7,8]] #removing of car model column
Y=dataset.iloc[:,2]
#data preprocessing
X['no_year'] = (2022) -X['year']
X.drop(['year'],axis = 1,inplace=True)
print('X:\n',X)
Before cross validation:
r2_score_test: 0.941560662538529
RMSE_test: 2838.5932576738546
After cross validation:
r2_score_based: 0.9603626850597086
RMSE_based: 2337.7746165658878

Confidence Interval from RandomForestRegressor in scikit-learn

scikit-learn has a quantile regression based confidence interval implementation for GBM (example form the docs).
Is there a reason why it doesn't provide a similar quantile based loss implementation for RandomForestRegressor?
There is an scikit-learn compatible/compliant Quantile Regression Forest implementation that can be used to generate confidence intervals here: https://github.com/zillow/quantile-forest
Setup should be as easy as:
pip install quantile-forest
Then, as an example, to generate CIs on a full dataset:
import matplotlib.pyplot as plt
import numpy as np
from quantile_forest import RandomForestQuantileRegressor
from sklearn import datasets
from sklearn.model_selection import KFold
X, y = datasets.fetch_california_housing(return_X_y=True)
qrf = RandomForestQuantileRegressor(n_estimators=100, random_state=0)
kf = KFold(n_splits=5)
kf.get_n_splits(X)
y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []
for train_index, test_index in kf.split(X):
X_train, X_test, y_train, y_test = (
X[train_index], X[test_index], y[train_index], y[test_index]
)
qrf.set_params(max_features=X_train.shape[1] // 3)
qrf.fit(X_train, y_train)
# Get predictions at 95% prediction intervals and median.
y_pred_i = qrf.predict(X_test, quantiles=[0.025, 0.5, 0.975])
y_true = np.concatenate((y_true, y_test))
y_pred = np.concatenate((y_pred, y_pred_i[:, 1]))
y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[:, 0]))
y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[:, 2]))
fig = plt.figure(figsize=(10, 4))
y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true = y_true[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]
# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
y_true -= mean
y_pred_lower -= mean
y_pred_upper -= mean
plt.plot(y_true, marker=".", ms=5, c="r", lw=0)
plt.fill_between(
np.arange(len(y_pred_upper)),
y_pred_lower,
y_pred_upper,
alpha=0.2,
color="gray",
)
plt.plot(np.arange(len(y)), y_pred_lower, marker="_", c="0.2", lw=0)
plt.plot(np.arange(len(y)), y_pred_upper, marker="_", c="0.2", lw=0)
plt.xlim([0, len(y)])
plt.xlabel("Ordered Samples")
plt.ylabel("Observed Values and Prediction Intervals (Centered)")
plt.show()
There seems to be contributed scikit-learn package (example copy pasted from there for RandomForestRegressor)
I had to install development version in order to have correct path to current scikit-learn by:
pip install git+git://github.com/scikit-learn-contrib/forest-confidence-interval.git
https://github.com/scikit-learn-contrib/forest-confidence-interval
Example (copy pasted from the link above):
# Regression Forest Example
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import sklearn.model_selection as xval
from sklearn.datasets import fetch_openml
import forestci as fci
# retreive mpg data from machine learning library
mpg_data = fetch_openml('autompg')
# separate mpg data into predictors and outcome variable
mpg_X = mpg_data["data"]
mpg_y = mpg_data["target"]
# remove rows where the data is nan
not_null_sel = np.invert(
np.sum(np.isnan(mpg_data["data"]), axis=1).astype(bool))
mpg_X = mpg_X[not_null_sel]
mpg_y = mpg_y[not_null_sel]
# split mpg data into training and test set
mpg_X_train, mpg_X_test, mpg_y_train, mpg_y_test = xval.train_test_split(mpg_X, mpg_y,
test_size=0.25,
random_state=42)
# Create RandomForestRegressor
n_trees = 2000
mpg_forest = RandomForestRegressor(n_estimators=n_trees, random_state=42)
mpg_forest.fit(mpg_X_train, mpg_y_train)
mpg_y_hat = mpg_forest.predict(mpg_X_test)
# Plot predicted MPG without error bars
plt.scatter(mpg_y_test, mpg_y_hat)
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()
# Calculate the variance
mpg_V_IJ_unbiased = fci.random_forest_error(mpg_forest, mpg_X_train,
mpg_X_test)
# Plot error bars for predicted MPG using unbiased variance
plt.errorbar(mpg_y_test, mpg_y_hat, yerr=np.sqrt(mpg_V_IJ_unbiased), fmt='o')
plt.plot([5, 45], [5, 45], 'k--')
plt.xlabel('Reported MPG')
plt.ylabel('Predicted MPG')
plt.show()

GridSearchCV with Invalid parameter gamma for estimator LogisticRegression

I need to perform a grid search on the parameters listed below for a Logistic Regression classifier, using recall for scoring and cross-validation three times.
The data is in a csv file (11,1 MB), this link for download is: https://drive.google.com/file/d/1cQFp7HteaaL37CefsbMNuHqPzkINCVzs/view?usp=sharing
I have grid_values = {'gamma':[0.01, 0.1, 1, 10, 100]}
I need to apply penalty L1 e L2 in a Logistic Regression
I couldn't verify if the scores will run because I have the following error:
Invalid parameter gamma for estimator LogisticRegression. Check the list of available parameters with estimator.get_params().keys().
This is my code:
from sklearn.model_selection import train_test_split
df = pd.read_csv('fraud_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
def LogisticR_penalty():
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
grid_values = {'gamma':[0.01, 0.1, 1, 10, 100]}
#train de model with many parameters for "C" and penalty='l1'
lr_l1 = LogisticRegression(penalty='l1')
grid_lr_l1 = GridSearchCV(lr_l1, param_grid = grid_values, cv=3, scoring = 'recall')
grid_lr_l1.fit(X_train, y_train)
y_decision_fn_scores_recall = grid_lr_l1.decision_function(X_test)
lr_l2 = LogisticRegression(penalty='l2')
grid_lr_l2 = GridSearchCV(lr_l2, param_grid = grid_values, cv=3 , scoring = 'recall')
grid_lr_l2.fit(X_train, y_train)
y_decision_fn_scores_recall = grid_lr_l2.decision_function(X_test)
#The precision, recall, and accuracy scores for every combination
#of the parameters in param_grid are stored in cv_results_
results = pd.DataFrame()
results['l1_results'] = pd.DataFrame(grid_lr_l1.cv_results_)
results['l1_results'] = results['l2_results'].sort_values(by='mean_test_precision_score', ascending=False)
results['l2_results'] = pd.DataFrame(grid_lr_l2.cv_results_)
results['l2_results'] = results['l2_results'].sort_values(by='mean_test_precision_score', ascending=False)
return results
LogisticR_penalty()
I expected from .cv_results_, the average test scores of each parameter combination that I should be available here: mean_test_precision_score but not sure
The output is: ValueError: Invalid parameter gamma for estimator LogisticRegression. Check the list of available parameters with estimator.get_params().keys().
The error message contains the answer for your question. You can use the function estimator.get_params().keys() to see all available parameters for you estimator:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
print(lr.get_params().keys())
Output:
dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])
From scikit-learn's documentation, the LogisticRegression has no parameter gamma, but a parameter C for the regularization weight.
If you change grid_values = {'gamma':[0.01, 0.1, 1, 10, 100]} for grid_values = {'C':[0.01, 0.1, 1, 10, 100]} your code should work.
My code contained some errors the main error was using param_grid incorrectly. I had to apply L1 and L2 penalties with gamma 0.01, 0.1, 1, 10, 100. The right way to do this is:
grid_values ​​= {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
Then it was necessary to correct the way I was training my logistic regression and to correct the way I retrieved the scores in cv_results_ and averaged those scores.
Follow my code:
from sklearn.model_selection import train_test_split
df = pd.read_csv('fraud_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
def LogisticR_penalty():
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
grid_values = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
#train de model with many parameters for "C" and penalty='l1'
lr = LogisticRegression()
# We use GridSearchCV to find the value of the range that optimizes a given measurement metric.
grid_lr_recall = GridSearchCV(lr, param_grid = grid_values, cv=3, scoring = 'recall')
grid_lr_recall.fit(X_train, y_train)
y_decision_fn_scores_recall = grid_lr_recall.decision_function(X_test)
##The precision, recall, and accuracy scores for every combination
#of the parameters in param_grid are stored in cv_results_
CVresults = []
CVresults = pd.DataFrame(grid_lr_recall.cv_results_)
#test scores and mean of them
split_test_scores = np.vstack((CVresults['split0_test_score'], CVresults['split1_test_score'], CVresults['split2_test_score']))
mean_scores = split_test_scores.mean(axis=0).reshape(5, 2)
return mean_scores
LogisticR_penalty()

ValueError: Can only tuple-index with a MultiIndex

For a Multilabel Classification problem i am trying to plot precission and recall curve.
The sample code is taken from "https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py" under section Create multi-label data, fit, and predict.
I am trying to fit it in my code but i get below error as "ValueError: Can only tuple-index with a MultiIndex" when i try below code.
train_df.columns.values
array(['DefId', 'DefectCount', 'SprintNo', 'ReqName', 'AreaChange',
'CodeChange', 'TestSuite'], dtype=object)
Test Suite is the value to be predicted
X_train = train_df.drop("TestSuite", axis=1)
Y_train = train_df["TestSuite"]
X_test = test_df.drop("DefId", axis=1).copy()
classes --> i have hardcorded with the testsuite values
from sklearn.preprocessing import label_binarize
# Use label_binarize to be multi-label like settings
Y = label_binarize(Y_train, classes=np.array([0, 1, 2,3,4])
n_classes = Y.shape[1]
# We use OneVsRestClassifier for multi-label prediction
from sklearn.multiclass import OneVsRestClassifier
# Run classifier
classifier = OneVsRestClassifier(svm.LinearSVC(random_state=3))
classifier.fit(X_train, Y_train)
y_score = classifier.decision_function(X_test)
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import pandas as pd
# For each class
precision = dict()
recall = dict()
average_precision = dict()
#n_classes = Y.shape[1]
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(Y_train[:, i], y_score[:, i])
average_precision[i] = average_precision_score(Y_train[:, i], y_score[:, i])
Input Data -> Values has been categorised

How to use .fit when the X value is in time format

Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.2, random_state = 10)
You have to preprocess data before feeding your model. Here is a complete working example. First, let's import the required modules:
from datetime import datetime
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
Then, define the training data:
X = ['17:00','17:05', '17:10', '17:15', '17:20', '17:25']
X = np.array(X).reshape(-1, 1)
y = [1, 0, 1, 1, 0, 1]
Note, the X must be 2D array. Also, you have to convert time string values to the numerical format. One way to do it is to convert strings to timestamp using the builtin datetime module. Here is a function which will be used to transform the data:
def transform(X, y=None):
X_new = np.apply_along_axis(
lambda x: [datetime.strptime(x[0], '%H:%M').timestamp()],
axis=1,
arr=X)
return X_new
Don't forget to scale your data since SVC models require data normalization. One can easily combine all the preprocessing steps using the Pipeline:
pipeline = Pipeline(steps=[
('transformer', FunctionTransformer(transform, validate=False)),
('scaler', MinMaxScaler()),
('predictor', SVC(kernel='linear'))
])
Finally, let's fit the model:
print('Build and fit a model...')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print('Done. Score', score)

Resources