LGBMClassifier roc_auc problem using GridSearchCV and early stopping via BaseEstimator - scikit-learn

I want to integrate an LGBMClassifier into existing code. The code calls fit(X, y), while LGBMClassifier needs fit(X, y, eval_set, callbacks, eval_metric). I'm trying to encapsulate eval_set, callbacks, and eval_metric in a BaseEstimator subclass to expose a uniform API.
Without encapsulation, I got this working:
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from lightgbm import LGBMClassifier, early_stopping

X, y = load_breast_cancer(return_X_y=True)
print(X.shape, y.shape)
# (569, 30) (569,)

pipe = Pipeline([
    ('pca', PCA(5)),
    ('lgbm', LGBMClassifier())
])

param_grid = {
    'pca__n_components': [3, 5],
    'lgbm__learning_rate': [0.05, 0.1]
}

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.1)

lgbm__param_fit = {
    'lgbm__eval_set': [(X_eval, y_eval)],
    'lgbm__callbacks': [early_stopping(50, first_metric_only=True)],
    'lgbm__eval_metric': 'auc',
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(X_train, y_train, **lgbm__param_fit)
print(grid.best_params_)
I tried to encapsulate this logic in the following class:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping

class EarlyStopEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator=LGBMClassifier(), stopping_rounds=50,
                 eval_frac=0.1, eval_metric='auc', **estimator_params):
        self.estimator = estimator
        self.set_params(**estimator_params)
        self.stopping_rounds = stopping_rounds
        self.eval_frac = eval_frac
        self.eval_metric = eval_metric

    def set_params(self, **params):
        self.estimator.set_params(**params)
        return self

    def get_params(self, **params):
        return self.estimator.get_params(**params)

    def fit(self, X, y):
        if self.eval_frac > 0:
            X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=self.eval_frac)
            param_eval = {
                'eval_set': [(X_eval, y_eval)],
                'callbacks': [early_stopping(self.stopping_rounds, first_metric_only=True)],
                'eval_metric': self.eval_metric,
            }
        else:
            X_train, y_train = X, y
            param_eval = {}
        self.estimator.fit(X_train, y_train, **param_eval)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)
which I try to run with:
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold, GridSearchCV

x, y = load_breast_cancer(return_X_y=True)
print(x.shape, y.shape)
# (569, 30) (569,)

pipe = Pipeline([
    ('pca', PCA(5)),
    ('lgbm', EarlyStopEstimator())
])

param_grid = {
    'pca__n_components': [3, 5],
    'lgbm__learning_rate': [0.05, 0.01]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='roc_auc', n_jobs=1, verbose=1, error_score='raise')
grid.fit(x, y)  # <-- that's what I want to do
print(grid.best_params_)
This code works for the first fit, then fails at the second of the 20:
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27] valid_0's auc: 0.99619 valid_0's binary_logloss: 0.233344
Evaluated only: auc
---------------------------------------------------------------------------
AttributeError
'EarlyStopEstimator' object has no attribute 'decision_function'
[...]
During handling of the above exception, another exception occurred:
--> 716 return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'
This code works if I switch scoring='roc_auc' to scoring='neg_mean_absolute_error'. If I try to add this method to the EarlyStopEstimator class:
def decision_function(self, X):
    return self.estimator.decision_function(X)
I get this error instead:
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27] valid_0's auc: 0.99619 valid_0's binary_logloss: 0.233344
Evaluated only: auc
---------------------------------------------------------------------------
AttributeError
'LGBMClassifier' object has no attribute 'decision_function'
[...]
During handling of the above exception, another exception occurred:
--> 716 return self.steps[-1][1].classes_
'EarlyStopEstimator' object has no attribute 'classes_'
What should I change in EarlyStopEstimator so I can use it with grid.fit(x, y)?

You can just set a classes_ attribute on EarlyStopEstimator that delegates to the underlying estimator. Either:
def fit(self, X, y):
    ...
    self.estimator.fit(X_train, y_train, **param_eval)
    self.classes_ = self.estimator.classes_
    return self
or add
@property
def classes_(self):
    return self.estimator.classes_
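Putting the pieces together, here's a minimal sketch of the full class with the first variant applied; apart from the added classes_ assignment in fit, it is unchanged from the question's code:

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier, early_stopping

class EarlyStopEstimator(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator=LGBMClassifier(), stopping_rounds=50,
                 eval_frac=0.1, eval_metric='auc', **estimator_params):
        self.estimator = estimator
        self.set_params(**estimator_params)
        self.stopping_rounds = stopping_rounds
        self.eval_frac = eval_frac
        self.eval_metric = eval_metric

    def set_params(self, **params):
        self.estimator.set_params(**params)
        return self

    def get_params(self, **params):
        return self.estimator.get_params(**params)

    def fit(self, X, y):
        if self.eval_frac > 0:
            # hold out a fraction of this fold's training data for early stopping
            X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=self.eval_frac)
            param_eval = {
                'eval_set': [(X_eval, y_eval)],
                'callbacks': [early_stopping(self.stopping_rounds, first_metric_only=True)],
                'eval_metric': self.eval_metric,
            }
        else:
            X_train, y_train = X, y
            param_eval = {}
        self.estimator.fit(X_train, y_train, **param_eval)
        # expose classes_ so scorers like 'roc_auc' can find it on the wrapper
        self.classes_ = self.estimator.classes_
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)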

Related

List object not callable in SVM

I'm trying to run this SVM using stratified K-fold in Python; however, I keep getting the error below.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, zero_one_loss, confusion_matrix
import pandas as pd
import numpy as np

z = pd.read_csv('/home/User/datasets/gtzan.csv', header=0)
X = z.iloc[:, :-1]
y = z.iloc[:, -1:]
X = np.array(X)
y = np.array(y)

# Performing min-max scaling
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Defining the SVM with 'rbf' kernel
svc = SVC(kernel='rbf', C=100, random_state=50)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, shuffle=True)
skf = StratifiedKFold(n_splits=10, shuffle=True)
accuracy_score = []
#skf.get_n_splits(X, y)

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Training the model
    svc.fit(X_train, np.ravel(y_train))
    # Prediction on the test dataset
    y_pred = svc.predict(X_test)
    # Obtaining the accuracy scores of the model
    score = accuracy_score(y_test, y_pred)
    accuracy_score.append(score)

# Print the accuracy of the svm model
print('accuracy score: %0.3f' % np.mean(accuracy_score))
However, it gives me the following error:
Traceback (most recent call last):
File "/home/User/Test_SVM.py", line 55, in <module>
score = accuracy_score(y_test, y_pred)
TypeError: 'list' object is not callable
What makes this score list uncallable and how do I fix this error?
accuracy_score is a list in my code, and I was also calling that same list as a function, which shadows sklearn's accuracy_score function. Renaming the list to acc_score solved the problem.
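As a sketch, the corrected loop then looks like this (acc_score is just the new list name; everything else matches the question's code):

from sklearn.metrics import accuracy_score  # no longer shadowed by a list

acc_score = []  # renamed from accuracy_score
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    svc.fit(X_train, np.ravel(y_train))
    y_pred = svc.predict(X_test)
    acc_score.append(accuracy_score(y_test, y_pred))  # calls the sklearn function

print('accuracy score: %0.3f' % np.mean(acc_score))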

Saving a polynomial model doesn't save the polynomial degree

How can I deal with the polynomial degree when I want to save a polynomial model, since this info is not being saved?
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.DataFrame({
    "a": np.random.uniform(0.0, 1.0, 1000),
    "b": np.random.uniform(10.0, 14.0, 1000),
    "c": np.random.uniform(100.0, 1000.0, 1000)})

def data():
    X_train, X_val, y_train, y_val = train_test_split(df.iloc[:, :2].values,
                                                      df.iloc[:, 2].values,
                                                      test_size=0.2,
                                                      random_state=1340)
    return X_train, X_val, y_train, y_val

X_train, X_val, y_train, y_val = data()

poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
poly_reg_model = LinearRegression().fit(X_poly, y_train)
poly_model = joblib.dump(poly_reg_model, 'themodel')
y_pred = poly_reg_model.predict(poly_reg.fit_transform(X_val))
themodel = joblib.load('themodel')
Now, if I try to predict:
themodel.predict(X_val), I am receiving:
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 6 is different from 2)
I have to do:
pol_feat = PolynomialFeatures(degree=2)
themodel.predict(pol_feat.fit_transform(X_val))
in order to work.
So, how can I store this info in order to be able to use the model for prediction?
You have to pickle the trained PolynomialFeatures as well:
# train and pickle
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
poly_reg_model = LinearRegression().fit(X_poly, y_train)
joblib.dump(poly_reg_model, 'themodel')
joblib.dump(poly_reg, 'poilynomia_features_model')
# load and predict
poilynomia_features_model = joblib.load('poilynomia_features_model')
themodel = joblib.load('themodel')
X_val_prep = poilynomia_features_model.transform(X_val)
predictions = themodel.predict(X_val_prep)
But it's better to wrap all the steps in a single pipeline:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('poilynomia', PolynomialFeatures()),
                           ('lr', LinearRegression())])
pipeline.fit(X_train, y_train)
pipeline.predict(X_val)
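The fitted pipeline can then be persisted as a single object, so the trained PolynomialFeatures (including its degree) travels with the regressor. A minimal sketch, using joblib as above (the file name 'poly_pipeline.joblib' is just an example):

import joblib

# save the whole fitted pipeline in one file
joblib.dump(pipeline, 'poly_pipeline.joblib')

# later: load it and predict on raw features; the pipeline
# applies the polynomial expansion automatically
loaded_pipeline = joblib.load('poly_pipeline.joblib')
predictions = loaded_pipeline.predict(X_val)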

Problem with distance.euclidean() missing 'u' and 'v'

So, I'm learning machine learning and I wanted to "create" my own classifier, but when I try to run it, it says that distance.euclidean() is missing 'u' and 'v'.
Can you help me find the problem?
import random
from scipy.spatial import distance

def euc(a, b):
    return distance.euclidean()

class KnnSchifoso():
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = []
        for row in X_test:
            label = self.closest(row)
            predictions.append(label)
        return predictions

    def closest(self, row):
        best_dist = euc(row, self.X_train[0])
        best_index = 0
        for i in range(1, len(self.X_train)):
            dist = euc(row, self.X_train[i])
            if dist < best_dist:
                best_dist = dist
                best_index = i
        return self.y_train[best_index]

from sklearn import datasets
iris = datasets.load_iris()
x = iris.data
y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.5)

my_classifier = KnnSchifoso()
my_classifier.fit(X_train, y_train)
predictions = my_classifier.predict(X_test)
print(predictions)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, predictions))
You're not passing the parameters to distance.euclidean:
def euc(a, b):
    return distance.euclidean(a, b)  # <-- you need to pass the values to the function

Feature-selection and prediction

from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
I have X and Y data.
data = load_iris()
X = data.data
Y = data.target
I would like to implement RFECV feature selection and prediction with a k-fold validation approach.
Code corrected from the answer by @vivek-kumar (https://stackoverflow.com/users/3374996/vivek-kumar):
clf = RandomForestClassifier()
kf = KFold(n_splits=2, shuffle=True, random_state=0)

estimators = [('standardize', StandardScaler()),
              ('clf', clf)]

class Mypipeline(Pipeline):
    @property
    def coef_(self):
        return self._final_estimator.coef_
    @property
    def feature_importances_(self):
        return self._final_estimator.feature_importances_

pipeline = Mypipeline(estimators)

rfecv = RFECV(estimator=pipeline, cv=kf, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
print('no. of selected features =', rfecv_data.n_features_)
EDIT (for the remaining small part):
X_new = rfecv.transform(X)
print(X_new.shape)
y_predicts = cross_val_predict(clf, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
print('accuracy =', accuracy)
Instead of wrapping StandardScaler and RFECV in the same pipeline, do that for StandardScaler and RandomForestClassifier, and pass that pipeline to RFECV as the estimator. This way no training info will be leaked.
estimators = [('standardize', StandardScaler()),
              ('clf', RandomForestClassifier())]
pipeline = Pipeline(estimators)

rfecv = RFECV(estimator=pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
Update: about the error 'RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes'
Yes, that's a known issue with scikit-learn's Pipeline. You can look at my other answer here for more details, and use the new pipeline I created there.
Define a custom pipeline like this:
class Mypipeline(Pipeline):
    @property
    def coef_(self):
        return self._final_estimator.coef_
    @property
    def feature_importances_(self):
        return self._final_estimator.feature_importances_
And use that:
pipeline = Mypipeline(estimators)
rfecv = RFECV(estimator=pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
Update 2:
@brute, for your data and code, the algorithm completes within a minute on my PC. This is the complete code I use:
import numpy as np
import glob
from sklearn.utils import resample

files = glob.glob('/home/Downloads/Untitled Folder/*')
outs = []
for fi in files:
    data = np.genfromtxt(fi, delimiter='|', dtype=float)
    data = data[~np.isnan(data).any(axis=1)]
    data = resample(data, replace=False, n_samples=1800, random_state=0)
    outs.append(data)

X = np.vstack(outs)
print(X.shape)
Y = np.repeat([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1800)
print(Y.shape)
#from sklearn.utils import shuffle
#X, Y = shuffle(X, Y, random_state=0)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

clf = RandomForestClassifier()
kf = KFold(n_splits=10, shuffle=True, random_state=0)

estimators = [('standardize', StandardScaler()),
              ('clf', RandomForestClassifier())]

class Mypipeline(Pipeline):
    @property
    def coef_(self):
        return self._final_estimator.coef_
    @property
    def feature_importances_(self):
        return self._final_estimator.feature_importances_

pipeline = Mypipeline(estimators)

rfecv = RFECV(estimator=pipeline, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
print('no. of selected features =', rfecv_data.n_features_)
Update 3: For cross_val_predict
X_new = rfecv.transform(X)
print(X_new.shape)

# Here change clf to pipeline,
# because RFECV has found features according to scaled data,
# which is not present when you pass clf
y_predicts = cross_val_predict(pipeline, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
print('accuracy =', accuracy)
Here's how we'll do it:
Fit on the training set
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
X = data.data
Y = data.target

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True)

# create model
clf = RandomForestClassifier()

# instantiate K-Fold
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# pipeline estimators
estimators = [('standardize', StandardScaler()),
              ('rfecv', RFECV(estimator=clf, cv=kf, scoring='accuracy'))]

# instantiate pipeline
pipeline = Pipeline(estimators)

# fit rfecv to train model
rfecv_model = pipeline.fit(X_train, y_train)

# print number of selected features
print('no. of selected features =', pipeline.named_steps['rfecv'].n_features_)

# print feature ranking
print('ranking =', pipeline.named_steps['rfecv'].ranking_)
'Output':
no. of selected features = 3
ranking = [1 2 1 1]
Predict on the test set
# make predictions on the test set
predictions = rfecv_model.predict(X_test)
# evaluate the model performance using accuracy metric
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
'Output':
Accuracy: 0.9736842105263158

GridSearchCV - XGBoost - Early Stopping

I am trying to do a hyperparameter search using scikit-learn's GridSearchCV on XGBoost. During grid search I'd like it to stop early, since that reduces search time drastically and (I expect) yields better results on my prediction/regression task. I am using XGBoost via its scikit-learn API.
model = xgb.XGBRegressor()
GridSearchCV(model, paramGrid, verbose=verbose,
             fit_params={'early_stopping_rounds': 42},
             cv=TimeSeriesSplit(n_splits=cv).get_n_splits([trainX, trainY]),
             n_jobs=n_jobs, iid=iid).fit(trainX, trainY)
I tried to pass the early stopping parameters via fit_params, but then it throws this error, which is basically due to the lack of a validation set, which early stopping requires:
/opt/anaconda/anaconda3/lib/python3.5/site-packages/xgboost/callback.py in callback(env=XGBoostCallbackEnv(model=<xgboost.core.Booster o...teration=4000, rank=0, evaluation_result_list=[]))
187 else:
188 assert env.cvfolds is not None
189
190 def callback(env):
191 """internal function"""
--> 192 score = env.evaluation_result_list[-1][1]
score = undefined
env.evaluation_result_list = []
193 if len(state) == 0:
194 init(env)
195 best_score = state['best_score']
196 best_iteration = state['best_iteration']
How can I apply GridSearchCV on XGBoost using early_stopping_rounds?
Note: the model works without grid search, and GridSearchCV works without fit_params={'early_stopping_rounds': 42}.
When using early_stopping_rounds you also have to pass eval_metric and eval_set as input parameters to the fit method. Early stopping works by calculating the error on an evaluation set; the error has to improve at least once within every early_stopping_rounds rounds, otherwise the generation of additional trees is stopped early.
See the documentation of xgboost's fit method for details.
Here you see a minimal fully working example:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

cv = 2
trainX = [[1], [2], [3], [4], [5]]
trainY = [1, 2, 3, 4, 5]

# these are the evaluation sets
testX = trainX
testY = trainY

paramGrid = {"subsample": [0.5, 0.8]}

fit_params = {"early_stopping_rounds": 42,
              "eval_metric": "mae",
              "eval_set": [[testX, testY]]}

model = xgb.XGBRegressor()

gridsearch = GridSearchCV(model, paramGrid, verbose=1,
                          fit_params=fit_params,
                          cv=TimeSeriesSplit(n_splits=cv).get_n_splits([trainX, trainY]))

gridsearch.fit(trainX, trainY)
An update to @glao's answer and a response to @Vasim's comment/question, as of sklearn 0.21.3 (note that fit_params has been moved out of the instantiation of GridSearchCV and into the fit() method; also, the import specifically pulls in the sklearn wrapper module from xgboost):
import xgboost.sklearn as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

cv = 2
trainX = [[1], [2], [3], [4], [5]]
trainY = [1, 2, 3, 4, 5]

# these are the evaluation sets
testX = trainX
testY = trainY

paramGrid = {"subsample": [0.5, 0.8]}

fit_params = {"early_stopping_rounds": 42,
              "eval_metric": "mae",
              "eval_set": [[testX, testY]]}

model = xgb.XGBRegressor()

gridsearch = GridSearchCV(model, paramGrid, verbose=1,
                          cv=TimeSeriesSplit(n_splits=cv).get_n_splits([trainX, trainY]))

gridsearch.fit(trainX, trainY, **fit_params)
Here's a solution that works in a Pipeline with GridSearchCV. The challenge occurs when you have a pipeline that must pre-process your training data, for example when X is a text document and you need TfidfVectorizer to vectorize it.
Override the XGBRegressor or XGBClassifier .fit() function
This step uses train_test_split() to select the specified number of validation records from X for the eval_set, and then passes the remaining records along to fit().
A new parameter eval_test_size is added to .fit() to control the number of validation records (see the train_test_split test_size documentation).
**kwargs passes along any other parameters the user adds for the XGBRegressor.fit() function.
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split

class XGBRegressor_ES(XGBRegressor):

    def fit(self, X, y, *, eval_test_size=None, **kwargs):
        if eval_test_size is not None:
            params = super(XGBRegressor, self).get_xgb_params()
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=eval_test_size, random_state=params['random_state'])
            eval_set = [(X_test, y_test)]
            # Could add (X_train, y_train) to eval_set
            # to get .eval_results() for both train and test
            #eval_set = [(X_train, y_train), (X_test, y_test)]
            kwargs['eval_set'] = eval_set
        else:
            # no validation split requested: train on all the data
            X_train, y_train = X, y
        return super(XGBRegressor_ES, self).fit(X_train, y_train, **kwargs)
Example Usage
Below is a multistep pipeline that includes multiple transformations of X. The pipeline's fit() function passes the new evaluation parameter to the XGBRegressor_ES class above as xgbr__eval_test_size=200. In this example:
X_train contains text documents passed to the pipeline.
XGBRegressor_ES.fit() uses train_test_split() to select 200 records from X_train for the validation set and early stopping. (This could also be a fraction, such as xgbr__eval_test_size=0.2.)
The remaining records in X_train are passed along to XGBRegressor.fit() for the actual fit.
Early stopping may now occur after 75 rounds of unimproved boosting for each CV fold in a grid search.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression

xgbr_pipe = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                            ('vt', VarianceThreshold()),
                            ('scaler', StandardScaler()),
                            ('Sp', SelectPercentile()),
                            ('xgbr', XGBRegressor_ES(n_estimators=2000,
                                                     objective='reg:squarederror',
                                                     eval_metric='mae',
                                                     learning_rate=0.0001,
                                                     random_state=7))])

X_train = train_idxs['f_text'].values
y_train = train_idxs['Pct_Change_20'].values
Example Fitting the Pipeline:
%time xgbr_pipe.fit(X_train, y_train,
                    xgbr__eval_test_size=200,
                    xgbr__eval_metric='mae',
                    xgbr__early_stopping_rounds=75)
Example Fitting GridSearchCV:
learning_rate = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
param_grid = dict(xgbr__learning_rate=learning_rate)

grid_search = GridSearchCV(xgbr_pipe, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=10)
grid_result = grid_search.fit(X_train, y_train,
                              xgbr__eval_test_size=200,
                              xgbr__eval_metric='mae',
                              xgbr__early_stopping_rounds=75)
