Grid Search CV invalid parameter error for decision tree - decision-tree

I have been trying to do grid search CV in case of decision trees. I am getting the following error.
ValueError: Invalid parameter min_samples_split for estimator RecursiveTabularRegressionForecaster(estimator=DecisionTreeRegressor()). Check the list of available parameters with estimator.get_params().keys().
from sktime.forecasting.model_selection import ForecastingRandomizedSearchCV
from sktime.forecasting.model_selection import SlidingWindowSplitter
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.compose import make_reduction
y = univ['GHI']
y_train, y_test = temporal_train_test_split(y, test_size=30)
regressor = DecisionTreeRegressor()
forecaster = make_reduction(regressor)
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
cv = SlidingWindowSplitter(initial_window=60, window_length=30)
nrcv = ForecastingRandomizedSearchCV(forecaster, strategy="refit", cv=cv,
param_distributions=params,
n_iter=5, random_state=42)
nrcv.fit(y_train)
y_pred = nrcv.predict(np.arange(1, y_test.size+1))
print(nrcv.best_params_)
print(nrcv.best_score_)
I have also tried 'estimator.get_params().keys()' this function. and I think its matching with the parameters.
dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

Related

Value errors in KerasRegressor using GridSearhCV

I need to find the best parameters that will ultimately inform the best price for a certain number automobile characteristics. The *csv file is attached, so as you can have the exact picture of the problems: DataAutomobile.csv
When I compile, I get the following error:
File "C:\Users\jetta\anaconda3\lib\site-packages\scikeras\wrappers.py", line 1127, in set_params
raise ValueError(
ValueError: Invalid parameter activ1 for estimator KerasRegressor. This issue can likely be resolved by setting this parameter in the KerasRegressor constructor:KerasRegressor(activ1=relu). Check the list of available parameters with estimator.get_params().keys()
The other messages if I force activ1 = relu. Below is just a sample of several similar errors
ValueError: Invalid parameter neurons1 for estimator KerasRegressor. This issue can likely be resolved by setting this parameter in the KerasRegressor constructor:KerasRegressor(neurons1=100)
Code:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import backend as k
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from scikeras.wrappers import KerasRegressor
base = pd.read_csv('autos.csv', encoding = 'ISO-8859-1')
base = base.drop('dateCrawled', axis = 1)
base = base.drop('dateCreated', axis = 1)
base = base.drop('nrOfPictures', axis = 1)
base = base.drop('postalCode', axis = 1)
base = base.drop('lastSeen', axis = 1)
base = base.drop('name', axis = 1)
base = base.drop('seller', axis = 1)
base = base.drop('offerType', axis = 1)
base = base[base.price > 10]
base = base.loc[base.price < 350000]
auto_values = {'vehicleType': 'limousine', 'gearbox': 'manuell',
'model': 'golf', 'fuelType': 'benzin',
'notRepairedDamage': 'nein'}
base = base.fillna(value = auto_values)
auto_predictors = base.iloc[:, 1:13].values
auto_RealPrice = base.iloc[:, 0].values
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
onehotencoder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [0,1,3,5,8,9,10])],remainder='passthrough')
auto_predictors = onehotencoder.fit_transform(auto_predictors).toarray()
def auto_CreateNeural(optimizer, loss, kernel_initializer, activation, activ1, activ2, activ3, neurons1, neurons2, neurons3, kernel_ini1, kernel_ini2, kernel_ini3):
k.clear_session()
regressor = Sequential([
tf.keras.layers.Dense(units = neurons1, activation = activ1, kernel_initializer = kernel_ini1, input_dim=316),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(units = neurons2, activation = activ2, kernel_initializer = kernel_ini2),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(units = neurons3, activation = activ3, kernel_initializer = kernel_ini3),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(units = 1, activation = 'linear')])
regressor.compile(optimizer = optimizer, loss = loss, kernel_initializer = ['kernel_ini1', 'kernel_ini2', 'kernel_ini3'], activation = ['activ1', 'activ2', 'activ3'],
units = ['neurons1', 'neurons2', 'neurons3'], metrics = ['binary_accuracy'])
return regressor
regressor = KerasRegressor(build_fn = auto_CreateNeural, verbose=0)
auto_parameters = {'batch_size': [350, 500],
'epochs': [3, 5],
'optimizer': ['adam', 'Adamax'],
'loss': ['kullback_leibler_divergence','binary_crossentropy', 'hinge', 'mean_absolute_error'],
'kernel_ini1': ['random_uniform', 'normal', 'glorot_uniform'],
'kernel_ini2': ['normal', 'glorot_uniform'],
'kernel_ini3': ['random_uniform', 'normal'],
'activ1': ['relu', 'elu', 'tanh', 'softmax'],
'activ2': ['tanh', 'softmax'],
'activ3': ['elu', 'softmax'],
'neurons1': [100, 158, 200, 316],
'neurons2': [80, 115, 158, 225, 330],
'neurons3': [70, 90, 120, 158, 250]}
grid_search = GridSearchCV(estimator = regressor,
param_grid = auto_parameters,
scoring = 'neg_mean_absolute_error', cv = 2)
grid_result = grid_search.fit(auto_predictors, auto_RealPrice)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
best_auto_parameters = grid_search.best_params_
best_precision = grid_search.best_score_
I am new to neural networks, and despite having read several documentations for the past days, I am completely stuck and cannot go further without help. I need to find the best values for each and all of the variables described in "auto_parameters".
What should I exactly do in the code to get it working properly? If not asking much, a brief explanation of the corrections would be really valuable.
The errors in SciKeras could use some work, but what that error is telling you is that every argument here:
def auto_CreateNeural(optimizer, loss, kernel_initializer, activation, activ1, activ2, activ3, neurons1, neurons2, neurons3, kernel_ini1, kernel_ini2, kernel_ini3)
Requires a corresponding default value when constructing KerasRegressor.
That is, you need to change:
KerasRegressor(build_fn = auto_CreateNeural, verbose=0)
To:
KerasRegressor(build_fn = auto_CreateNeural, kernel_initializer=...), activ1=..., ...)
Scikit-Learn expects estimators to work both in and out of a pipeline, without those defaults this estimator would behave differently inside and outside of a pipeline (namely, it would not work outside of a pipeline).

Is it normal for the r2 score to drop after the crossvalidation process? (Linear regression model)

Car price prediction code:
# importing pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score,mean_squared_error,mean_squared_log_error,make_scorer
import warnings
import os
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
is_file=os.path.isfile('C:/Users/Desktop/car_price_prediction/audi-bmw-toyota.csv')
if is_file==False:
# merging two csv files
df = pd.concat(map(pd.read_csv, ['audi.csv', 'bmw.csv','toyota.csv']), ignore_index=True)
df.to_csv("audi-bmw-toyota.csv", index=False)
print('csv files have merged...')
else:
print("The file already exists.")
# loading the dataset to a pandas DataFrame
dataset = pd.read_csv('C:/Users/Desktop/car_price_prediction/audi-bmw-toyota.csv')
print('transmission_value-counts:\n',dataset['transmission'].value_counts()) #4 type
print('fueltype_value-counts:\n',dataset['fuelType'].value_counts()) #5 type
print("Manual transmissioned cars:\n", dataset.loc[dataset['transmission'] == 'Manual'])
print("Hybrid fuel typed cars:\n", dataset.loc[dataset['fuelType'] == 'Hybrid'])
print("Other fuel typed cars:\n", dataset.loc[dataset['fuelType'] == 'Other'])
dataset.info()
# correlation
df_corr = dataset.corr()
df_corr.sort_values('price',inplace=True)
fig=df_corr[['price']].plot(kind='barh',color="r",figsize=(5, 5))
fig.set_xlabel('correlation')
fig.set_title("Price and Variables's Correlation")
X=dataset.iloc[:,[0,1,3,4,5,6,7,8]]
Y=dataset.iloc[:,2]
print("X:\n",X)
print("Y:\n",Y)
#data preprocessing
X["model"]=X["model"].str.replace(' ','')
print(X["model"])
le1=LabelEncoder() #model column has so much diffrent group. That's wyh LabelEncoder is useful.
X_0=le1.fit_transform(X.iloc[:,0])
X.loc[:, 0] =X_0
X["model"] = pd.DataFrame(X_0, columns=['model'])
#categorical variables
df_transmission = pd.get_dummies(dataset["transmission"]
,prefix = "transmission"
,drop_first = True) # Preventing Multicollinearity
X1 = pd.concat([X, df_transmission[['transmission_Manual', 'transmission_Other', 'transmission_Semi-Auto']]], axis=1)
df_fuelType = pd.get_dummies(dataset["fuelType"]
,prefix = "fuelType"
,drop_first = True) # Preventing Multicollinearity
X2 = pd.concat([X1, df_fuelType[['fuelType_Electric', 'fuelType_Hybrid', 'fuelType_Other','fuelType_Petrol']]], axis=1)
X3 = X2.drop(['transmission', 'fuelType',0], axis=1)
print("X3:\n",X3)
#Feature Scaling
sclr=StandardScaler()
X3=sclr.fit_transform(X3)
print("X3:\n",X3)
#training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X3,
Y,
test_size=0.2,
random_state=42)
rf=RandomForestRegressor(random_state=42)
model = rf.fit(X_train, Y_train)
y_pred=model.predict(X_test)
print("r2_score_test:",r2_score(Y_test,y_pred))
RMSE_test=np.sqrt(mean_squared_error(Y_test,model.predict(X_test)))
print("RMSE:",RMSE_test)
Cross validation and hyperparameter optimization
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, n_jobs = -1)
rf_random.fit(X_train, Y_train)
print("best_params",rf_random.best_params_)
base_model = RandomForestRegressor(n_estimators = 2000,min_samples_split=5,
min_samples_leaf=1,
max_features='sqrt',
max_depth=30,
bootstrap=True,
random_state = 42).fit(X_train, Y_train)
y_pred_base=base_model.predict(X_test)
print("r2_score_test:",r2_score(Y_test,y_pred_base))
RMSE_test_based_model=np.sqrt(mean_squared_error(Y_test,base_model.predict(X_test)))
print("RMSE_based:",RMSE_test_based_model)
the link of dataset: https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes
Hello friends, I have combined audi.csv, bmw.csv,toyota.csv files to obtain a new CSV file. In the car price estimation algorithm, the test cross-validated RMSE value is greater than the test RMSE (not validated) value. Also, r2_score drops a bit after cross-validation Is this process normal, or what exactly am I doing wrong?
Is the problem related to the regression model?
Before cross validation:
r2_score_test: 0.961865129046153
RMSE: 2293.040184587231
After cross validation:
r2_score_test: 0.9604039571043385
RMSE_based: 2336.5572047970254
fixes:
X=dataset.iloc[:,[1,3,4,5,6,7,8]] #removing of car model column
Y=dataset.iloc[:,2]
#data preprocessing
X['no_year'] = (2022) -X['year']
X.drop(['year'],axis = 1,inplace=True)
print('X:\n',X)
Before cross validation:
r2_score_test: 0.941560662538529
RMSE_test: 2838.5932576738546
After cross validation:
r2_score_based: 0.9603626850597086
RMSE_based: 2337.7746165658878

TypeError with Sklear XGBoost and Sklearn Calibrated Classifier CV

I'm trying to calibrate the output probabilities from a XGBClassifier.
I've provided the sample code,
x_train, x_test, y_train, y_test = train_test_split(x_var, y_var, test_size = 0.2, shuffle = False)
new_mod2 = xgb.XGBClassifier(scale_pos_weight = 1, eta = 0.3, nthread = 10, learning_rate = 0.05, max_depth = 2, n_estimators = 180, objective = 'binary:logistic' )
calibrated = CalibratedClassifierCV( base_estimator = new_mod2,method = 'Isotonic', cv=3)
calibrated.fit(X = x_train, y = y_train)
But when running the calibrated.fit() function, I get the following error:
TypeError: predict_proba() got an unexpected keyword argument 'X'
It's my understanding that the calibratedclassifiercv should work with the sklearn xgboost wrapper.
Yet the calibrated classifier cv seems to be passing the incorrect variable to the predict_proba() method of xgboost.
Is there any reason this might be occurring?

GridSearchCV with Invalid parameter gamma for estimator LogisticRegression

I need to perform a grid search on the parameters listed below for a Logistic Regression classifier, using recall for scoring and cross-validation three times.
The data is in a csv file (11,1 MB), this link for download is: https://drive.google.com/file/d/1cQFp7HteaaL37CefsbMNuHqPzkINCVzs/view?usp=sharing
I have grid_values = {'gamma':[0.01, 0.1, 1, 10, 100]}
I need to apply penalty L1 e L2 in a Logistic Regression
I couldn't verify if the scores will run because I have the following error:
Invalid parameter gamma for estimator LogisticRegression. Check the list of available parameters with estimator.get_params().keys().
This is my code:
from sklearn.model_selection import train_test_split
df = pd.read_csv('fraud_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
def LogisticR_penalty():
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
grid_values = {'gamma':[0.01, 0.1, 1, 10, 100]}
#train de model with many parameters for "C" and penalty='l1'
lr_l1 = LogisticRegression(penalty='l1')
grid_lr_l1 = GridSearchCV(lr_l1, param_grid = grid_values, cv=3, scoring = 'recall')
grid_lr_l1.fit(X_train, y_train)
y_decision_fn_scores_recall = grid_lr_l1.decision_function(X_test)
lr_l2 = LogisticRegression(penalty='l2')
grid_lr_l2 = GridSearchCV(lr_l2, param_grid = grid_values, cv=3 , scoring = 'recall')
grid_lr_l2.fit(X_train, y_train)
y_decision_fn_scores_recall = grid_lr_l2.decision_function(X_test)
#The precision, recall, and accuracy scores for every combination
#of the parameters in param_grid are stored in cv_results_
results = pd.DataFrame()
results['l1_results'] = pd.DataFrame(grid_lr_l1.cv_results_)
results['l1_results'] = results['l2_results'].sort_values(by='mean_test_precision_score', ascending=False)
results['l2_results'] = pd.DataFrame(grid_lr_l2.cv_results_)
results['l2_results'] = results['l2_results'].sort_values(by='mean_test_precision_score', ascending=False)
return results
LogisticR_penalty()
I expected from .cv_results_, the average test scores of each parameter combination that I should be available here: mean_test_precision_score but not sure
The output is: ValueError: Invalid parameter gamma for estimator LogisticRegression. Check the list of available parameters with estimator.get_params().keys().
The error message contains the answer for your question. You can use the function estimator.get_params().keys() to see all available parameters for you estimator:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
print(lr.get_params().keys())
Output:
dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])
From scikit-learn's documentation, the LogisticRegression has no parameter gamma, but a parameter C for the regularization weight.
If you change grid_values = {'gamma':[0.01, 0.1, 1, 10, 100]} for grid_values = {'C':[0.01, 0.1, 1, 10, 100]} your code should work.
My code contained some errors the main error was using param_grid incorrectly. I had to apply L1 and L2 penalties with gamma 0.01, 0.1, 1, 10, 100. The right way to do this is:
grid_values ​​= {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
Then it was necessary to correct the way I was training my logistic regression and to correct the way I retrieved the scores in cv_results_ and averaged those scores.
Follow my code:
from sklearn.model_selection import train_test_split
df = pd.read_csv('fraud_data.csv')
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
def LogisticR_penalty():
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
grid_values = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
#train de model with many parameters for "C" and penalty='l1'
lr = LogisticRegression()
# We use GridSearchCV to find the value of the range that optimizes a given measurement metric.
grid_lr_recall = GridSearchCV(lr, param_grid = grid_values, cv=3, scoring = 'recall')
grid_lr_recall.fit(X_train, y_train)
y_decision_fn_scores_recall = grid_lr_recall.decision_function(X_test)
##The precision, recall, and accuracy scores for every combination
#of the parameters in param_grid are stored in cv_results_
CVresults = []
CVresults = pd.DataFrame(grid_lr_recall.cv_results_)
#test scores and mean of them
split_test_scores = np.vstack((CVresults['split0_test_score'], CVresults['split1_test_score'], CVresults['split2_test_score']))
mean_scores = split_test_scores.mean(axis=0).reshape(5, 2)
return mean_scores
LogisticR_penalty()

GridSearchCV - XGBoost - Early Stopping

i am trying to do hyperparemeter search with using scikit-learn's GridSearchCV on XGBoost. During gridsearch i'd like it to early stop, since it reduce search time drastically and (expecting to) have better results on my prediction/regression task. I am using XGBoost via its Scikit-Learn API.
model = xgb.XGBRegressor()
GridSearchCV(model, paramGrid, verbose=verbose ,fit_params={'early_stopping_rounds':42}, cv=TimeSeriesSplit(n_splits=cv).get_n_splits([trainX, trainY]), n_jobs=n_jobs, iid=iid).fit(trainX,trainY)
I tried to give early stopping parameters with using fit_params, but then it throws this error which is basically because of lack of validation set which is required for early stopping:
/opt/anaconda/anaconda3/lib/python3.5/site-packages/xgboost/callback.py in callback(env=XGBoostCallbackEnv(model=<xgboost.core.Booster o...teration=4000, rank=0, evaluation_result_list=[]))
187 else:
188 assert env.cvfolds is not None
189
190 def callback(env):
191 """internal function"""
--> 192 score = env.evaluation_result_list[-1][1]
score = undefined
env.evaluation_result_list = []
193 if len(state) == 0:
194 init(env)
195 best_score = state['best_score']
196 best_iteration = state['best_iteration']
How can i apply GridSearch on XGBoost with using early_stopping_rounds?
note: model is working without gridsearch, also GridSearch works without 'fit_params={'early_stopping_rounds':42}
When using early_stopping_rounds you also have to give eval_metric and eval_set as input parameter for the fit method. Early stopping is done via calculating the error on an evaluation set. The error has to decrease every early_stopping_rounds otherwise the generation of additional trees is stopped early.
See the documentation of xgboosts fit method for details.
Here you see a minimal fully working example:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
cv = 2
trainX= [[1], [2], [3], [4], [5]]
trainY = [1, 2, 3, 4, 5]
# these are the evaluation sets
testX = trainX
testY = trainY
paramGrid = {"subsample" : [0.5, 0.8]}
fit_params={"early_stopping_rounds":42,
"eval_metric" : "mae",
"eval_set" : [[testX, testY]]}
model = xgb.XGBRegressor()
gridsearch = GridSearchCV(model, paramGrid, verbose=1 ,
fit_params=fit_params,
cv=TimeSeriesSplit(n_splits=cv).get_n_splits([trainX,trainY]))
gridsearch.fit(trainX,trainY)
An update to #glao's answer and a response to #Vasim's comment/question, as of sklearn 0.21.3 (note that fit_params has been moved out of the instantiation of GridSearchCV and been moved into the fit() method; also, the import specifically pulls in the sklearn wrapper module from xgboost):
import xgboost.sklearn as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
cv = 2
trainX= [[1], [2], [3], [4], [5]]
trainY = [1, 2, 3, 4, 5]
# these are the evaluation sets
testX = trainX
testY = trainY
paramGrid = {"subsample" : [0.5, 0.8]}
fit_params={"early_stopping_rounds":42,
"eval_metric" : "mae",
"eval_set" : [[testX, testY]]}
model = xgb.XGBRegressor()
gridsearch = GridSearchCV(model, paramGrid, verbose=1,
cv=TimeSeriesSplit(n_splits=cv).get_n_splits([trainX, trainY]))
gridsearch.fit(trainX, trainY, **fit_params)
Here's a solution that works in a Pipeline with GridSearchCV. The challenge occurs when you have a pipeline that is required to pre-process your training data. For example, when X is a text document and you need TFTDFVectorizer to vectorize it.
Over-ride the XGBRegressor or XGBClssifier.fit() Function
This step uses train_test_split() to select the specified number of
validation records from X for the eval_set and then passes the
remaining records along to fit().
A new parameter eval_test_size is added to .fit() to control the number of validation records. (see train_test_split test_size documenation)
**kwargs passes along any other parameters added by the user for the XGBRegressor.fit() function.
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split
class XGBRegressor_ES(XGBRegressor):
def fit(self, X, y, *, eval_test_size=None, **kwargs):
if eval_test_size is not None:
params = super(XGBRegressor, self).get_xgb_params()
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=eval_test_size, random_state=params['random_state'])
eval_set = [(X_test, y_test)]
# Could add (X_train, y_train) to eval_set
# to get .eval_results() for both train and test
#eval_set = [(X_train, y_train),(X_test, y_test)]
kwargs['eval_set'] = eval_set
return super(XGBRegressor_ES, self).fit(X_train, y_train, **kwargs)
Example Usage
Below is a multistep pipeline that includes multiple transformations to X. The pipeline's fit() function passes the new evaluation parameter to the XGBRegressor_ES class above as xgbr__eval_test_size=200. In this example:
X_train contains text documents passed to the pipeline.
XGBRegressor_ES.fit() uses train_test_split() to select 200 records from X_train for the validation set and early stopping. (This could also be a percentage such as xgbr__eval_test_size=0.2)
The remaining records in X_train are passed along to XGBRegressor.fit() for the actual fit().
Early stopping may now occur after 75 rounds of unchanged boosting for each cv fold in a gridsearch.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression
xgbr_pipe = Pipeline(steps=[('tfidf', TfidfVectorizer()),
('vt',VarianceThreshold()),
('scaler', StandardScaler()),
('Sp', SelectPercentile()),
('xgbr',XGBRegressor_ES(n_estimators=2000,
objective='reg:squarederror',
eval_metric='mae',
learning_rate=0.0001,
random_state=7)) ])
X_train = train_idxs['f_text'].values
y_train = train_idxs['Pct_Change_20'].values
Example Fitting the Pipeline:
%time xgbr_pipe.fit(X_train, y_train,
xgbr__eval_test_size=200,
xgbr__eval_metric='mae',
xgbr__early_stopping_rounds=75)
Example Fitting GridSearchCV:
learning_rate = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
param_grid = dict(xgbr__learning_rate=learning_rate)
grid_search = GridSearchCV(xgbr_pipe, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=10)
grid_result = grid_search.fit(X_train, y_train,
xgbr__eval_test_size=200,
xgbr__eval_metric='mae',
xgbr__early_stopping_rounds=75)

Resources