About Sklearn double cross validation with wrapper feature_selection - scikit-learn

About double CV, or nested CV. The simplest example would be:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

gcv = GridSearchCV(RandomForestRegressor(), param_grid={"n_estimators": [5, 10]})
score_ = cross_val_score(gcv, X, y, cv=5)
There is no question about this part.
So, for feature_selection of the wrapper type there are two approaches: evaluating with CV (RFECV) and evaluating on all the data (RFE). Is RFE the correct one to use inside a pipeline? This is my first question.
from sklearn.feature_selection import RFE, RFECV

rfr = RandomForestRegressor()
pipe = Pipeline([("selector", RFE(estimator=rfr)), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators": [5, 10]})
score_ = cross_val_score(gcv, X, y, cv=5)
I feel that the code below, which uses RFECV, would amount to a triple CV and increase the computational cost.
from sklearn.feature_selection import RFE, RFECV

pipe = Pipeline([("selector", RFECV(rfr, cv=5)), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators": [5, 10]})
score_ = cross_val_score(gcv, X, y, cv=5)
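As a rough sanity check on the cost, some back-of-the-envelope arithmetic (a sketch under assumptions: GridSearchCV defaults to an inner 5-fold CV, and final refits plus the per-iteration feature-elimination fits are ignored):
# Approximate selector-level fit passes for the two variants above.
outer_cv, grid_points, inner_cv, rfecv_cv = 5, 2, 5, 5
plain_rfe_passes = outer_cv * grid_points * inner_cv             # 50
rfecv_passes = outer_cv * grid_points * inner_cv * rfecv_cv      # 250
print(plain_rfe_passes, rfecv_passes)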
Next, in the case of SequentialFeatureSelector, which only offers a CV-based evaluation method, what code is correct as a double CV?
from sklearn.feature_selection import SequentialFeatureSelector

estimator_in_selector = RandomForestRegressor()
sfs = SequentialFeatureSelector(estimator_in_selector, cv=5)
pipe = Pipeline([("selector", sfs), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators": [5, 10]}, cv=5)
score_ = cross_val_score(gcv, X, y, cv=5)
If we consider a more complicated case:
from sklearn.feature_selection import SequentialFeatureSelector

estimator_in_selector = RandomForestRegressor()
sfs = SequentialFeatureSelector(estimator_in_selector, cv=5)
pipe = Pipeline([("selector", sfs), ("estimator", rfr)])
param_grid = {"selector__n_features_to_select": [3, 5],
              "selector__estimator__n_estimators": [10, 50],
              "estimator__n_estimators": [10, 50]}
gcv = GridSearchCV(pipe, param_grid=param_grid)
score_ = cross_val_score(gcv, X, y, cv=5)  # gcv (not pipe), so the grid search stays inside the outer CV
And also: what about when using a genetic algorithm?
from sklearn_genetic import GAFeatureSelectionCV
selector = GAFeatureSelectionCV(rfr, cv=5)
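For completeness, a sketch of how this could slot into the same pattern as the SequentialFeatureSelector example above; the constructor arguments here are an assumption, so check the sklearn-genetic docs for the exact API:
# Hedged sketch: GA-based selector as the pipeline's "selector" step.
from sklearn_genetic import GAFeatureSelectionCV
selector = GAFeatureSelectionCV(rfr, cv=5)
pipe = Pipeline([("selector", selector), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators": [5, 10]})
score_ = cross_val_score(gcv, X, y, cv=5)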

Related

Iterate GridSearchCV Over Multiple Datasets and Classifiers (Python)

I have multiple datasets for which I want to estimate parameters using different classifiers (logistic regression and random forest).
I want to run each dataset through both classifiers using GridSearchCV, and then get the best parameters for each classifier per dataset. I am just a bit stumped on how to go about that. My code is below.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# features
X = {'df1': np.random.normal(0, 1, (200, 5)),
     'df2': np.random.normal(0, 1, (200, 5))}
# labels
y = {'df1': np.random.choice([0, 1], 200),
     'df2': np.random.choice([0, 1], 200)}
num_columns = list(subset_features[1:])
num_transformer = Pipeline([('imputer', IterativeImputer()),
                            ('scaler', StandardScaler())])
# column transformer
ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
# the classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=None)
clf2 = RandomForestClassifier(random_state=None)
# pipeline
pipe = Pipeline([('ct', ct), ('classifier', clf1)])
params1 = {'classifier__penalty': ['l1', 'l2'],
           'classifier__C': [0.1, 1, 10],
           'classifier': [clf1]}
params2 = {'classifier__n_estimators': [100, 150, 200],
           'classifier__min_samples_leaf': [1, 2],
           'classifier': [clf2]}
params = [params1, params2]
gs = GridSearchCV(pipe, params)
gs.fit(X, y)
gs.best_params_
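One minimal way to get per-dataset best parameters, as a hedged sketch (it assumes the pipe and params defined above are otherwise working, e.g. that num_columns matches the columns in each dataset):
# Hypothetical sketch: one GridSearchCV per dataset, searching both
# param grids, collecting best_params_ keyed by dataset name.
best_params_per_dataset = {}
for key in X:
    gs = GridSearchCV(pipe, params, cv=5)
    gs.fit(X[key], y[key])
    best_params_per_dataset[key] = gs.best_params_
print(best_params_per_dataset)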
How about this?
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# Modeling
import lightgbm as lgb
# Evaluation of the model
from sklearn.model_selection import KFold
MAX_EVALS = 500
N_FOLDS = 10
# Read in data and separate into training and testing sets
data = pd.read_csv('C:\\caravan-insurance-challenge.csv')
train = data[data['ORIGIN'] == 'train']
test = data[data['ORIGIN'] == 'test']
# Extract the labels and format properly
train_labels = np.array(train['CARAVAN'].astype(np.int32)).reshape((-1,))
test_labels = np.array(test['CARAVAN'].astype(np.int32)).reshape((-1,))
# Drop the unneeded columns
train = train.drop(columns = ['ORIGIN', 'CARAVAN'])
test = test.drop(columns = ['ORIGIN', 'CARAVAN'])
# Convert to numpy array for splitting in cross validation
features = np.array(train)
test_features = np.array(test)
labels = train_labels[:]
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)
train.head()
import matplotlib.pyplot as plt
import seaborn as sns
plt.hist(labels, edgecolor = 'k');
plt.xlabel('Label'); plt.ylabel('Count'); plt.title('Counts of Labels')
# Model with default hyperparameters
model = lgb.LGBMClassifier()
model
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer
start = timer()
model.fit(features, labels)
train_time = timer() - start
predictions = model.predict_proba(test_features)[:, 1]
auc = roc_auc_score(test_labels, predictions)
print('The baseline score on the test set is {:.4f}.'.format(auc))
print('The baseline training time is {:.4f} seconds'.format(train_time))
import random
lgb.LGBMClassifier()
# Hyperparameter grid
param_grid = {
    'class_weight': [None, 'balanced'],
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(30, 150)),
    'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base=np.exp(1), num=1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10))
}
# Subsampling (only applicable with 'goss')
subsample_dist = list(np.linspace(0.5, 1, 100))
plt.hist(param_grid['learning_rate'], color = 'r', edgecolor = 'k');
plt.xlabel('Learning Rate', size = 14); plt.ylabel('Count', size = 14); plt.title('Learning Rate Distribution', size = 18)
plt.hist(param_grid['num_leaves'], color = 'm', edgecolor = 'k')
plt.xlabel('Number of Leaves', size = 14); plt.ylabel('Count', size = 14); plt.title('Number of Leaves Distribution', size = 18)
# Randomly sample parameters for gbm
params = {key: random.sample(value, 1)[0] for key, value in param_grid.items()}
params
params['subsample'] = random.sample(subsample_dist, 1)[0] if params['boosting_type'] != 'goss' else 1.0
params
Result:
{'class_weight': 'balanced',
 'boosting_type': 'goss',
 'num_leaves': 58,
 'learning_rate': 0.010197109660117238,
 'subsample_for_bin': 40000,
 'min_child_samples': 230,
 'reg_alpha': 0.7755102040816326,
 'reg_lambda': 0.7755102040816326,
 'colsample_bytree': 0.8666666666666667,
 'subsample': 1.0}
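To actually score one sampled configuration, a hedged sketch using plain cross-validation on the training arrays built above (a full random search would repeat this MAX_EVALS times and keep the best result):
# Sketch: evaluate the sampled configuration with K-fold CV on the
# training data; LGBMClassifier accepts the sampled dict as keyword args.
from sklearn.model_selection import cross_val_score
sampled_model = lgb.LGBMClassifier(**params)
cv_auc = cross_val_score(sampled_model, features, labels,
                         scoring='roc_auc', cv=N_FOLDS).mean()
print('CV AUC for this sample: {:.4f}'.format(cv_auc))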
Data:
https://www.kaggle.com/datasets/uciml/caravan-insurance-challenge
Source Code:
https://github.com/WillKoehrsen/hyperparameter-optimization/blob/master/Bayesian%20Hyperparameter%20Optimization%20of%20Gradient%20Boosting%20Machine.ipynb
Using different classifiers/estimators, I was able to do what I posted the question for. I am sure the code can be optimized. Some of the ideas I used came from this Stack Overflow link.
Below is my attempt at answering the question I asked, using anomaly detection estimators instead of logistic regression and random forest.
# modules
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.base import BaseEstimator, TransformerMixin
# import preprocessing and pipeline modules
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# grid search module
from sklearn.model_selection import GridSearchCV
# the anomaly detection estimators
from sklearn.ensemble import IsolationForest
from inne import IsolationNNE
from scorers import scorer_decision # user defined scoring
# define numeric columns
num_columns = list(df1.columns)
class GSP:
    def __init__(self):
        pass

    def mods(self, x):
        num_columns  # indicates list of numeric columns in dfs
        num_transformer = Pipeline([('imputer', IterativeImputer()),
                                    ('scaler', StandardScaler())])
        # column transformer
        ct = ColumnTransformer([('numeric_pipeline', num_transformer, num_columns)])
        # classifiers
        clf1 = IsolationForest(n_jobs=-1, random_state=None, bootstrap=False)
        clf2 = IsolationNNE(random_state=None)
        # pipeline
        pipe = Pipeline([('ct', ct), ('classifier', clf1)])
        # grid search parameters
        num_estimators = list(np.linspace(100, 200, num=5, endpoint=True).astype(int))
        max_samples = list(np.linspace(0.70, 1.00, num=5))
        contamination = list(np.linspace(0.05, 0.10, num=5, endpoint=True))
        max_features = [0.25, 0.50, 0.75, 0.80, 0.90, 1.00]
        params1 = {  # set isolation forest grid parameters
            'classifier__n_estimators': num_estimators,
            'classifier__max_samples': max_samples,
            'classifier__contamination': contamination,
            'classifier__max_features': max_features,
            'classifier': [clf1]}
        params2 = {  # set inne grid parameters
            'classifier__n_estimators': num_estimators,
            'classifier__max_samples': max_samples,
            'classifier__contamination': contamination,
            'classifier': [clf2]}
        params = [params1, params2]
        gsresults = pd.DataFrame()
        for key in x.keys():
            print('running key:', key)
            gs = GridSearchCV(estimator=pipe,
                              param_grid=params,
                              cv=2,
                              n_jobs=4,
                              verbose=1,
                              scoring=scorer_decision,
                              error_score='raise',
                              refit=True)
            # fit the model
            gs.fit(x[key])
            dftemp = pd.DataFrame(gs.cv_results_)
            dftemp['dataset'] = key
            gsresults = pd.concat([gsresults, dftemp], ignore_index=True)
        gsresults = (gsresults.set_index(gsresults['params']
                     .apply(lambda x: ' '.join(str(val) for val in x.values())))
                     .rename_axis('kernel'))
        selected_columns = ['dataset', 'mean_test_score', 'rank_test_score',
                            'param_classifier', 'param_classifier__contamination',
                            'param_classifier__max_features', 'param_classifier__max_samples',
                            'param_classifier__n_estimators']
        gsresults2 = (gsresults.loc[:, selected_columns]
                      .rename(columns={'mean_test_score': 'mean_score',
                                       'rank_test_score': 'rank_score',
                                       'param_classifier': 'classifier',
                                       'param_classifier__contamination': 'contamination',
                                       'param_classifier__max_features': 'max_features',
                                       'param_classifier__max_samples': 'max_samples',
                                       'param_classifier__n_estimators': 'n_estimators'}))
        gsresults3 = (gsresults2.sort_values(['rank_score', 'mean_score'], ascending=True)
                      .groupby(['dataset']))
        # check output by dataframes
        dfs = {}
        for key, df in gsresults3:
            dfs[key] = df
        return dfs
# running the mods method below returns a dictionary of dataframes
best_params = GSP().mods(X)
Note: the models are fitted on a dictionary of dataframes, X.

Get support and ranking attributes for RFE using Pipeline in Python 3

The code I have so far is below, and it works perfectly. However, I would like to print the following RFE attributes for each number of features tested: rfe.support_[i], rfe.ranking_[i], and the names of the selected features ("i" refers to the index; the first attribute returns True or False depending on whether the column was selected, and the second returns its ranking).
In other words, I would like to print the columns considered in each RFE so that they do not remain something abstract.
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Get the dataset
def get_dataset(df, target):
    X, y = df.drop(columns=target), df[[target]].values.flatten()
    return X, y

# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
    num_transformer = Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median'))])
    cat_transformer = Pipeline(steps=[('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                      ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers=[('num', num_transformer, list_num_cols),
                                                   ('cat', cat_transformer, list_cat_cols)])
    models = dict()
    for i in range(2, 4):
        rfe_dtr = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps=[('preprocessor', preprocessor),
                                                                    ('s_dtr', rfe_dtr),
                                                                    ('m_dtr', model_dtr)])
    return models

# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)
    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv,
                             n_jobs=-1, error_score='raise')
    return scores

# Define the dataset
X, y = get_dataset(my_df, 'my_target')  # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include='number').columns.tolist(),
                    X.select_dtypes(include='object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
The following is returning errors:
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_[0] # Returns: AttributeError: 'RFE' object has no attribute 'support_'
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_[0] # Returns: AttributeError: 'RFE' object has no attribute 'ranking_'
The point is that you haven't explicitly fitted the 'DecisionTreeRegressor_2' pipeline.
Indeed, although cross_val_score already takes care of fitting the estimator, it does not return the fitted estimator instance the way the .fit() method does. Therefore you're not able to access the RFE instance attributes.
Here's a toy example from your setting:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression

X, y = make_regression()
models = dict()
for i in range(2, 4):
    rfe_dtr = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=i)
    model_dtr = DecisionTreeRegressor()
    models['DecisionTreeRegressor_' + str(i)] = Pipeline([
        ('s_dtr', rfe_dtr),
        ('m_dtr', model_dtr)
    ])

models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_  # this does not work
You might see, instead, that after fitting your model, you'll be able to access the support_ and ranking_ attributes:
models['DecisionTreeRegressor_2'].fit(X,y)
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_ # this works
models['DecisionTreeRegressor_2'].named_steps['s_dtr'].ranking_ # this works
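The boolean mask can then be turned into concrete column indices (or names, when the input is a DataFrame), for example:
# Indices of the features RFE kept; with a DataFrame, indexing
# df.columns with the same mask gives the names instead.
import numpy as np
mask = models['DecisionTreeRegressor_2'].named_steps['s_dtr'].support_
print(np.where(mask)[0])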
I answered the question myself. I'm posting it in case it can help someone. It consists of using cross_validate instead of cross_val_score, with the option return_estimator=True, so that the pipelines fitted in the different folds (and their RFE steps) can be retrieved and accessed by index. Then you can use named_steps.
# Explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedKFold, cross_validate, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Get the dataset
def get_dataset(df, target):
    X, y = df.drop(columns=target), df[[target]].values.flatten()
    return X, y

# Get a list of models to evaluate
def get_models(list_num_cols, list_cat_cols):
    num_transformer = Pipeline(steps=[('num_imputer', SimpleImputer(strategy='median'))])
    cat_transformer = Pipeline(steps=[('cat_imputer', SimpleImputer(strategy='most_frequent')),
                                      ('one-hot-encoder', OneHotEncoder())])
    preprocessor = ColumnTransformer(transformers=[('num', num_transformer, list_num_cols),
                                                   ('cat', cat_transformer, list_cat_cols)])
    models = dict()
    for i in range(2, 4):
        rfe_dtr = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=i)
        model_dtr = DecisionTreeRegressor()
        models['DecisionTreeRegressor_' + str(i)] = Pipeline(steps=[('preprocessor', preprocessor),
                                                                    ('s_dtr', rfe_dtr),
                                                                    ('m_dtr', model_dtr)])
    return models

# Evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=7)
    output = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=cv,
                            n_jobs=-1, error_score='raise', return_estimator=True)
    return output

# Define the dataset
X, y = get_dataset(my_df, 'my_target')  # It begins here
# Get the models to evaluate
models = get_models(X.select_dtypes(include='number').columns.tolist(),
                    X.select_dtypes(include='object').columns.tolist())
# Evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    output = evaluate_model(model, X, y)
    results.append(output['test_score'])
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(output['test_score']), std(output['test_score'])))
    print(output)
    print(output['estimator'][0].named_steps['s_dtr'].support_)
    print(output['estimator'][0].named_steps['s_dtr'].ranking_)
    print(output['estimator'][0].named_steps['s_dtr'].support_[2])
    print(output['estimator'][0].named_steps['s_dtr'].ranking_[2])
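To turn those masks into feature names after preprocessing, a hedged sketch (it assumes a scikit-learn version where fitted transformers expose get_feature_names_out(); note the one-hot step expands the categorical columns first, so the names are post-encoding names):
# Sketch: names of the features the first fold's RFE selected.
est = output['estimator'][0]
names_after_prep = est.named_steps['preprocessor'].get_feature_names_out()
mask = est.named_steps['s_dtr'].support_
print(names_after_prep[mask])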

Get feature names of ColumnTransformer using StandardScaler and One-Hot-Encoding

I am using a simple ColumnTransformer with StandardScaler and OneHotEncoder like:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
num_features = ['num_feat_1',
                'num_feat_2',
                'num_feat_3']
cat_features = ['cat_feat_1',
                'cat_feat_2',
                'cat_feat_3']
ct = ColumnTransformer([
    ("scaler", StandardScaler(), num_features),
    ("onehot", OneHotEncoder(sparse=False,
                             handle_unknown='ignore'), cat_features)],
    remainder='passthrough')
ct.fit(X_train)
X_train_trans = ct.transform(X_train)
X_test_trans = ct.transform(X_test)
To map the coefficients of a LinearRegression, I need ct.get_feature_names(), but I get the error Transformer scaler (type StandardScaler) does not provide get_feature_names. Why is that and how can I solve this?
In your case, get_feature_names() works only on the onehot transformer; for StandardScaler() the names of the transformed variables do not change. So we go through the transformers and, where get_feature_names() is not available, we retain the original feature names.
Using an example dataset:
import pandas as pd
import numpy as np
X = pd.concat([
    pd.DataFrame(np.random.uniform(0, 1, (100, 3)), columns=num_features),
    pd.DataFrame(np.random.choice(['a', 'b'], (100, 3)), columns=cat_features)
], axis=1)
X_train = X.iloc[:50, :]
X_test = X.iloc[50:, :]
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
num_features = ['num_feat_1',
                'num_feat_2',
                'num_feat_3']
cat_features = ['cat_feat_1',
                'cat_feat_2',
                'cat_feat_3']
ct = ColumnTransformer([
    ("scaler", StandardScaler(), num_features),
    ("onehot", OneHotEncoder(sparse=False,
                             handle_unknown='ignore'), cat_features)],
    remainder='passthrough')
ct.fit(X_train)
We try this:
tx = ct.get_params()['transformers']
feature_names = []
for name, transformer, features in tx:
    try:
        Var = ct.named_transformers_[name].get_feature_names().tolist()
    except AttributeError:
        Var = features
    feature_names = feature_names + Var
feature_names
['num_feat_1',
 'num_feat_2',
 'num_feat_3',
 'x0_a',
 'x0_b',
 'x1_a',
 'x1_b',
 'x2_a',
 'x2_b']
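On newer scikit-learn releases (1.0 and later), ColumnTransformer itself implements get_feature_names_out(), which also handles transformers like StandardScaler by passing the input names through (note the returned names are prefixed with the transformer name, e.g. 'scaler__num_feat_1'):
# Requires scikit-learn >= 1.0; one call on the fitted transformer.
feature_names = ct.get_feature_names_out()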

How to tune quantile_range in RobustScaler in sklearn Pipeline?

pipeline = Pipeline([
    ('scale', RobustScaler(quantile_range=())),
    ('classify', OneVsRestClassifier(SVC()))
],
    memory=self.memory)
Given that pipeline, how to tune the quantile_range in RobustScaler using GridSearchCV? The default quantile_range is (25.0, 75.0). Alternatives I want to try are something like (5.0, 95.0), (10.0, 90.0), ..., (25.0, 75.0). How to achieve that?
I guess the params_grid should look like this:
params_grid = [{'scale__quantile_range': ??}]
But I don't know what to put into the question mark placeholder.
The hyperparameter values to try should be given as an iterable. Try:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
pipeline = Pipeline([
    ('scale', RobustScaler(quantile_range=())),
    ('classify', OneVsRestClassifier(SVC()))
],
    memory=None)
params = {"scale__quantile_range": [(25.0, 75.0), (10.0, 90.0), (1.0, 99.0)]}
grid_cf = GridSearchCV(pipeline, param_grid=params)
X, y = make_classification(1000, 10, n_classes=2, random_state=42)
grid_cf.fit(X, y)
grid_cf.best_params_
{'scale__quantile_range': (1.0, 99.0)}
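As a follow-up on the design, the candidate pairs can also be generated programmatically instead of typed out, e.g. the (5.0, 95.0), (10.0, 90.0), ..., (25.0, 75.0) series from the question:
# Symmetric quantile pairs in steps of 5, as described in the question.
quantile_pairs = [(float(q), float(100 - q)) for q in range(5, 30, 5)]
params = {"scale__quantile_range": quantile_pairs}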

LabelEncoder in sklearn_pandas mapper with pipeline after cross_val_score returns error

I have a strange error that I could not understand.
I have this data:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation in older releases
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper

test = pd.DataFrame({"a": ['a','c','-','9','c','a','a','c','b','i','c','r'],
                     "b": [0,0,1,0,0,1,0,0,1,0,0,1]})
Then I make a DataFrameMapper():
Mapper = DataFrameMapper([ ('a', LabelEncoder()) ])
Then a Pipeline():
pipeline = Pipeline([('featurize', Mapper),('forest',RandomForestClassifier())])
X = test[test.columns.drop('b')]
y = test['b']
model = pipeline.fit(X = X, y = y)
Everything works fine; I can predict with this model.
But when I do cross_val_score:
cross_val_score(pipeline, X, y, scoring='accuracy', cv=2)
it returns an error:
a: y contains new labels: ['-' '9']
How can I avoid this, or why does it work this way? I thought that LabelEncoder fits the data first, and then the cross-validation runs. I have tried to fit the encoder first,
enc = LabelEncoder()
enc.fit(test['a'])
on the entire column and then insert it into the Mapper, but it doesn't work.
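One common workaround, as a hedged sketch rather than a definitive fix: use an encoder that tolerates categories unseen during a fold's fit, such as OneHotEncoder(handle_unknown='ignore'); passing the column as a list (['a']) makes sklearn_pandas hand the transformer 2-D input:
# Sketch: unseen labels like '-' or '9' become all-zero rows at transform
# time instead of raising, so 2-fold CV no longer fails on rare values.
from sklearn.preprocessing import OneHotEncoder
Mapper = DataFrameMapper([(['a'], OneHotEncoder(handle_unknown='ignore'))])
pipeline = Pipeline([('featurize', Mapper), ('forest', RandomForestClassifier())])
cross_val_score(pipeline, X, y, scoring='accuracy', cv=2)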
