Is there a way we can grid-search multiple estimators at a time in Sklearn or any other library. For example can we pass SVM and Random Forest in one grid search ?.
Yes. Example:
pipeline = Pipeline([
('vect', CountVectorizer()),
('clf', SGDClassifier()),
])
parameters = [
{
'vect__max_df': (0.5, 0.75, 1.0),
'clf': (SGDClassifier(),),
'clf__alpha': (0.00001, 0.000001),
'clf__penalty': ('l2', 'elasticnet'),
'clf__n_iter': (10, 50, 80),
}, {
'vect__max_df': (0.5, 0.75, 1.0),
'clf': (LinearSVC(),),
'clf__C': (0.01, 0.5, 1.0)
}
]
grid_search = GridSearchCV(pipeline, parameters)
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
class DummyEstimator(BaseEstimator):
def fit(self): pass
def score(self): pass
# Create a pipeline
pipe = Pipeline([('clf', DummyEstimator())]) # Placeholder Estimator
# Candidate learning algorithms and their hyperparameters
search_space = [{'clf': [LogisticRegression()], # Actual Estimator
'clf__penalty': ['l1', 'l2'],
'clf__C': np.logspace(0, 4, 10)},
{'clf': [DecisionTreeClassifier()], # Actual Estimator
'clf__criterion': ['gini', 'entropy']}]
# Create grid search
gs = GridSearchCV(pipe, search_space)
I think what you were looking for is this:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
names = [
"Naive Bayes",
"Linear SVM",
"Logistic Regression",
"Random Forest",
"Multilayer Perceptron"
]
classifiers = [
MultinomialNB(),
LinearSVC(),
LogisticRegression(),
RandomForestClassifier(),
MLPClassifier()
]
parameters = [
{'vect__ngram_range': [(1, 1), (1, 2)],
'clf__alpha': (1e-2, 1e-3)},
{'vect__ngram_range': [(1, 1), (1, 2)],
'clf__C': (np.logspace(-5, 1, 5))},
{'vect__ngram_range': [(1, 1), (1, 2)],
'clf__C': (np.logspace(-5, 1, 5))},
{'vect__ngram_range': [(1, 1), (1, 2)],
'clf__max_depth': (1, 2)},
{'vect__ngram_range': [(1, 1), (1, 2)],
'clf__alpha': (1e-2, 1e-3)}
]
for name, classifier, params in zip(names, classifiers, parameters):
clf_pipe = Pipeline([
('vect', TfidfVectorizer(stop_words='english')),
('clf', classifier),
])
gs_clf = GridSearchCV(clf_pipe, param_grid=params, n_jobs=-1)
clf = gs_clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print("{} score: {}".format(name, score))
You can use TransformedTargetRegressor.
This class is designed for transforming the target variable before fitting, taking a regressor and a set of transformers as parameters. But you may give no transformer, then the identity transformer (i.e. no transformation) is applied. Since regressor is a class parameter, we can change it by grid search objects.
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
Y = np.array([1,2,3,4,5,6,7,8,9,10])
X = np.array([0,1,3,5,3,5,7,9,8,9]).reshape((-1, 1))
For doing grid search, we should specify the param_grid as a list of dict, each for different estimator. This is because different estimators use different set of parameters (e.g. setting fit_intercept with MLPRegressor causes error).
Note that the name "regressor" is automatically given to the regressor.
model = TransformedTargetRegressor()
params = [
{
"regressor": [LinearRegression()],
"regressor__fit_intercept": [True, False]
},
{
"regressor": [MLPRegressor()],
"regressor__hidden_layer_sizes": [1, 5, 10]
}
]
We can fit as usual.
g = GridSearchCV(model, params)
g.fit(X, Y)
g.best_estimator_, g.best_score_, g.best_params_
# results in like
(TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
regressor=LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
normalize=False),
transformer=None),
-0.419213380219391,
{'regressor': LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
normalize=False), 'regressor__fit_intercept': False})
What you can do is create a class that takes in any classifier and for each classifier any setting of parameters.
Create a switcher class that works for any estimator
from sklearn.base import BaseEstimator
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = SGDClassifier(),
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
def fit(self, X, y=None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y=None):
return self.estimator.predict(X)
def predict_proba(self, X):
return self.estimator.predict_proba(X)
def score(self, X, y):
return self.estimator.score(X, y)
Now you can pre-train your tfidf however you like.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit(data, labels)
Now create a pipeline with this pre-trained tfidf
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
('tfidf',tfidf), # Already pretrained/fit
('clf', ClfSwitcher()),
])
Perform hyper-parameter optimization
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
parameters = [
{
'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
'clf__estimator__max_iter': [50, 80],
'clf__estimator__tol': [1e-4],
'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
},
{
'clf__estimator': [MultinomialNB()],
'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
},
]
gscv = GridSearchCV(pipeline, parameters, cv=5, n_jobs=12, verbose=3)
# param optimization
gscv.fit(train_data, train_labels)
How to interpret clf__estimator__loss
clf__estimator__loss is interpreted as the loss parameter for whatever estimator is, where estimator = SGDClassifier() in the top most example and is itself a parameter of clf which is a ClfSwitcher object.
Related
I am just a beginner in data analysis. I want to use 'Cross-validation Grid Search method" to determine the parameters gamma and C of the Radial Basis Function (RBF) kernel SVM. I don't know where I should put my data on this code, and what data type I should use (training or target data)?
For SVR
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error,explained_variance_score
from TwoStageTrAdaBoostR2 import TwoStageTrAdaBoostR2 # import the two-stage algorithm
from sklearn import preprocessing
from sklearn import svm
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from matplotlib.colors import Normalize
from sklearn.svm import SVC
# Data import (source)
source= pd.read_csv(sourcedata)
# Data import (target)
data= pd.read_csv(targetdata)
# Sample Size
datatrain = data.sample(n=60, random_state=1)
datatest = data[~dataL.index.isin(data.index)]
# Merge training set data (source and target)
train = pd.concat([source, datatrain], sort=False)
train.reset_index(inplace=True, drop=True)
datatest.reset_index(inplace=True, drop=True)
# Variable input
X_train, y_train = train[['x1', 'x2']].values, train['y'].values
X_test, y_test = FL[['x1', 'x2']].values, FL['y'].values
# Parameter setting
#sample_size = [n_source1+n_source2+n_source3+n_source4+n_source5, n_target_train]
n_estimators = 100
steps = 8
fold = 5
random_state = np.random.RandomState(1)
sample_size = [350, 60]
#1 twostage tradaboost.r2
regr_1 = TwoStageTrAdaBoostR2(SVR(C=50, gamma='auto'),
n_estimators = n_estimators, sample_size = sample_size,
steps = steps, fold = fold,
random_state = random_state)
regr_1.fit(X_train, y_train)
y_pred1 = regr_1.predict(X_test)
print("MSE of regular two stage trAdaboostR2--model1:",sqrt(mean_squared_error(y_test, y_pred1)))
#Plot the results
plt.figure()
plt.scatter(y_test, y_test-y_pred1, c="black", label="TwoStageTrAdaBoostR2_model1", s=10)
plt.xlabel("CAR")
plt.ylabel("Err")
plt.title("Two-stage Transfer Learning Boosted Decision Tree Regression", loc='left', fontsize=12, fontweight=0, color="orange")
plt.legend()
plt.show()
for cross-validation grid search methods(best parameters):
# Cross validation grid search (best parameters)
parameter_candidates = [
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['linear']},
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters, c=5 ,n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
For visualization of parameter effects
c_range = np.logspace(-2, 2, 4)
gamma_range = np.logspace(-2, 2, 5)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=2,n_jobs=-1,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for rbf kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
class MidpointNormalize(Normalize):
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return np.ma.masked_array(np.interp(value, x, y))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(c_range)), c_range)
plt.title('Validation accuracy')
plt.show()
When I used this code, I found the following output Heatmap plot!
But I am trying to get a Heatmap like this one
The following code with some typical regression data should work all the way through:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV,train_test_split
from matplotlib.colors import Normalize
class MidpointNormalize(Normalize):
def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
self.midpoint = midpoint
Normalize.__init__(self, vmin, vmax, clip)
def __call__(self, value, clip=None):
x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
return np.ma.masked_array(np.interp(value, x, y))
X, y = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
# Cross validation grid search (best parameters)
c_range = np.logspace(-0, 4, 8)
gamma_range = np.logspace(-4, 0, 8)
tuned_parameters = [{'kernel': ['rbf'],'C': c_range,'gamma':gamma_range},
{'kernel': ['linear'], 'C': c_range,'gamma':gamma_range}]
svr = svm.SVR()
clf = GridSearchCV(svr,param_grid=tuned_parameters,verbose=20,n_jobs=-4,cv=4,
scoring='explained_variance')
clf.fit(X_train, y_train)
print('Best score for data:', clf.best_score_)
print('Best C:',clf.best_estimator_.C)
print('Best Kernel:',clf.best_estimator_.kernel)
print('Best Gamma:',clf.best_estimator_.gamma)
# scores for rbf kernel
n = len(gamma_range)*len(c_range)
scores_rbf = clf.cv_results_['mean_test_score'][:n].reshape(len(gamma_range),
len(c_range))
# scores for rbf kernel
scores_linear = clf.cv_results_['mean_test_score'][n:].reshape(len(gamma_range),
len(c_range))
plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores_rbf, interpolation='nearest', cmap=plt.cm.hot,
norm=MidpointNormalize(vmin=-.2, midpoint=0.5))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)),
[np.format_float_scientific(i,1) for i in gamma_range],rotation=45)
plt.yticks(np.arange(len(c_range)),
[np.format_float_scientific(i,) for i in c_range])
plt.title('Validation accuracy')
plt.show()
The granularity of the grid is very low but it takes some time run otherwise. Also the limits of the grid will need to be more educated that the ones I chose.
I'm not sure why you get the error you get but I kept things simple and initiated the SVR once in my snippet so you can see how it works. I've also used different lengths for the C and gamma arrays that's just to show how these parameters are carried through. Sometimes I find that if everything has the same length is difficult to see which parameter is responsible for what.
The final plot looks like that but this depends heavily on the range of the grid, its granularity and the dataset that you are working with. Also note that I change the parameters of the MidpointNormalize class you provided.
I ran across an example on parameters tuning with Grid search and text data using TfidfVectorizer() in the pipeline.
As far as I've understood is that when we call grid_search.fit(X_train, y_train) it will transform the data then fit the model as it is described in a dictionary. However during the evaluation, I'm a bit confused with the test dataset, since when we call grid_search.predict(X_test) I don't know whether/(how) the TfidfVectorizer() is applied on this test chunk.
Thanks
David
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_
score
pipeline = Pipeline([
('vect', TfidfVectorizer(stop_words='english')),
('clf', LogisticRegression())
])
parameters = {
'vect__max_df': (0.25, 0.5, 0.75),
'vect__stop_words': ('english', None),
'vect__max_features': (2500, 5000, 10000, None),
'vect__ngram_range': ((1, 1), (1, 2)),
'vect__use_idf': (True, False),
'vect__norm': ('l1', 'l2'),
'clf__penalty': ('l1', 'l2'),
'clf__C': (0.01, 0.1, 1, 10),
}
if __name__ == "__main__":
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,
verbose=1, scoring='accuracy', cv=3)
df = pd.read_csv('data/sms.csv')
X, y, = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y)
grid_search.fit(X_train, y_train)
print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
print '\t%s: %r' % (param_name, best_parameters[param_name])
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Precision:', precision_score(y_test, predictions)
print 'Recall:', recall_score(y_test, predictions)
This is an example of scikit-learn pipelines magic. It works like this:
First, you define elements of a pipeline with Pipeline constructor - all data, whether on train or test (predict) stage, will be processed through all the defined steps - in this case by TfidfVectorizer and then the output will be passed to LogisticRegression model.
Passing defined pipeline to GridSearchCV constructor allows you to use the method fit, that not only performs grid search but also internally sets both TfidfVectorizer and LogisticRegression to best found parameters, so later running predict does so on best-found models.
You can find more info on creating pipelines in scikit-learn documentation.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.2, random_state = 10)
You have to preprocess data before feeding your model. Here is a complete working example. First, let's import the required modules:
from datetime import datetime
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
Then, define the training data:
X = ['17:00','17:05', '17:10', '17:15', '17:20', '17:25']
X = np.array(X).reshape(-1, 1)
y = [1, 0, 1, 1, 0, 1]
Note, the X must be 2D array. Also, you have to convert time string values to the numerical format. One way to do it is to convert strings to timestamp using the builtin datetime module. Here is a function which will be used to transform the data:
def transform(X, y=None):
X_new = np.apply_along_axis(
lambda x: [datetime.strptime(x[0], '%H:%M').timestamp()],
axis=1,
arr=X)
return X_new
Don't forget to scale your data since SVC models require data normalization. One can easily combine all the preprocessing steps using the Pipeline:
pipeline = Pipeline(steps=[
('transformer', FunctionTransformer(transform, validate=False)),
('scaler', MinMaxScaler()),
('predictor', SVC(kernel='linear'))
])
Finally, let's fit the model:
print('Build and fit a model...')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print('Done. Score', score)
I am trying to figure out how to define the paramerer grid for the a MLPR with two hidden layers for input into RandomSearchCV in SkLearn?
Below is what I have been trialing. So, how can I randomise the hidden_layer_sizes for RandomSearchCV?
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import RandomizedSearchCV
boston = load_boston()
X = boston.data
y = boston.target
params = {'activation':['logistic', 'relu'],
'learning_rate':['adaptive'],
'alpha':np.logspace(0.0001, 100, 10),
'max_iter':[1000],
'hidden_layer_sizes':[(10,10), (30,10), (50,20), (60,30)]}
reg = MLPRegressor()
random_search = RandomizedSearchCV(estimator = reg,
param_distributions=params,
n_iter=10,
scoring = 'neg_mean_squared_error',
cv=3,
n_jobs = -3,
pre_dispatch = '2*n_jobs',
return_train_score = True)
random_search.fit(X,y)
df = pd.DataFrame(random_search.cv_results_)
df['train_RMSE'] = np.sqrt(-df['mean_train_score'])
df['test_RMSE'] = np.sqrt(-df['mean_test_score'])
print(random_search.best_params_)
PS: If anyone also has any comments on my selection of parameters then please feel free to comment. The parameters are to be used for a regression problem with up to 7 inputs.
Any ideas?
Yes, you did it right. In addition you can set the verbose level to see the used hyper parameters of the last cross validation, e.g. [CV] activation=tanh, alpha=1e+100, hidden_layer_sizes=(30, 10), score=-4.180054117738231, total= 2.7s.
I chose a GridSearchCV instead of a RandomizedSearchCV to find the best parameter set and on my machine it took five minutes.
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import explained_variance_score
X, y = load_boston(return_X_y=True)
# Split data for final evaluation:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)
# Define base regressor:
base_reg = MLPRegressor(learning_rate='adaptive', max_iter=5000, random_state=42)
# Define search space:
params = {
'activation': ['logistic', 'relu', 'tanh'], # <-- added 'tanh' as third non-linear activation function
'alpha': np.logspace(0.0001, 100, 10),
'hidden_layer_sizes': [
(10, 10), (20, 10), (30, 10),
(40, 10), (90, 10), (90, 30, 10) # <-- added more neurons or layers
]
}
# Find best hyper params and then refit on all training data:
reg = GridSearchCV(estimator=base_reg, param_grid=params,
n_jobs=8, cv=3, refit=True, verbose=5) # <-- verbose=5
reg.fit(X_train, y_train)
print(reg.best_estimator_)
# MLPRegressor(activation='logistic', alpha=1.0002302850208247,
# batch_size='auto', beta_1=0.9, beta_2=0.999, early_stopping=False,
# epsilon=1e-08, hidden_layer_sizes=(30, 10),
# learning_rate='adaptive', learning_rate_init=0.001, max_iter=5000,
# momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
# power_t=0.5, random_state=42, shuffle=True, solver='adam',
# tol=0.0001, validation_fraction=0.1, verbose=False,
# warm_start=False)
print(reg.best_params_)
# {'activation': 'logistic', 'alpha': 1.0002302850208247, 'hidden_layer_sizes': (30, 10)}
# Evaluate on unseen test data:
err = explained_variance_score(y_train, reg.predict(X_train))
print(err) # 0.8936815412058757
err = explained_variance_score(y_test, reg.predict(X_test))
print(err) # 0.801353064635174
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
I have X and Y data.
data = load_iris()
X = data.data
Y = data.target
I would like to implement RFECV feature selection and prediction with k-fold validation approach.
code corrected from the answer # https://stackoverflow.com/users/3374996/vivek-kumar
clf = RandomForestClassifier()
kf = KFold(n_splits=2, shuffle=True, random_state=0)
estimators = [('standardize' , StandardScaler()),
('clf', clf)]
class Mypipeline(Pipeline):
#property
def coef_(self):
return self._final_estimator.coef_
#property
def feature_importances_(self):
return self._final_estimator.feature_importances_
pipeline = Mypipeline(estimators)
rfecv = RFECV(estimator=pipeline, cv=kf, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
print ('no. of selected features =', rfecv_data.n_features_)
EDIT (for small remaining):
X_new = rfecv.transform(X)
print X_new.shape
y_predicts = cross_val_predict(clf, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
print ('accuracy =', accuracy)
Instead of wrapping StandardScaler and RFECV in a same pipeline, do that for StandardScaler and RandomForestClassifier and pass that pipeline to the RFECV as an estimator. In this no traininf info will be leaked.
estimators = [('standardize' , StandardScaler()),
('clf', RandomForestClassifier())]
pipeline = Pipeline(estimators)
rfecv = RFECV(estimator=pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
Update: About the error 'RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes'
Yes thats a known issue in scikit-learn pipeline. You can look at my other answer here for more details and use the new pipeline I created there.
Define a custom pipeline like this:
class Mypipeline(Pipeline):
#property
def coef_(self):
return self._final_estimator.coef_
#property
def feature_importances_(self):
return self._final_estimator.feature_importances_
And use that:
pipeline = Mypipeline(estimators)
rfecv = RFECV(estimator=pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
Update 2:
#brute, For your data and code, the algorithms completes within a minute on my PC. This is the complete code I use:
import numpy as np
import glob
from sklearn.utils import resample
files = glob.glob('/home/Downloads/Untitled Folder/*')
outs = []
for fi in files:
data = np.genfromtxt(fi, delimiter='|', dtype=float)
data = data[~np.isnan(data).any(axis=1)]
data = resample(data, replace=False, n_samples=1800, random_state=0)
outs.append(data)
X = np.vstack(outs)
print X.shape
Y = np.repeat([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1800)
print Y.shape
#from sklearn.utils import shuffle
#X, Y = shuffle(X, Y, random_state=0)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
clf = RandomForestClassifier()
kf = KFold(n_splits=10, shuffle=True, random_state=0)
estimators = [('standardize' , StandardScaler()),
('clf', RandomForestClassifier())]
class Mypipeline(Pipeline):
#property
def coef_(self):
return self._final_estimator.coef_
#property
def feature_importances_(self):
return self._final_estimator.feature_importances_
pipeline = Mypipeline(estimators)
rfecv = RFECV(estimator=pipeline, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
print ('no. of selected features =', rfecv_data.n_features_)
Update 3: For cross_val_predict
X_new = rfecv.transform(X)
print X_new.shape
# Here change clf to pipeline,
# because RFECV has found features according to scaled data,
# which is not present when you pass clf
y_predicts = cross_val_predict(pipeline, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
print ('accuracy =', accuracy)
Here's how we'll do it:
Fit on the training set
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
X = data.data, Y = data.target
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True)
# create model
clf = RandomForestClassifier()
# instantiate K-Fold
kf = KFold(n_splits=10, shuffle=True, random_state=0)
# pipeline estimators
estimators = [('standardize' , StandardScaler()),
('rfecv', RFECV(estimator=clf, cv=kf, scoring='accuracy'))]
# instantiate pipeline
pipeline = Pipeline(estimators)
# fit rfecv to train model
rfecv_model = rfecv_model = pipeline.fit(X_train, y_train)
# print number of selected features
print ('no. of selected features =', pipeline.named_steps['rfecv'].n_features_)
# print feature ranking
print ('ranking =', pipeline.named_steps['rfecv'].ranking_)
'Output':
no. of selected features = 3
ranking = [1 2 1 1]
Predict on the test set
# make predictions on the test set
predictions = rfecv_model.predict(X_test)
# evaluate the model performance using accuracy metric
print("Accuracy on test set: ", accuracy_score(y_test, predictions))
'Output':
Accuracy: 0.9736842105263158