pipeline = Pipeline([
    ('scale', RobustScaler(quantile_range=())),
    ('classify', OneVsRestClassifier(SVC()))
], memory=self.memory)
Given that pipeline, how do I tune the quantile_range in RobustScaler using GridSearchCV? The default quantile_range is (25.0, 75.0); the alternatives I want to try are something like (5.0, 95.0), (10.0, 90.0), ..., (25.0, 75.0). How can I achieve that?
I guess params_grid should look like this:
params_grid = [{'scale__quantile_range': ??}]
But I don't know what to put into the question mark placeholder.
The hyperparameter values to try should be given as an iterable (here, a list of tuples). Try:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
pipeline = Pipeline([
    ('scale', RobustScaler()),  # quantile_range is supplied by the grid below
    ('classify', OneVsRestClassifier(SVC()))
], memory=None)
params = {"scale__quantile_range":[(25.0,75.0),(10.0,90.0),(1.0,99.0)]}
grid_cf = GridSearchCV(pipeline, param_grid=params)
X,y = make_classification(1000,10,n_classes=2,random_state=42)
grid_cf.fit(X,y)
grid_cf.best_params_
{'scale__quantile_range': (1.0, 99.0)}
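If you want the full series from the question, (5.0, 95.0), (10.0, 90.0), ..., (25.0, 75.0), rather than hand-written tuples, you can generate the list with a comprehension (a small sketch; the 5-point step is the spacing implied by the question):
# Symmetric quantile ranges: (5.0, 95.0), (10.0, 90.0), ..., (25.0, 75.0)
params = {"scale__quantile_range": [(q, 100.0 - q) for q in (5.0, 10.0, 15.0, 20.0, 25.0)]}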
This question is about double-CV, or nested CV. The simplest example would be:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
gcv = GridSearchCV(RandomForestRegressor(), param_grid={"n_estimators": [5, 10]})
score_ = cross_val_score(gcv, X, y, cv=5)
I have no question about this part.
So, for wrapper-type feature selection there is a method that evaluates with CV (RFECV) and a method that evaluates on all the data (RFE). Is RFE the correct one to use inside a pipeline? This is my first question.
from sklearn.feature_selection import RFE, RFECV
rfr = RandomForestRegressor()
pipe = Pipeline([("selector", RFE(estimator=rfr)), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators": [5, 10]})
score_ = cross_val_score(gcv, X, y, cv=5)
I feel that the code below, with RFECV, will result in triple-CV and increase the amount of computation.
from sklearn.feature_selection import RFE, RFECV
pipe = Pipeline([("selector", RFECV(rfr, cv=5)), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators":[5,10]})
score_ = cross_val_score(gcv , X,y,cv=5)
Next, for SequentialFeatureSelector, which only has a CV-based evaluation method, what code is correct for double-CV?
from sklearn.feature_selection import SequentialFeatureSelector
estimator_in_selector = RandomForestRegressor()
sfs = SequentialFeatureSelector(estimator_in_selector, cv=5)
pipe = Pipeline([("selector", sfs), ("estimator", rfr)])
gcv = GridSearchCV(pipe, param_grid={"estimator__n_estimators": [5, 10]}, cv=5)
score_ = cross_val_score(gcv, X, y, cv=5)
If we consider a more complicated case,
from sklearn.feature_selection import SequentialFeatureSelector
estimator_in_selector = RandomForestRegressor()
sfs = SequentialFeatureSelector(estimator_in_selector, cv=5)
pipe = Pipeline([("selector", sfs), ("estimator", rfr)])
param_grid = {"selector__n_features_to_select": [3, 5],
              "selector__estimator__n_estimators": [10, 50],
              "estimator__n_estimators": [10, 50]}
gcv = GridSearchCV(pipe, param_grid=param_grid)
score_ = cross_val_score(gcv, X, y, cv=5)  # gcv, not pipe, to get nested CV
And also: what about when using a genetic algorithm?
from sklearn_genetic import GAFeatureSelectionCV
selector = GAFeatureSelectionCV(rfr, cv=5)
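For reference, a minimal sketch of fitting GAFeatureSelectionCV on its own, following the sklearn-genetic-opt documentation (the best_features_ attribute and the manual masking convention are assumptions about that library's API and may differ across versions):
from sklearn_genetic import GAFeatureSelectionCV
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()
selector = GAFeatureSelectionCV(rfr, cv=5)
selector.fit(X, y)              # genetic search with internal 5-fold CV
mask = selector.best_features_  # boolean mask over the columns of X
# A downstream estimator would then be fit on X[:, mask]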
I am struggling with a machine learning project in which I am trying to combine:
a scikit-learn ColumnTransformer to apply different transformers to my numerical and categorical features
a pipeline to apply my different transformers and estimators
a GridSearchCV to search for the best parameters.
As long as I fill in the parameters of my different transformers manually in my pipeline, the code works perfectly.
But as soon as I try to pass lists of different values to compare in my grid search parameters, I get all kinds of "invalid parameter" error messages.
Here is my code:
First I divide my features into numerical and categorical:
import numpy as np
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
numerical_features = make_column_selector(dtype_include=np.number)
cat_features = make_column_selector(dtype_exclude=np.number)
Then I create 2 different preprocessing pipelines for numerical and categorical features:
numerical_pipeline = make_pipeline(KNNImputer())
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                             OneHotEncoder(handle_unknown='ignore'))
I combined both into another pipeline, set my parameters, and ran my GridSearchCV code:
model = make_pipeline(preprocessor, LinearRegression())
params = {
    'columntransformer__numerical_pipeline__knnimputer__n_neighbors': [1, 2, 3, 4, 5, 6, 7]
}
grid = GridSearchCV(model, param_grid=params, scoring='r2', cv=10)
cv = KFold(n_splits=5)
all_accuracies = cross_val_score(grid, X, y, cv=cv, scoring='r2')
I tried different ways to declare the parameters but never found the proper one; I always get an "invalid parameter" error message.
Could you please help me understand what went wrong?
Many thanks for your support, and take good care!
I am assuming that you defined preprocessor as follows:
preprocessor = Pipeline([('numerical_pipeline', numerical_pipeline),
                         ('cat_pipeline', cat_pipeline)])
Then you need to change your param name to the following:
pipeline__numerical_pipeline__knnimputer__n_neighbors
But there are a couple of other problems with the code:
You don't have to call cross_val_score after performing GridSearchCV; the fitted GridSearchCV itself holds the cross-validation result for each combination of hyperparameters (in cv_results_).
KNNImputer will not work while your data still contains string columns, so you need to apply cat_pipeline before numerical_pipeline.
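A quick way to list every parameter name the grid search will accept (standard scikit-learn API) is:
# Prints all tunable names, including nested step prefixes like
# 'pipeline__numerical_pipeline__knnimputer__n_neighbors'.
print(sorted(model.get_params().keys()))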
Complete example:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

X = pd.DataFrame({'city': ['London', 'London', 'Paris', np.nan],
                  'rating': [5, 3, 4, 5]})
y = [1, 0, 1, 1]
numerical_features = make_column_selector(dtype_include=np.number)
cat_features = make_column_selector(dtype_exclude=np.number)
numerical_pipeline = make_pipeline(KNNImputer())
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                             OneHotEncoder(handle_unknown='ignore', sparse=False))
preprocessor = Pipeline([('cat_pipeline', cat_pipeline),
                         ('numerical_pipeline', numerical_pipeline)])
model = make_pipeline(preprocessor, LinearRegression())
params = {
    'pipeline__numerical_pipeline__knnimputer__n_neighbors': [1, 2]
}
grid = GridSearchCV(model, param_grid=params, scoring='r2', cv=2)
grid.fit(X, y)
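As an aside, a more common pattern for mixed numerical/categorical data is a ColumnTransformer, which applies each sub-pipeline to its own columns side by side instead of sequentially, so KNNImputer never sees the string columns. A minimal sketch (the step names 'num' and 'cat' are my own choice; the 'columntransformer' prefix is the name make_pipeline derives from the class):
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),  # KNNImputer on numeric columns only
    ('cat', cat_pipeline, cat_features),              # impute + one-hot on the rest
])
model = make_pipeline(preprocessor, LinearRegression())
params = {'columntransformer__num__knnimputer__n_neighbors': [1, 2]}
grid = GridSearchCV(model, param_grid=params, scoring='r2', cv=2)
grid.fit(X, y)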
I am new to machine learning and am trying to solve a housing-prices problem from a Kaggle competition. I am trying to run this code and fit this model, but it outputs an error. Please help and explain, as I am a novice. Thanks in advance.
I tried searching on Google, but it only shows a multiclass error I don't understand and suggests "mlogloss" or "merror" as the solution.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from learntools.core import *
from xgboost import XGBRegressor
iowa_file_path = '../input/train.csv'
home_data = pd.read_csv(iowa_file_path)
y = home_data.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
iowa_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
iowa_model.fit(train_X, train_y, early_stopping_rounds=5,
               eval_set=[(train_X, val_y)], verbose=False)
You've got a typo: eval_set pairs the training features with the validation labels, so the number of rows in train_X doesn't match the number of labels in val_y. Pair val_X with val_y instead:
iowa_model.fit(train_X, train_y, early_stopping_rounds=5,
               eval_set=[(val_X, val_y)], verbose=False)
I have a strange error that I cannot understand.
I have this data:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in newer versions
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper
test = pd.DataFrame({"a": ['a','c','-','9','c','a','a','c','b','i','c','r'],
"b": [0,0,1,0,0,1, 0,0,1,0,0,1] })
Then I make a DataFrameMapper():
Mapper = DataFrameMapper([('a', LabelEncoder())])
Then a Pipeline():
pipeline = Pipeline([('featurize', Mapper), ('forest', RandomForestClassifier())])
X = test[test.columns.drop('b')]
y = test['b']
model = pipeline.fit(X = X, y = y)
Everything works fine; I can predict with this model.
But when I do cross_val_score:
cross_val_score(pipeline, X, y, scoring='accuracy', cv=2)
It returns an error:
a: y contains new labels: ['-' '9']
How can I avoid this, and why does it work this way? I thought that LabelEncoder was fitted on the data first and the cross-validation ran afterwards. I have tried to fit the encoder first
enc = LabelEncoder()
enc.fit(test['a'])
on the entire column and then insert it into the Mapper, but it doesn't work.
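The reason is that cross_val_score refits a clone of the whole pipeline on each training fold, so the LabelEncoder only learns the labels present in that fold; when the held-out fold contains categories it never saw ('-' and '9' here), transform fails, and pre-fitting the encoder doesn't help because the clone is refit from scratch. A minimal sketch of one standard workaround, swapping in OneHotEncoder(handle_unknown='ignore'), which is designed for feature columns (the ColumnTransformer wiring is my own suggestion, not part of the original code):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder can ignore categories absent from the training fold,
# unlike LabelEncoder, which is intended for target labels.
featurize = ColumnTransformer([('a', OneHotEncoder(handle_unknown='ignore'), ['a'])])
pipeline = Pipeline([('featurize', featurize),
                     ('forest', RandomForestClassifier())])
cross_val_score(pipeline, X, y, scoring='accuracy', cv=2)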
I'm trying to modify my dataframe to replace all categorical attributes with sparse matrices. I combined 3 pipelines using a FeatureUnion. It works perfectly when I use fit_transform, but gives me an error when I use just fit instead.
I want to train this pipeline to use it later on the test dataset; that's why I need the fit step on its own.
I'm using Python 3.
import pandas as pd
import numpy as np
data = [[3, 4, 'WN', 'DEN', 'SNA', 2],
        [6, 1, 'WN', 'FLL', 'DAL', 1],
        [6, 1, 'WN', 'FLL', 'DAL', 1],
        [6, 1, 'WN', 'FLL', 'DAL', 1],
        [6, 1, 'WN', 'FLL', 'DAL', 1],
        [6, 1, 'WN', 'FLL', 'DAL', 1]]
df = pd.DataFrame(data, columns = ['MONTH','DAY_OF_WEEK','AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','SCHEDULED_DEPARTURE'])
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values
MONTH_pipeline = Pipeline([
    ('selector', DataFrameSelector(['MONTH'])),
    ('label_binarizer', LabelBinarizer()),
])
DAY_OF_WEEK_pipeline = Pipeline([
    ('selector', DataFrameSelector(['DAY_OF_WEEK'])),
    ('label_binarizer', LabelBinarizer()),
])
AIRLINE_pipeline = Pipeline([
    ('selector', DataFrameSelector(['AIRLINE'])),
    ('label_binarizer', LabelBinarizer()),
])
full_pipeline = FeatureUnion(transformer_list=[
    ('MONTH_pipeline', MONTH_pipeline),
    ('DAY_OF_WEEK_pipeline', DAY_OF_WEEK_pipeline),
    ('AIRLINE_pipeline', AIRLINE_pipeline),
])
train_set_prepared = full_pipeline.fit_transform(df)
full_pipeline.fit(df)
The first command, using fit_transform, works perfectly and gives the desired answer, but the second one, using just fit, gives an error. I would greatly appreciate it if someone could help me understand why.
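For what it's worth, the asymmetry comes from LabelBinarizer being designed for target labels: its fit accepts a single array, so Pipeline.fit, which always passes (X, y), breaks it, while the generic fit_transform path happens to drop the y=None argument in the scikit-learn versions this code targets. A minimal sketch of the usual fix, replacing LabelBinarizer with OneHotEncoder in each sub-pipeline (this swap is my suggestion and assumes scikit-learn >= 0.20, where OneHotEncoder accepts string input directly):
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder implements fit(X, y=None) like any feature transformer,
# so both fit and fit_transform work inside Pipeline/FeatureUnion.
AIRLINE_pipeline = Pipeline([
    ('selector', DataFrameSelector(['AIRLINE'])),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])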