using class weights with sklearn votingClassifier - scikit-learn

I have an imbalance dataset for a classification problem. My target variable is binary and has two category.
I implemented Random Forest and Logistic Regression by assigning class_weights as parameter.
When I fit data to random forest and logistic regression separately it works fine. But when I use Voting Classifier on random forest and logistic regression from sklearn.ensemble to fit on the data it gives error Class label no_payment not present. I need to take ensemble of 3 or more models. I have check that this error is not because of naive_bayes implemented in the code.
My code:
rf_param = { 'class_weight': {'no_payment': 1, 'payment': 3},'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 30, 'min_samples_split': 15, 'n_estimators': 100}
lr_param = {'C': 0.1, 'class_weight': {'no_payment': 1, 'payment': 3}, 'fit_intercept': False, 'penalty': 'l2'}
rf = ensemble.RandomForestClassifier(**rf_param)
lr = linear_model.LogisticRegression(**lr_param)
nb = naive_bayes.MultinomialNB(alpha=0.0, class_prior=None, fit_prior=False)
rf.fit(train_x, train_y)
lr.fit(train_x, train_y)
nb.fit(train_x, train_y)
model = ensemble.VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('nb',nb)], voting='hard'
,weights = [2,2,1])
model.fit(train_x, train_y)
predictions = model.predict(valid_x)
This code runs perfectly if I remove class_weight from parameter list.
Below is complete error message.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-e05cd516f347> in <module>()
15 )
16
---> 17 model.fit(train_x, train_y)
18
19 predictions = model.predict(valid_x)
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_voting.py in fit(self, X, y, sample_weight)
220 transformed_y = self.le_.transform(y)
221
--> 222 return super().fit(X, transformed_y, sample_weight)
223
224 def predict(self, X):
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_voting.py in fit(self, X, y, sample_weight)
66 delayed(_parallel_fit_estimator)(clone(clf), X, y,
67 sample_weight=sample_weight)
---> 68 for clf in clfs if clf not in (None, 'drop')
69 )
70
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
/home/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
/home/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_base.py in _parallel_fit_estimator(estimator, X, y, sample_weight)
34 raise
35 else:
---> 36 estimator.fit(X, y)
37 return estimator
38
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
319 self.n_outputs_ = y.shape[1]
320
--> 321 y, expanded_class_weight = self._validate_y_class_weight(y)
322
323 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in _validate_y_class_weight(self, y)
585 class_weight = self.class_weight
586 expanded_class_weight = compute_sample_weight(class_weight,
--> 587 y_original)
588
589 return y, expanded_class_weight
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_sample_weight(class_weight, y, indices)
161 weight_k = compute_class_weight(class_weight_k,
162 classes_full,
--> 163 y_full)
164
165 weight_k = weight_k[np.searchsorted(classes_full, y_full)]
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_class_weight(class_weight, classes, y)
63 i = np.searchsorted(classes, c)
64 if i >= len(classes) or classes[i] != c:
---> 65 raise ValueError("Class label {} not present.".format(c))
66 else:
67 weight[i] = class_weight[c]
ValueError: Class label no_payment not present.

Related

XGBoost: While using the `eval_set` in .fit causing Error

I'm trying to train the model using Xgboost. The code is doing split using KFold. And for each fold, it's running the Xgboost model using fit. Within the fit function, I'm trying to evaluate both train and valid data to check if the errors. And then doing the prediction in test set.
I'm running the following code using Xgboost.
kf = GroupKFold(n_splits=4)
for trn_idx, test_idx in kf.split(X, groups=X.year) :
x_train, x_valid = X.iloc[trn_idx], X.iloc[test_idx]
y_train, y_valid = y.iloc[trn_idx], y.iloc[test_idx]
xgb_model = xgb.XGBRegressor(
booster = 'dart',
eta = 0.1,
gamma = 0,
colsample_bytree = 0.7,
n_estimators = 1200,
max_depth = 1,
reg_alpha = 1.1,
reg_lambda = 1.1,
subsample = 0.03,
eval_metric=my_smape)
xgb_model.fit(x_train, y_train,
eval_set=[(x_train, y_train), (x_valid,y_valid)], early_stopping_rounds=20,
verbose=True)
But I'm getting the following error. I checked this doc, and my code is according to the doc. Can someone please help me find the solution?
AttributeError Traceback (most recent call last)
<ipython-input-38-81b11a21472c> in <module>
23 eval_metric=my_smape)
24
---> 25 xgb_model.fit(x_train, y_train,
26 eval_set=[(x_valid,y_valid)], early_stopping_rounds=20,
27 verbose=True)
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
959 xgb_model, eval_metric, params, early_stopping_rounds, callbacks
960 )
--> 961 self._Booster = train(
962 params,
963 train_dmatrix,
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
180 break
181 bst.update(dtrain, i, obj)
--> 182 if cb_container.after_iteration(bst, i, dtrain, evals):
183 break
184
D:\Anaconda\lib\site-packages\xgboost\callback.py in after_iteration(self, model, epoch, dtrain, evals)
237 for _, name in evals:
238 assert name.find('-') == -1, 'Dataset name should not contain `-`'
--> 239 score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
240 splited = score.split()[1:] # into datasets
241 # split up `test-error:0.1234`
D:\Anaconda\lib\site-packages\xgboost\core.py in eval_set(self, evals, iteration, feval, output_margin)
1860 if feval is not None:
1861 for dmat, evname in evals:
-> 1862 feval_ret = feval(
1863 self.predict(dmat, training=False, output_margin=output_margin), dmat
1864 )
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in inner(y_score, dmatrix)
99 def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
100 y_true = dmatrix.get_label()
--> 101 return func.__name__, func(y_true, y_score)
102 return inner
103
AttributeError: '_PredictScorer' object has no attribute '__name__'
It looks like you've run make_scorer() on your custom metric. Try supplying the original function as eval_metric instead, this should fix the issue.

Error in the Random Forest model fit with GridSearch while using a pipeline()

I am trying to implement ColumnTransformer() and Pipeline() following this document:
https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
I have encountered a ValueError using the prediction pipeline in a grid search.
My data is of mixed types with a binomial target (the 'left' column has values 'yes' or 'no').
department category
promoted int64
review float64
projects int64
salary object
tenure float64
satisfaction float64
bonus int64
avg_hrs_month float64
left object
dtype: object
here is my code.
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
department promoted review projects salary ... left
0 operations 0 0.577569 3 low no
1 operations 0 0.751900 3 medium no
2 support 0 0.722548 3 medium yes
3 logistics 0 0.675158 4 high no
4 sales 0 0.676203 3 high no
ord_features = ["salary","left"]
ordinal_transformer = OrdinalEncoder()
cat_features = ["department"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
transformers=[
("num", ordinal_transformer, selector(dtype_include="object")),
("cat", categorical_transformer, selector(dtype_include="category")),
]
)
clf = Pipeline(
steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, train_size=0.80, test_size=0.20, random_state=32)
clf.fit(X_train, y_train)
Out:
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num', OrdinalEncoder(),
<sklearn.compose._column_transformer.make_column_selector object at 0x0000028FD9C38CA0>),
('cat',
OneHotEncoder(handle_unknown='ignore'),
<sklearn.compose._column_transformer.make_column_selector object at 0x0000028FD9D863A0>)])),
('classifier', RandomForestClassifier())])
param_grid = {
'max_depth':[30, 40, 50],
'n_estimators':[ 80, 100, 120, 130, 150]
}
grid_search = GridSearchCV(clf, param_grid,scoring='roc_auc', cv=6)
Out:
GridSearchCV(cv=6,
estimator=Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
OrdinalEncoder(),
<sklearn.compose._column_transformer.make_column_selector object at 0x0000028FD9C38CA0>),
('cat',
OneHotEncoder(handle_unknown='ignore'),
<sklearn.compose._column_transformer.make_column_selector object at 0x0000028FD9D863A0>)])),
('classifier',
RandomForestClassifier())]),
param_grid={'max_depth': [30, 40, 50],
'n_estimators': [80, 100, 120, 130, 150]},
scoring='roc_auc')
grid_search.fit(X_train, y_train)
Out:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_36048/20025057.py in <module>
----> 1 grid_search.fit(X_train, y_train)
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
889 return results
890
--> 891 self._run_search(evaluate_candidates)
892
893 # multimetric is determined here because in the case of a callable
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
1393
1394
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
836 )
837
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
~\.conda\envs\tf-gpu\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1046 # remaining jobs.
1047 self._iterating = False
-> 1048 if self.dispatch_one_batch(iterator):
1049 self._iterating = self._original_iterator is not None
1050
~\.conda\envs\tf-gpu\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
864 return False
865 else:
--> 866 self._dispatch(tasks)
867 return True
868
~\.conda\envs\tf-gpu\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
782 with self._lock:
783 job_idx = len(self._jobs)
--> 784 job = self._backend.apply_async(batch, callback=cb)
785 # A job can complete so quickly than its callback is
786 # called before we get here, causing self._jobs to
~\.conda\envs\tf-gpu\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\.conda\envs\tf-gpu\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\.conda\envs\tf-gpu\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\.conda\envs\tf-gpu\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
209 def __call__(self, *args, **kwargs):
210 with config_context(**self.config):
--> 211 return self.function(*args, **kwargs)
212
213
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
667 cloned_parameters[k] = clone(v, safe=False)
668
--> 669 estimator = estimator.set_params(**cloned_parameters)
670
671 start_time = time.time()
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\pipeline.py in set_params(self, **kwargs)
186 Pipeline class instance.
187 """
--> 188 self._set_params("steps", **kwargs)
189 return self
190
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\utils\metaestimators.py in _set_params(self, attr, **params)
52 self._replace_estimator(attr, name, params.pop(name))
53 # 3. Step parameters and other initialisation arguments
---> 54 super().set_params(**params)
55 return self
56
~\.conda\envs\tf-gpu\lib\site-packages\sklearn\base.py in set_params(self, **params)
238 key, delim, sub_key = key.partition("__")
239 if key not in valid_params:
--> 240 raise ValueError(
241 "Invalid parameter %s for estimator %s. "
242 "Check the list of available parameters "
ValueError: Invalid parameter max_depth for estimator Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num', OrdinalEncoder(),
<sklearn.compose._column_transformer.make_column_selector object at 0x0000028FD9E1CE80>),
('cat',
OneHotEncoder(handle_unknown='ignore'),
<sklearn.compose._column_transformer.make_column_selector object at 0x0000028FD9EFEEB0>)])),
('classifier', RandomForestClassifier())]). Check the list of available parameters with `estimator.get_params().keys()`.
So it gives me an error after grid_search.fit(X_train, y_train), ValueError: Invalid parameter max_depth for estimator Pipeline.
I have printed out the estimator's list of parameters:
clf.get_params().keys()
dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'classifier', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__categories', 'preprocessor__num__dtype', 'preprocessor__num__handle_unknown', 'preprocessor__num__unknown_value', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unknown', 'preprocessor__cat__sparse', 'classifier__bootstrap', 'classifier__ccp_alpha', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__max_samples', 'classifier__min_impurity_decrease', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start'])
If do a gridsearch without a pipeline, I can get best estimators as follows:
n_estimators = 150, max_depth=30.
What can be wrong with my pipeline?

I was trying to fit and score logistic Regression model but getting error ,Can anyone help me this error

i am trying to experimenting Logistic Regression machine learning models, but i don't know why m i getting error.
models = {"Logistic Regression":LogisticRegression(),}
def fit_and_score(models,x_train,x_test,y_train,y_test):
np.random.seed(42)
model_scores = {}
#loop through model
for name, model in models.items():
model.fit(x_train,y_train)
model_scores[name] = model.score(x_test,y_test)
return model_scores
model_scores = fit_and_score(models=models,
x_train=x_train,
x_test=x_test,
y_train=y_train,
y_test=y_test)
model_scores
AttributeError Traceback (most recent call last)
<ipython-input-33-9c05affc041a> in <module>
----> 1 model_score = fit_and_score(models=models,
2 x_train=x_train,
3 x_test=x_test,
4 y_train=y_train,
5 y_test=y_test)
<ipython-input-32-b7a75c9edc31> in fit_and_score(models, x_train, x_test, y_train, y_test)
21 for name , model in models.items():
22 # fit the model to the data
---> 23 model.fit(x_train,y_train)
24 # Evaluate the model and append it's score to model scores
25 model_scores[name] = model.score(x_test,y_test)
~\Desktop\heart_disease_project\env\lib\site-packages\sklearn\linear_model\_logistic.py
in fit(self, X, y, sample_weight)
1405 else:
1406 prefer = 'processes'
-> 1407 fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
1408 **joblib_parallel_args(prefer=prefer))(
1409 path_func(X, y, pos_class=class, Cs=[C_],
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\parallel.py
in call(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\parallel.py
in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\parallel.py
in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\_parallel_backends.py
in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\_parallel_backends.py
in init(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\parallel.py
in call(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Desktop\heart_disease_project\env\lib\site-packages\joblib\parallel.py
in (.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
~\Desktop\heart_disease_project\env\lib\site-packages\sklearn\linear_model\_logistic.py
in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept,
max_iter, tol, verbose, solver, coef, class_weight, dual, penalty,
intercept_scaling, multi_class, random_state, check_input,
max_squared_sum, sample_weight, l1_ratio)
760 options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}
761 )
--> 762 n_iter_i = _check_optimize_result(
763 solver, opt_res, max_iter,
764 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG
~\Desktop\heart_disease_project\env\lib\site-packages\sklearn\utils\optimize.py
in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
I have tried diffrent parameter and it solved the issue.
models = {"Logistic Regression" : LogisticRegression(solver='liblinear'),
"KNN" : KNeighborsClassifier(),
"Random Forest" : RandomForestClassifier()}

scoring "roc_auc" value is not working with gridsearchCV appling RandomForestclassifer

I keep getting this error when perform this with gridsearchCV with scoring value is 'roc_auc'('f1', 'precision','recall' work fine)
# Construct a pipeline
pipe = Pipeline([
('reduce_dim',PCA()),
('rf',RandomForestClassifier(min_samples_leaf=5,random_state=123))
])
N_FEATURES_OPTIONS = [2] # for PCA [2, 4, 8]
# these below param is for RandomForestClassifier
N_ESTIMATORS = [10,50] # 10,50,100
MAX_DEPTH = [5,6] # 5,6,7,8,9
MIN_SAMPLE_LEAF = 5
param_grid = [
{
'reduce_dim': [PCA()],
'reduce_dim__n_components': N_FEATURES_OPTIONS,
'rf__n_estimators' : N_ESTIMATORS,
'rf__max_depth': MAX_DEPTH
},
{
'reduce_dim': [SelectKBest(f_classif)],
'reduce_dim__k': N_FEATURES_OPTIONS,
'rf__n_estimators' : N_ESTIMATORS,
'rf__max_depth': MAX_DEPTH
},
]
grid = GridSearchCV(pipe, param_grid= param_grid, cv =10,n_jobs=1,scoring = 'roc_auc')
grid.fit(X_train_s,y_train_s)
And I get this error
AttributeError Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
186 try:
--> 187 y_pred = clf.decision_function(X)
188
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in __get__(self, obj, type)
108 else:
--> 109 getattr(delegate, self.attribute_name)
110 break
AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
<ipython-input-16-86491f3b6aa7> in <module>()
----> 1 grid.fit(X_train_s,y_train_s)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
195
196 if y_type == "binary":
--> 197 y_pred = y_pred[:, 1]
198 elif isinstance(y_pred, list):
199 y_pred = np.vstack([p[:, -1] for p in y_pred]).T
IndexError: index 1 is out of bounds for axis 1 with size 1
I have looked up for this error and found some kind of similar problem here with Kerasclassifier. But I have no idea how to fix it
Keras Wrappers for Scikit Learn - AUC scorer is not working
can anyone explain to me what is wrong???
The error could be because som causes:
If you have only one target class: it fails
If you have >=3 target classes: if fails.
Maybe you have 2 classes, and in one fold of the CV, the test labels are only from one class.
When sklearn compute the AUC metric, it must have 2 classes, because the method for getting the AUC requires only two classes (to compute tpr and fpr with all thresholds).
Example of errors:
grid.fit(np.random.rand(100,2), np.random.randint(1, size=100)) #one class labels
grid.fit(np.random.rand(100,2), np.random.randint(3, size=100)) #3 class labels
#BOTH Throws same error when computing AUC
Example that should not thow an error but it could happen depends of the folds of the CV:
grid.fit(np.random.rand(100,2), np.random.randint(2, size=100)) #two class labels
#This shouldnt throw an error
SOLUTION
If you have more than 2 classes: you have to compute manually (or maybe there are some libraries, but I dont know about it), the 1 vs all, in which you compute auc with 2 classes (one class vs all the others), or All vs All AUC (pairwise AUC, where you compute one class vs ALL being the single class one class at a time, and then calculate the mean).
If you have 2 classes:
grid = GridSearchCV(pipe, param_grid= param_grid, cv = StratifiedKFold(), n_jobs=1, scoring = 'roc_auc')

Error when using GridSearchCV and RandomizedSearchCV

When attempting to fit my training data using either GridSearchCV or RandomizedSearchCV, I keep getting the following error:
TypeError: no supported conversion for types: (dtype('O'), dtype('O'))
Here's a sample of the relevant code:
from xgboost.sklearn import XGBRegressor as XGR
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
xgbRegModel = XGR()
params = {'max_depth':[3, 6, 9], 'learning_rate':[.05, .1, .5], 'n_estimators': [50, 100, 200]}
rscv = RandomizedSearchCV(xgbRegModel, params)
rscv.fit(X, y)
rscv.best_model_
where X is a (39942, 112577) scipy.sparse.csr.csr_matrix and y is a (39942,) numpy.ndarray.
The dtypes are all either int64 or float64, and I've tried running it both with np.nan values and after filling the np.nan values with 0... (I thought that might be the problem, but no.)
Can anyone tell me what's going on here? It works just fine when I train the model without using GridSearchCV or RandomizedSearchCV.
Any ideas would be appreciated - thanks!
ps - the traceback for the error is really long, but here it is, if it helps..
TypeError Traceback (most recent call last)
<ipython-input-54-63d54d4cd03e> in <module>()
3 xgbRegModel = XGR()
4 rscv = RandomizedSearchCV(xgbRegModel, params)
----> 5 rscv.fit(X, y)
6 rscv.best_model_
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
636 error_score=self.error_score)
637 for parameters, (train, test) in product(candidate_params,
--> 638 cv.split(X, y, groups)))
639
640 # if one choose to see train score, "out" will contain train score info
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
425 start_time = time.time()
426
--> 427 X_train, y_train = _safe_split(estimator, X, y, train)
428 X_test, y_test = _safe_split(estimator, X, y, test, train)
429
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198 X_subset = X[np.ix_(indices, train_indices)]
199 else:
--> 200 X_subset = safe_indexing(X, indices)
201
202 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
160 return X.take(indices, axis=0)
161 else:
--> 162 return X[indices]
163 else:
164 return [X[idx] for idx in indices]
~\Anaconda3\lib\site-packages\scipy\sparse\csr.py in __getitem__(self, key)
315 if isintlike(col) or isinstance(col,slice):
316 P = extractor(row, self.shape[0]) # [[1,2],j] or [[1,2],1:2]
--> 317 extracted = P * self
318 if col == slice(None, None, None):
319 return extracted
~\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
367 if self.shape[1] != other.shape[0]:
368 raise ValueError('dimension mismatch')
--> 369 return self._mul_sparse_matrix(other)
370
371 # If it's a list or whatever, treat it like a matrix
~\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
539 indptr = np.asarray(indptr, dtype=idx_dtype)
540 indices = np.empty(nnz, dtype=idx_dtype)
--> 541 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
542
543 fn = getattr(_sparsetools, self.format + '_matmat_pass2')
~\Anaconda3\lib\site-packages\scipy\sparse\sputils.py in upcast(*args)
49 return t
50
---> 51 raise TypeError('no supported conversion for types: %r' % (args,))
52
53
TypeError: no supported conversion for types: (dtype('O'), dtype('O'))
Thats because GridSearchCV doesn't support sparse matrices in the fit() method. Please have a look at the signature of fit method here:
Parameters:
X : array-like, shape = [n_samples, n_features]
As you see its written that only array-like inputs are supported.
As for why its working normally without grid search, thats because XGBRegressor supports sparse matrices.
The actual error arises when during cross_validation, the X is splitted into train and test which doesn't work for sparse matrices same way as normal arrays.
Also, make sure that for XGBRegressor the sparse matrix is of type CSC and not CSR as you have now, because it will give you wrong results. Its described here: https://github.com/dmlc/xgboost/issues/1238

Resources