Scikit learn custom scoring function - Specificity - scikit-learn

I'm trying to do a random grid search on randomforestclassifier.
# Instantiate a RandomForestClassifier
RFC = RandomForestClassifier()
# Instantiate the RandomizedSearchCV object: RFC
rand_search3 = RandomizedSearchCV(RFC, param_grid, n_iter=10, cv=5,n_jobs=-1, verbose=1, scoring = "f1_macro")
# Fit it to the data
rand_search3.fit(X_train_transformed,y_train)
I'm trying to get the best model by assessing specificity.
I went through the documentation for custom scoring and also looked at many related posts. I have come up with two ways to compute specificity.
1 :
from sklearn.metrics import make_scorer
def my_custom_func(y_true, y_pred):
    """Return the specificity (true-negative rate) for binary 0/1 labels.

    Specificity is TN / (TN + FP), i.e. the recall of the negative class
    (label 0).  The earlier version divided ``cm[1][1]`` by the sum of row 1
    of the confusion matrix, which is TP / (TP + FN) -- sensitivity rather
    than specificity.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred : array-like of 0/1 predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        Fraction of true negatives among all actual negatives, or ``0.0``
        when ``y_true`` contains no negative samples (the ratio is undefined
        in that case).
    """
    import numpy as np  # local import keeps this snippet self-contained

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    negatives = y_true == 0
    n_negatives = int(negatives.sum())
    if n_negatives == 0:
        # No actual negatives in this split: avoid dividing by zero.
        return 0.0

    true_negatives = int((negatives & (y_pred == 0)).sum())
    return true_negatives / n_negatives
Specificity_score = make_scorer(my_custom_func, greater_is_better=True)
2:
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
specificity = make_scorer(recall_score, pos_label=0, greater_is_better=True)
specificity
When I try using the custom function for the scoring,
rand_search3 = RandomizedSearchCV(RFC, param_grid, n_iter=10, cv=5,n_jobs=-1, verbose=1, scoring = Specificity_score)
rand_search3 = RandomizedSearchCV(RFC, param_grid, n_iter=10, cv=5,n_jobs=-1, verbose=1, scoring = specificity)
both failed with the same error message.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_10548/1204393696.py in <module>
20
21 # Fit it to the data
---> 22 rand_search3.fit(X_train_transformed,y_train)
23
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 #wraps(f)
62 def inner_f(*args, **kwargs):
---> 63 extra_args = len(args) - len(all_args)
64 if extra_args <= 0:
65 return f(*args, **kwargs)
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 delayed(_fit_and_score)(
840 clone(base_estimator),
--> 841 X,
842 y,
843 train=train,
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1631 Mean cross-validated score of the best_estimator.
1632
-> 1633 For multi-metric evaluation, this is not available if ``refit`` is
1634 ``False``. See ``refit`` parameter for more information.
1635
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
825 def evaluate_candidates(candidate_params, cv=None, more_results=None):
826 cv = cv or cv_orig
--> 827 candidate_params = list(candidate_params)
828 n_candidates = len(candidate_params)
829
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _insert_error_scores(results, error_score)
293
294 results = _aggregate_score_dicts(results)
--> 295
296 ret = {}
297 ret["fit_time"] = results["fit_time"]
KeyError: 'fit_failed'
Any solutions?

Related

XGBoost: While using the `eval_set` in .fit causing Error

I'm trying to train a model using XGBoost. The code splits the data using KFold, and for each fold it fits the XGBoost model. Within the fit call, I'm trying to evaluate both the training and validation data to check the errors, and then make predictions on the test set.
I'm running the following code using Xgboost.
kf = GroupKFold(n_splits=4)
for trn_idx, test_idx in kf.split(X, groups=X.year) :
x_train, x_valid = X.iloc[trn_idx], X.iloc[test_idx]
y_train, y_valid = y.iloc[trn_idx], y.iloc[test_idx]
xgb_model = xgb.XGBRegressor(
booster = 'dart',
eta = 0.1,
gamma = 0,
colsample_bytree = 0.7,
n_estimators = 1200,
max_depth = 1,
reg_alpha = 1.1,
reg_lambda = 1.1,
subsample = 0.03,
eval_metric=my_smape)
xgb_model.fit(x_train, y_train,
eval_set=[(x_train, y_train), (x_valid,y_valid)], early_stopping_rounds=20,
verbose=True)
But I'm getting the following error. I checked this doc, and my code is according to the doc. Can someone please help me find the solution?
AttributeError Traceback (most recent call last)
<ipython-input-38-81b11a21472c> in <module>
23 eval_metric=my_smape)
24
---> 25 xgb_model.fit(x_train, y_train,
26 eval_set=[(x_valid,y_valid)], early_stopping_rounds=20,
27 verbose=True)
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
959 xgb_model, eval_metric, params, early_stopping_rounds, callbacks
960 )
--> 961 self._Booster = train(
962 params,
963 train_dmatrix,
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
180 break
181 bst.update(dtrain, i, obj)
--> 182 if cb_container.after_iteration(bst, i, dtrain, evals):
183 break
184
D:\Anaconda\lib\site-packages\xgboost\callback.py in after_iteration(self, model, epoch, dtrain, evals)
237 for _, name in evals:
238 assert name.find('-') == -1, 'Dataset name should not contain `-`'
--> 239 score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
240 splited = score.split()[1:] # into datasets
241 # split up `test-error:0.1234`
D:\Anaconda\lib\site-packages\xgboost\core.py in eval_set(self, evals, iteration, feval, output_margin)
1860 if feval is not None:
1861 for dmat, evname in evals:
-> 1862 feval_ret = feval(
1863 self.predict(dmat, training=False, output_margin=output_margin), dmat
1864 )
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in inner(y_score, dmatrix)
99 def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
100 y_true = dmatrix.get_label()
--> 101 return func.__name__, func(y_true, y_score)
102 return inner
103
AttributeError: '_PredictScorer' object has no attribute '__name__'
It looks like you've run make_scorer() on your custom metric. Try supplying the original function as eval_metric instead, this should fix the issue.

How to use SHAP with a linear SVC model from sklearn using Pipeline?

I am doing text classification using a linear SVC model from sklearn. Now I want to visualize which words/tokens have the highest impact on the classification decision by using SHAP (https://github.com/slundberg/shap).
Right now this does not work because I am getting an error that seems to originate from the vectorizer step in the pipeline I have defined. What is wrong here?
Is my general approach on how to use SHAP in this case correct?
x_Train, x_Test, y_Train, y_Test = train_test_split(df_all['PDFText'], df_all['class'], test_size = 0.2, random_state = 1234)
# Text-classification pipeline: TF-IDF features followed by a linear-kernel SVC.
pipeline = Pipeline([
    (
        'tfidv',
        TfidfVectorizer(
            ngram_range=(1, 3),
            analyzer='word',
            # Must be the string 'ascii'. The bare name `ascii` is Python's
            # builtin function; passed here it would be used as a custom
            # callable and turn every document into its escaped repr.
            strip_accents='ascii',
            use_idf=True,
            sublinear_tf=True,
            max_features=6000,
            min_df=2,
            max_df=1.0,
        ),
    ),
    (
        'lin_svc',
        svm.SVC(
            C=1.0,
            # probability=True is what makes predict_proba available.
            probability=True,
            kernel='linear',
        ),
    ),
])
pipeline.fit(x_Train, y_Train)
shap.initjs()
explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
shap_values = explainer.shap_values(x_Test, nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_Test.iloc[0,:])
This is the error message I get:
Provided model function fails when applied to the provided data set.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-4bca63616b3b> in <module>
3
4 # use Kernel SHAP to explain test set predictions
----> 5 explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
6 shap_values = explainer.shap_values(x_Test, nsamples=100)
7
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\explainers\kernel.py in __init__(self, model, data, link, **kwargs)
95 self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
96 self.data = convert_to_data(data, keep_index=self.keep_index)
---> 97 model_null = match_model_to_data(self.model, self.data)
98
99 # enforce our current input type limitations
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\common.py in match_model_to_data(model, data)
80 out_val = model.f(data.convert_to_df())
81 else:
---> 82 out_val = model.f(data.data)
83 except:
84 print("Provided model function fails when applied to the provided data set.")
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py in predict_proba(self, X)
379 for name, transform in self.steps[:-1]:
380 if transform is not None:
--> 381 Xt = transform.transform(Xt)
382 return self.steps[-1][-1].predict_proba(Xt)
383
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents, copy)
1631 check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
1632
-> 1633 X = super(TfidfVectorizer, self).transform(raw_documents)
1634 return self._tfidf.transform(X, copy=False)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
1084
1085 # use the same matrix-building strategy as fit_transform
-> 1086 _, X = self._count_vocab(raw_documents, fixed_vocab=True)
1087 if self.binary:
1088 X.data.fill(1)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
940 for doc in raw_documents:
941 feature_counter = {}
--> 942 for feature in analyze(doc):
943 try:
944 feature_idx = vocabulary[feature]
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
326 tokenize)
327 return lambda doc: self._word_ngrams(
--> 328 tokenize(preprocess(self.decode(doc))), stop_words)
329
330 else:
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
KernelExplainer expects to receive a classification model as the first argument. Please check the use of Pipeline with Shap following the link.
In your case, you can use the Pipeline as follows:
x_Train = pipeline.named_steps['tfidv'].fit_transform(x_Train)
explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train)

How can I create a path to my data for my CNN in Jupyter Notebook?

Intro and setup
So I have been trying for some time now to make a simple convolutional neural network. I followed a simple tutorial, which can be found at this link.
It is a simple cat vs dog test (2 categories)
I have set my jupyter/tensorflow/keras up in
C:\Users\labadmin
What I have understood is that I just have to give the path relative to labadmin in order to load my data for testing and training.
Since I am not sure what is causing the error, I have pasted the whole code and error; I think it is about the system not finding the data.
The folder with the data is set up as follows:
labadmin has a folder called data, within which there are two folders:
training
test
Both cat images and dog images are shuffled in both folders. There are 10000+ pictures in each folder, so there should be enough.
This is my code:
from keras.models import Sequential
from keras.layers import Convolution2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
# Small CNN for binary classification of 64x64 RGB images.
classifier = Sequential()
# Keras 2 API: the kernel size is one (rows, cols) tuple rather than two
# positional integers (this is what the Conv2D UserWarning asks for).
classifier.add(Convolution2D(32, (3, 3), input_shape=(64, 64, 3), activation='relu'))
classifier.add(MaxPooling2D(pool_size=(2, 2)))
classifier.add(Flatten())
# Keras 2 API: `units` replaces the deprecated `output_dim` keyword.
classifier.add(Dense(units=128, activation='relu'))
# Single sigmoid unit: the output is read as the probability of class 1.
classifier.add(Dense(units=1, activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
import pandas as pd
from keras.preprocessing.image import ImageDataGenerator
train_datagen = ImageDataGenerator(
rescale=1./255,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True)
test_datagen = ImageDataGenerator(rescale=1./255)
# flow_from_directory infers the labels from sub-directory names, so each of
# the two directories below must contain one sub-directory per class.
training_set = train_datagen.flow_from_directory(
    'data\\training',
    target_size=(64, 64),
    batch_size=32,
    # 'binary' yields one 0/1 label per image, which is what a one-unit
    # sigmoid output trained with binary_crossentropy expects;
    # 'categorical' would yield one-hot vectors instead.
    class_mode='binary',
    # NOTE(review): shuffle=False feeds images in directory order during
    # training -- confirm this is intended.
    shuffle=False)
test_set = test_datagen.flow_from_directory(
    'data\\test',
    target_size=(64, 64),
    batch_size=32,
    class_mode='binary',
    shuffle=False)
from IPython.display import display
from PIL import Image
classifier.fit_generator(
training_set,
steps_per_epoch=8000,
epochs=10,
validation_data = test_set,
validation_steps = 800)
import numpy as np
from keras_preprocessing import image
# Load one image and shape it like a one-element batch for predict().
test_image = image.load_img('data\\random.jpg', target_size=(64, 64))
test_image = image.img_to_array(test_image)
# Apply the same 1/255 rescaling that both data generators use; without it
# the network would see inputs on a different scale than during training.
test_image = test_image / 255.0
test_image = np.expand_dims(test_image, axis=0)
result = classifier.predict(test_image)
# Mapping from class (sub-directory) name to integer label.
training_set.class_indices
# NOTE(review): the 0.5 threshold below assumes 'dog' is mapped to label 1;
# verify against class_indices.
if result[0][0] >= 0.5:
    prediction = 'dog'
else:
    prediction = 'cat'
print(prediction)
I get the following error:
C:\Users\labadmin\Miniconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:26: UserWarning: Update your `Conv2D` call to the Keras 2 API: `Conv2D(32, (3, 3), input_shape=(64, 64, 3..., activation="relu")`
C:\Users\labadmin\Miniconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:35: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(activation="relu", units=128)`
C:\Users\labadmin\Miniconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:36: UserWarning: Update your `Dense` call to the Keras 2 API: `Dense(activation="sigmoid", units=1)`
Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.
Epoch 1/10
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
<ipython-input-5-393aaba195e9> in <module>
82 epochs=10,
83 validation_data = test_set,
---> 84 validation_steps = 800)
85
86 # Our image we now send through to test
~\Miniconda3\envs\tensorflow\lib\site-packages\keras\legacy\interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~\Miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1416 use_multiprocessing=use_multiprocessing,
1417 shuffle=shuffle,
-> 1418 initial_epoch=initial_epoch)
1419
1420 #interfaces.legacy_generator_methods_support
~\Miniconda3\envs\tensorflow\lib\site-packages\keras\engine\training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
179 batch_index = 0
180 while steps_done < steps_per_epoch:
--> 181 generator_output = next(output_generator)
182
183 if not hasattr(generator_output, '__len__'):
~\Miniconda3\envs\tensorflow\lib\site-packages\keras\utils\data_utils.py in get(self)
707 "`use_multiprocessing=False, workers > 1`."
708 "For more information see issue #1638.")
--> 709 six.reraise(*sys.exc_info())
~\Miniconda3\envs\tensorflow\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\Miniconda3\envs\tensorflow\lib\site-packages\keras\utils\data_utils.py in get(self)
683 try:
684 while self.is_running():
--> 685 inputs = self.queue.get(block=True).get()
686 self.queue.task_done()
687 if inputs is not None:
~\Miniconda3\envs\tensorflow\lib\multiprocessing\pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~\Miniconda3\envs\tensorflow\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
117 job, i, func, args, kwds = task
118 try:
--> 119 result = (True, func(*args, **kwds))
120 except Exception as e:
121 if wrap_exception and func is not _helper_reraises_exception:
~\Miniconda3\envs\tensorflow\lib\site-packages\keras\utils\data_utils.py in next_sample(uid)
624 The next value of generator `uid`.
625 """
--> 626 return six.next(_SHARED_SEQUENCES[uid])
627
628
~\Miniconda3\envs\tensorflow\lib\site-packages\keras_preprocessing\image\iterator.py in __next__(self, *args, **kwargs)
98
99 def __next__(self, *args, **kwargs):
--> 100 return self.next(*args, **kwargs)
101
102 def next(self):
~\Miniconda3\envs\tensorflow\lib\site-packages\keras_preprocessing\image\iterator.py in next(self)
107 """
108 with self.lock:
--> 109 index_array = next(self.index_generator)
110 # The transformation of images is not under thread lock
111 # so it can be done in parallel
~\Miniconda3\envs\tensorflow\lib\site-packages\keras_preprocessing\image\iterator.py in _flow_index(self)
83 self._set_index_array()
84
---> 85 current_index = (self.batch_index * self.batch_size) % self.n
86 if self.n > current_index + self.batch_size:
87 self.batch_index += 1
ZeroDivisionError: integer division or modulo by zero
Thank you for your time.
Did you populate your data\\training and data\\test directories? From the output:
Found 0 images belonging to 0 classes.
Found 0 images belonging to 0 classes.
Epoch 1/10
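it appears that your data augmentation generator did not find any images and the resulting dataset is empty; consequently, when Keras tries to run the fit_generator, you get the division by 0 error as it tries to iterate through your empty image set. Note that flow_from_directory expects one subdirectory per class (for example data\\training\\cats and data\\training\\dogs); images placed directly inside data\\training or data\\test are not picked up, which is why 0 images and 0 classes are reported even though the folders contain files.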

make custom scorer with GridSearchCV

I have the code below where I'm trying to use a custom scorer I defined, "custom_loss_five", with GridSearchCV to tune hyperparameters. I have the example code below, along with some sample data. I'm getting the error 'numpy.dtype' object has no attribute 'base_dtype'. I think this is because I'm mixing Keras code with sklearn. I'm also using this same "custom_loss_five" function to train a neural network, which is why I used Keras. If anyone could point out the issue and let me know how to adapt the function to use with GridSearchCV, I would appreciate it.
sample data:
print(x_train_scld[:5])
[[ 0.37773519 2.0109691 0.49644224 0.21679945 0.538941 1.99144889
2.15011467 1.20312084 0.86114816 0.79507318 -0.45602028 0.07146743
-0.19524294 -0.33405545 -0.60264522 1.26724727 1.44991588 0.74630967
0.16529837 0.89613455 0.3253014 2.19166429 0.64865429 0.12894674
0.46995314 3.41479052 4.44308499 1.83182458 1.54348561 2.50155582]
[ 0.32029317 0.1214269 0.28824456 0.13510828 -0.0851059 -0.0057386
-0.31671716 0.0303454 0.32754165 -0.15354084 -0.36310852 -0.34419771
-0.28347519 -0.28927174 -0.39507256 -0.2039463 -0.49919802 0.12281647
-0.56756272 -0.30637335 0.10701249 0.21461633 0.17531634 -0.04414507
0.19574444 0.36354262 -1.23318869 0.59029124 0.28936372 0.19248437]
[ 0.25843254 0.29037034 0.21339798 0.12738073 0.28185716 -0.47995085
-0.13321816 0.14228058 -3.69915162 -0.10246162 0.26193423 0.12807553
0.18956053 0.12487671 -0.28174435 -0.71770499 -0.34455425 0.00729992
-0.70102685 -0.57022389 0.59171701 0.77319193 0.52065985 -1.37655715
0.59387438 -1.52826854 0.18054306 0.76212977 0.3639211 0.08726502]
[-0.70482588 -0.32963569 -0.74849491 -0.86505667 0.10026287 -0.87877366
-1.06584707 -1.19559926 0.34039964 0.10112554 -0.62427503 -0.3134676
-0.65996358 -0.52932857 0.11989554 -0.95345177 -0.67459484 -0.82130922
-0.52228025 -0.38191412 -0.75239269 -0.31180246 -0.7418967 -0.7432583
0.12191902 -0.97620932 -1.02049823 -1.20098216 -0.02333216 -0.24853266]
[-0.36680171 -0.14757043 -0.41413663 -0.56754624 -0.34512544 -0.76162172
-0.72684687 -0.61557149 0.31896966 -0.25351016 -0.6357623 0.12484078
-0.71632135 -0.51097128 0.26933611 -0.53549047 -0.54070413 -0.36472263
-0.24581883 -0.67901706 -0.44128802 0.16221265 -0.42239358 -0.52459003
0.34339528 -0.43064345 -1.23318869 -0.23310168 0.44404246 -0.40964978]]
print(x_test_scld[:5])
[[ 2.60641850e-01 -7.18369636e-01 3.27138629e-01 -1.76172773e+00
4.67645320e-01 1.53766591e+00 7.62837058e-01 4.07109050e-01
7.71142242e-01 9.80417766e-01 5.10262027e-01 5.66383900e-01
9.28678845e-01 2.06576727e-01 9.68389151e-01 1.48288576e+00
7.53349504e-01 7.04842193e-01 7.80186706e-01 6.43850055e-01
1.43107505e-01 -7.20312971e-01 2.96065817e-01 -4.51322867e-02
1.93107816e-01 7.41280492e-01 3.28514299e-01 4.47039330e-02
1.39136160e-01 4.94989991e-01]
[-7.51730115e-02 4.92568820e-02 -7.29146850e-02 -2.86318841e-01
1.00026599e+00 4.43886212e-01 4.80336890e-01 6.71683119e-01
8.61148159e-01 5.21434522e-01 -3.65135682e-01 -4.32021118e-01
-4.10049198e-01 -3.01778906e-01 -4.27568719e-02 -1.34413479e+00
-4.09570872e-02 1.64283954e-01 -3.04209384e-01 -7.10176931e-03
7.32148655e-03 -2.90459367e+00 2.31719950e-02 -1.37655715e+00
1.44286672e+00 1.07281572e+00 1.19548020e+00 1.44805187e+00
1.33316704e+00 1.55622575e+00]
[-1.23777794e-01 -3.83763205e-01 -1.65737513e-01 -3.43999436e-01
3.58604868e-01 -3.45623859e-01 -2.89602186e-01 -3.38277511e-01
8.23494778e-03 2.97415674e-01 -6.27653637e-01 -6.42441486e-01
-7.17707195e-01 -4.34516210e-01 6.01100047e-01 -2.64325075e-01
-2.31751338e-01 4.13624916e-02 7.46820672e-01 3.84336779e-01
-3.24408912e-01 -5.30945125e-01 -3.14685046e-01 -4.13363730e-01
6.43970206e-01 -2.37091815e-01 -1.45963962e-01 -2.97594271e-02
7.54512744e-01 6.49530907e-01]
[ 1.06041146e+00 3.61350612e-02 9.93240469e-01 1.11126264e+00
-2.54537983e-01 -2.50709092e-01 -3.56042668e-02 -1.19559926e+00
-2.25351836e-01 -4.65124054e-01 -4.64466800e-01 -1.10808348e+00
-4.47005113e-01 -2.07571731e-01 -1.11908130e+00 -8.49190558e-01
-5.40704133e-01 -6.40037086e-01 -1.10737748e+00 -9.30940117e-01
9.76730527e-01 2.34863210e-01 9.02228200e-01 9.43399666e-01
-1.25487123e-02 -1.70804996e-03 4.83277659e-01 7.07714236e-01
5.60886115e-01 -4.38009686e-01]
[ 3.57851416e-01 1.87811066e+00 2.77785646e-01 2.23975029e-01
-3.66933526e-01 -9.49100986e-01 -4.74866806e-01 -4.98802740e-01
2.69680706e-01 -5.60715159e-01 2.46392629e-01 7.53999293e-01
1.19344293e-01 1.24473258e-01 4.50284535e-02 -5.74844494e-01
-1.80203418e-01 -2.89340672e-01 1.37362545e+00 -6.91305992e-01
2.80612333e-01 1.49136056e+00 1.99466234e-01 1.55930637e-01
-2.39298218e-01 -9.12274848e-01 -4.82659170e-01 -6.00406523e-01
5.90931626e-01 -7.55722792e-01]]
print(y_train[:5])
562 1
291 0
16 1
546 0
293 0
Name: diagnosis, dtype: int64
print(y_test[:5])
421 0
47 1
292 0
186 1
414 1
Name: diagnosis, dtype: int64
Code:
# custom loss function
# importing libraries
import io
import os
import time
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, precision_recall_fscore_support, accuracy_score
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
# custom loss function
def custom_loss_wrapper(fn_cost=1, fp_cost=1):
    """Build a cost-sensitive binary loss with separate FN / FP weights.

    Parameters
    ----------
    fn_cost : weight applied to the false-negative term
        ``y_true * |1 - y_pred|``.
    fp_cost : weight applied to the false-positive term
        ``(1 - y_true) * |y_pred|``.

    Returns
    -------
    callable
        ``custom_loss(y_true, y_pred)`` returning the mean weighted cost.
        With Keras tensors the result is a backend tensor, as before; with a
        NumPy ``y_pred`` (as passed by scikit-learn scorers) it is a float.
    """
    def custom_loss(y_true, y_pred, fn_cost=fn_cost, fp_cost=fp_cost):
        if isinstance(y_pred, np.ndarray):
            # NumPy path: the Keras backend ops below expect tensors and
            # raise "'numpy.dtype' object has no attribute 'base_dtype'"
            # when handed plain arrays, so compute the same formula in NumPy.
            y_true = np.asarray(y_true)
            weighted_values = (y_true * np.abs(1 - y_pred) * fn_cost
                               + (1 - y_true) * np.abs(y_pred) * fp_cost)
            return float(np.mean(weighted_values))

        # Tensor path (Keras training): unchanged from the original.
        h = K.ones_like(y_pred)
        fn_value = fn_cost * h
        fp_value = fp_cost * h
        weighted_values = (y_true * K.abs(1 - y_pred) * fn_value
                           + (1 - y_true) * K.abs(y_pred) * fp_value)
        loss = K.mean(weighted_values)
        return loss
    return custom_loss
custom_loss_five = custom_loss_wrapper(fn_cost=5, fp_cost=1)
# TODO: Initialize the classifier
clf = AdaBoostClassifier(random_state=0)
# TODO: Create the parameters list you wish to tune
parameters = {'n_estimators': [100, 200, 300], 'learning_rate': [1.0, 2.0, 4.0]}
# TODO: Make an fbeta_score scoring object
# scorer = make_scorer(fbeta_score, beta=0.5)
# custom_loss_five is a loss (lower is better): greater_is_better=False makes
# the scorer negate it, so GridSearchCV selects the lowest-cost parameters
# instead of the highest-cost ones.
scorer2 = make_scorer(custom_loss_five, greater_is_better=False)
# TODO: Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj2 = GridSearchCV(clf, parameters, scoring=scorer2)
# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_fit2 = grid_obj2.fit(x_train_scld, y_train)
# Get the estimator
best_clf2 = grid_fit2.best_estimator_
# Make predictions using the unoptimized and model
predictions = (clf.fit(x_train_scld, y_train)).predict(x_test_scld)
# Use best_clf2, the name assigned above; `best_clf` is never defined here.
best_predictions = best_clf2.predict(x_test_scld)
# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
# print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
# print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))
error:
/Users/sshields/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
warnings.warn(CV_WARNING, FutureWarning)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-34-b87eab01e7ec> in <module>()
24
25 # TODO: Fit the grid search object to the training data and find the optimal parameters
---> 26 grid_fit2 = grid_obj2.fit(x_train_scld,y_train)
27
28 # Get the estimator
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
~/anaconda2/envs/py36/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
96 else:
97 return self._sign * self._score_func(y_true, y_pred,
---> 98 **self._kwargs)
99
100
<ipython-input-4-afa574df52f0> in custom_loss(y_true, y_pred, fn_cost, fp_cost)
11 weighted_values = y_true * K.abs(1-y_pred)*fn_value + (1-y_true) * K.abs(y_pred)*fp_value
12
---> 13 loss = K.mean(weighted_values)
14 return loss
15
~/anaconda2/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in mean(x, axis, keepdims)
1377 A tensor with the mean of elements of `x`.
1378 """
-> 1379 if x.dtype.base_dtype == tf.bool:
1380 x = tf.cast(x, floatx())
1381 return tf.reduce_mean(x, axis, keepdims)
AttributeError: 'numpy.dtype' object has no attribute 'base_dtype'
The custom scoring function does not have to be a Keras function.
Here is a working example.
from sklearn import svm, datasets
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
def custom_loss(y_true, y_pred, fn_cost=5, fp_cost=1):
    """Return the mean cost-weighted misclassification loss.

    A sample with ``y_true == 1`` predicted as 0 (false negative) is charged
    ``fn_cost``; a sample with ``y_true == 0`` predicted as 1 (false
    positive) is charged ``fp_cost``. The result is the average charge over
    all samples, so lower is better.

    Only NumPy is used, so the function works on the plain arrays that a
    scikit-learn scorer passes in.

    Parameters
    ----------
    y_true : array-like
        True labels. The formula is written for 0/1 labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    fn_cost : float, default 5
        Cost of one false negative (previously hard-coded).
    fp_cost : float, default 1
        Cost of one false positive (previously hard-coded).

    Returns
    -------
    float
        Mean weighted cost over all samples.
    """
    # Accept lists / pandas objects as well as ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # First term is non-zero only for false negatives, second only for
    # false positives; scalar costs broadcast over the arrays.
    weighted_values = (
        y_true * np.abs(1 - y_pred) * fn_cost
        + (1 - y_true) * np.abs(y_pred) * fp_cost
    )
    return np.mean(weighted_values)
svc = svm.SVC()
# custom_loss is a cost (lower is better), so the scorer must be built with
# greater_is_better=False: make_scorer then sign-flips the value, and
# GridSearchCV, which always maximises the score, selects the parameters with
# the smallest loss instead of the largest.
# NOTE(review): iris.target has three classes while custom_loss is written for
# 0/1 labels; the example runs, but confirm the loss on a binary target.
clf = GridSearchCV(
    svc,
    parameters,
    cv=5,
    scoring=make_scorer(custom_loss, greater_is_better=False),
)
clf.fit(iris.data, iris.target)

The "roc_auc" scoring value does not work with GridSearchCV when using RandomForestClassifier

I keep getting this error when I run GridSearchCV with the scoring value 'roc_auc' ('f1', 'precision' and 'recall' work fine).
# Search-space constants.
N_FEATURES_OPTIONS = [2]  # features kept by the reduction step; candidates: 2, 4, 8
# Hyper-parameters of the RandomForestClassifier.
N_ESTIMATORS = [10, 50]  # candidates: 10, 50, 100
MAX_DEPTH = [5, 6]  # candidates: 5, 6, 7, 8, 9
MIN_SAMPLE_LEAF = 5

# Pipeline: a dimensionality-reduction step followed by a random forest.
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('rf', RandomForestClassifier(min_samples_leaf=MIN_SAMPLE_LEAF,
                                  random_state=123)),
])

# Two alternative grids that differ only in the reduction step
# (PCA vs. SelectKBest); the forest parameters are the same in both.
param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'rf__n_estimators': N_ESTIMATORS,
        'rf__max_depth': MAX_DEPTH,
    },
    {
        'reduce_dim': [SelectKBest(f_classif)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'rf__n_estimators': N_ESTIMATORS,
        'rf__max_depth': MAX_DEPTH,
    },
]

# 10-fold grid search scored by ROC AUC.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=1,
                    scoring='roc_auc')
grid.fit(X_train_s, y_train_s)
And I get this error
AttributeError Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
186 try:
--> 187 y_pred = clf.decision_function(X)
188
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in __get__(self, obj, type)
108 else:
--> 109 getattr(delegate, self.attribute_name)
110 break
AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
<ipython-input-16-86491f3b6aa7> in <module>()
----> 1 grid.fit(X_train_s,y_train_s)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
195
196 if y_type == "binary":
--> 197 y_pred = y_pred[:, 1]
198 elif isinstance(y_pred, list):
199 y_pred = np.vstack([p[:, -1] for p in y_pred]).T
IndexError: index 1 is out of bounds for axis 1 with size 1
I have searched for this error and found a somewhat similar problem with KerasClassifier, but I have no idea how to fix it:
Keras Wrappers for Scikit Learn - AUC scorer is not working
Can anyone explain to me what is wrong?
The error can have several causes:
If you have only one target class, it fails.
If you have three or more target classes, it fails.
You may have two classes, but in one fold of the CV the test labels all come from a single class.
When sklearn computes the AUC metric it needs exactly two classes, because the AUC is obtained from the TPR and FPR computed at every threshold, which requires a two-class target.
Example of errors:
grid.fit(np.random.rand(100,2), np.random.randint(1, size=100)) # labels contain a single class (all 0)
grid.fit(np.random.rand(100,2), np.random.randint(3, size=100)) # labels contain three classes (0, 1, 2)
# Both calls throw the same error when the AUC is computed.
Example that should not throw an error, although it still can, depending on how the CV folds split the classes:
grid.fit(np.random.rand(100,2), np.random.randint(2, size=100)) # labels contain two classes (0 and 1)
# This should not throw an error.
SOLUTION
If you have more than 2 classes: you have to compute the AUC manually (there may be libraries that do this, but I do not know of one). Either use one-vs-all, where you compute the AUC with 2 classes (one class against all the others), or all-vs-all AUC (pairwise AUC, where each class in turn is treated as the single class, one class at a time, and the mean of the resulting AUCs is taken).
If you have 2 classes:
# StratifiedKFold preserves the class proportions in every fold, so that, given
# enough samples of each class, every split of a two-class target contains both classes.
grid = GridSearchCV(pipe, param_grid= param_grid, cv = StratifiedKFold(), n_jobs=1, scoring = 'roc_auc')

Resources