I'm trying to train a model using XGBoost. The code splits the data with GroupKFold, and for each fold it fits an XGBoost model. Within the fit call I evaluate both the train and the validation data to track the errors, and afterwards I predict on the test set.
I'm running the following code:
kf = GroupKFold(n_splits=4)
for trn_idx, test_idx in kf.split(X, groups=X.year):
    x_train, x_valid = X.iloc[trn_idx], X.iloc[test_idx]
    y_train, y_valid = y.iloc[trn_idx], y.iloc[test_idx]
    xgb_model = xgb.XGBRegressor(
        booster='dart',
        eta=0.1,
        gamma=0,
        colsample_bytree=0.7,
        n_estimators=1200,
        max_depth=1,
        reg_alpha=1.1,
        reg_lambda=1.1,
        subsample=0.03,
        eval_metric=my_smape)
    xgb_model.fit(x_train, y_train,
                  eval_set=[(x_train, y_train), (x_valid, y_valid)],
                  early_stopping_rounds=20,
                  verbose=True)
But I'm getting the following error. I checked this doc, and my code follows it. Can someone please help me find the solution?
AttributeError Traceback (most recent call last)
<ipython-input-38-81b11a21472c> in <module>
23 eval_metric=my_smape)
24
---> 25 xgb_model.fit(x_train, y_train,
26 eval_set=[(x_valid,y_valid)], early_stopping_rounds=20,
27 verbose=True)
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
959 xgb_model, eval_metric, params, early_stopping_rounds, callbacks
960 )
--> 961 self._Booster = train(
962 params,
963 train_dmatrix,
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
180 break
181 bst.update(dtrain, i, obj)
--> 182 if cb_container.after_iteration(bst, i, dtrain, evals):
183 break
184
D:\Anaconda\lib\site-packages\xgboost\callback.py in after_iteration(self, model, epoch, dtrain, evals)
237 for _, name in evals:
238 assert name.find('-') == -1, 'Dataset name should not contain `-`'
--> 239 score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
240 splited = score.split()[1:] # into datasets
241 # split up `test-error:0.1234`
D:\Anaconda\lib\site-packages\xgboost\core.py in eval_set(self, evals, iteration, feval, output_margin)
1860 if feval is not None:
1861 for dmat, evname in evals:
-> 1862 feval_ret = feval(
1863 self.predict(dmat, training=False, output_margin=output_margin), dmat
1864 )
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in inner(y_score, dmatrix)
99 def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
100 y_true = dmatrix.get_label()
--> 101 return func.__name__, func(y_true, y_score)
102 return inner
103
AttributeError: '_PredictScorer' object has no attribute '__name__'
It looks like you've run make_scorer() on your custom metric. Try supplying the original function as eval_metric instead; that should fix the issue.
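For example, a minimal sketch of that fix (assuming my_smape is meant to be an ordinary metric function that takes y_true and y_pred and returns a float, which is the signature the traceback shows xgboost calling, rather than the object returned by make_scorer):
import numpy as np
import xgboost as xgb

def my_smape(y_true, y_pred):
    # plain callable, NOT wrapped in sklearn.metrics.make_scorer
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

xgb_model = xgb.XGBRegressor(
    booster='dart',
    eta=0.1,
    n_estimators=1200,
    eval_metric=my_smape)  # pass the function itself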
I am using Foolbox 3.3.1 to perform some adversarial attacks on a resnet50 network. The code is as follows:
import torch
from torchvision import models
import foolbox as fb
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=True).to(device)
model.eval()
mean = [0.485, 0.456, 0.406]
std=[0.229, 0.224, 0.225]
preprocessing = dict(mean=mean, std=std, axis=-3)
bounds = (0, 1)
fmodel = fb.models.PyTorchModel(model, bounds=bounds, preprocessing=preprocessing)
images, labels = fb.utils.samples(fmodel, dataset='imagenet', batchsize=8)
labels_float = labels.to(torch.float32)
def perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack):
    print(f'Performing attack with {type(attack).__name__}...', end='')
    raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)
    print('done')
    logits_after_attacks = fmodel(clipped)
    labels_after_attack = logits_after_attacks.max(dim=1)[1].cpu().numpy()
    for image, predicted_label_before_attack, label, label_after_attack in zip(images, predicted_labels_before_attack, labels.cpu().numpy(), labels_after_attack):
        label_imshow = type(attack).__name__
        if predicted_label_before_attack == label and label != label_after_attack:
            label_imshow += '; successful attack'
        label_imshow += f'\nTrue class: {lab_dict[label]}\nClassified before attack as: {lab_dict[predicted_label_before_attack]}\nClassified after attack as: {lab_dict[label_after_attack]}'
        imshow(image, label_imshow)

for attack in (
        fb.attacks.FGSM(),  # "nll_loss_forward_no_reduce_cuda_kernel_index" not implemented for 'Int'
):
    perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack)
I get the error:
RuntimeError: "nll_loss_forward_no_reduce_cuda_kernel_index" not implemented for 'Int'
with full stack:
Performing attack with LinfFastGradientAttack...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_1736/3238714708.py in <module>
28 # fb.attacks.BoundaryAttack(), # very slow
29 ):
---> 30 perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack)
~\AppData\Local\Temp/ipykernel_1736/3978727835.py in perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack)
1 def perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack):
2 print(f'Performing attack with {type(attack).__name__}...', end='')
----> 3 raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)
4 print('done')
5 logits_after_attacks = fmodel(clipped)
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\base.py in __call__(***failed resolving arguments***)
277 success = []
278 for epsilon in real_epsilons:
--> 279 xp = self.run(model, x, criterion, epsilon=epsilon, **kwargs)
280
281 # clip to epsilon because we don't really know what the attack returns;
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\fast_gradient_method.py in run(self, model, inputs, criterion, epsilon, **kwargs)
90 raise ValueError("unsupported criterion")
91
---> 92 return super().run(
93 model=model, inputs=inputs, criterion=criterion, epsilon=epsilon, **kwargs
94 )
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\gradient_descent_base.py in run(***failed resolving arguments***)
90
91 for _ in range(self.steps):
---> 92 _, gradients = self.value_and_grad(loss_fn, x)
93 gradients = self.normalize(gradients, x=x, bounds=model.bounds)
94 x = x + gradient_step_sign * stepsize * gradients
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\gradient_descent_base.py in value_and_grad(self, loss_fn, x)
50 x: ep.Tensor,
51 ) -> Tuple[ep.Tensor, ep.Tensor]:
---> 52 return ep.value_and_grad(loss_fn, x)
53
54 def run(
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\framework.py in value_and_grad(f, t, *args, **kwargs)
350 f: Callable[..., TensorType], t: TensorType, *args: Any, **kwargs: Any
351 ) -> Tuple[TensorType, TensorType]:
--> 352 return t.value_and_grad(f, *args, **kwargs)
353
354
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\tensor\tensor.py in value_and_grad(self, f, *args, **kwargs)
541 self: TensorType, f: Callable[..., TensorType], *args: Any, **kwargs: Any
542 ) -> Tuple[TensorType, TensorType]:
--> 543 return self._value_and_grad_fn(f, has_aux=False)(self, *args, **kwargs)
544
545 #final
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\tensor\pytorch.py in value_and_grad(x, *args, **kwargs)
493 loss, aux = f(x, *args, **kwargs)
494 else:
--> 495 loss = f(x, *args, **kwargs)
496 loss = loss.raw
497 loss.backward()
~\anaconda3\envs\adversarial\lib\site-packages\foolbox\attacks\gradient_descent_base.py in loss_fn(inputs)
40 def loss_fn(inputs: ep.Tensor) -> ep.Tensor:
41 logits = model(inputs)
---> 42 return ep.crossentropy(logits, labels).sum()
43
44 return loss_fn
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\framework.py in crossentropy(logits, labels)
319
320 def crossentropy(logits: TensorType, labels: TensorType) -> TensorType:
--> 321 return logits.crossentropy(labels)
322
323
~\anaconda3\envs\adversarial\lib\site-packages\eagerpy\tensor\pytorch.py in crossentropy(self, labels)
462 raise ValueError("labels must be 1D and must match the length of logits")
463 return type(self)(
--> 464 torch.nn.functional.cross_entropy(self.raw, labels.raw, reduction="none")
465 )
466
~\anaconda3\envs\adversarial\lib\site-packages\torch\nn\functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
2844 if size_average is not None or reduce is not None:
2845 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2846 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
2847
2848
RuntimeError: "nll_loss_forward_no_reduce_cuda_kernel_index" not implemented for 'Int'
Any clue?
I think the problem is in the perform_attack function (the 3978727835.py cell in the traceback): cast the labels argument to long. If you use CUDA, change it to labels.to(device).long(); otherwise change it to labels.long().
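A minimal sketch of that change inside perform_attack (assuming device is the torch.device defined earlier in the question):
def perform_attack(attack, fmodel, images, labels, predicted_labels_before_attack):
    print(f'Performing attack with {type(attack).__name__}...', end='')
    # cross_entropy expects int64 targets, so cast the labels before running the attack
    labels = labels.to(device).long()  # on CPU, labels.long() is enough
    raw, clipped, is_adv = attack(fmodel, images, labels, epsilons=0.03)
    print('done')
    # ... rest of the function unchanged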
I wrote code to apply grid search to an LSTM network built with Keras. Everything seems to work fine, but I get a problem with passing the batch_size.
I tried to change the format of batch_size but, as I understand it, it must be a tuple.
#LSTM ok
from Methods.LSTM_1HL import LSTM_method

Yhat_train_LSTM, Yhat_test_LSTM = LSTM_method(X_train, X_test, Y_train, Y_test)

def create_model(optimizer, hl1_nodes, input_shape):
    # creation of the NN - Electric Load
    # LSTM layers followed by other LSTM layers must have the parameter "return_sequences" set to True
    model = Sequential()
    model.add(LSTM(units=hl1_nodes, input_shape=input_shape, return_sequences=False))
    model.add(Dense(1, activation="linear"))  # output layer
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['accuracy'])
    model.summary()
    return model

def LSTM_method(X_train, X_test, Y_train, Y_test):
    # normalize X and Y data
    mmsx = MinMaxScaler()
    mmsy = MinMaxScaler()
    X_train = mmsx.fit_transform(X_train)
    X_test = mmsx.transform(X_test)
    Y_train = mmsy.fit_transform(Y_train)
    Y_test = mmsy.transform(Y_test)
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])

    # NN for Electric Load
    # LSTM Input Shape
    time_steps = 1  # number of time-steps you are feeding a sequence (?)
    inputs_numb = X_train.shape[1]  # number of inputs
    input_shape = (time_steps, inputs_numb)

    model = KerasRegressor(build_fn=create_model, verbose=1)

    # GridSearch code
    start = time()
    optimizers = ['rmsprop', 'adam']
    epochs = np.array([100, 500, 1000])
    hl1_nodes = np.array([1, 10, 50])
    btcsz = np.array([1, X_train.shape[0]])
    param_grid = dict(optimizer=optimizers, hl1_nodes=hl1_nodes, input_shape=input_shape,
                      nb_epoch=epochs, batch_size=btcsz)
    scoring = make_scorer(accuracy_score)  # in order to use a metric as a scorer
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring)
    grid_result = grid.fit(X_train, Y_train)

    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    for params, mean_score, scores in grid_result.grid_scores_:
        print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
    print("total time:", time() - start)

    # Predictions - Electric Load
    Yhat_train = grid_result.predict(X_train, verbose=0)
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    Yhat_test = grid_result.predict(X_test, verbose=0)

    # Denormalization - Electric Load
    Yhat_train = mmsy.inverse_transform(Yhat_train)
    Yhat_test = mmsy.inverse_transform(Yhat_test)
    Y_train = mmsy.inverse_transform(Y_train)
    Y_test = mmsy.inverse_transform(Y_test)
    return Yhat_train, Yhat_test
Below the error I get:
TypeError Traceback (most recent call last)
in
10 #from Methods.LSTM_1HL import create_model
11
---> 12 Yhat_train_LSTM, Yhat_test_LSTM = LSTM_method(X_train, X_test, Y_train, Y_test)
c:\Users\ER180124\Code\LoadForecasting\Methods\LSTM_1HL.py in LSTM_method(X_train, X_test, Y_train, Y_test)
62 scoring = make_scorer(accuracy_score) #in order to use a metric as a scorer
63 grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring = scoring)
---> 64 grid_result = grid.fit(X_train, Y_train)
65
66 print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
~\.conda\envs\PierEnv\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\.conda\envs\PierEnv\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\.conda\envs\PierEnv\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
915 # remaining jobs.
916 self._iterating = False
--> 917 if self.dispatch_one_batch(iterator):
918 self._iterating = self._original_iterator is not None
919
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\.conda\envs\PierEnv\lib\site-packages\sklearn\externals\joblib\parallel.py in (.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~\.conda\envs\PierEnv\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~\.conda\envs\PierEnv\lib\site-packages\keras\wrappers\scikit_learn.py in fit(self, x, y, **kwargs)
139 **self.filter_sk_params(self.build_fn.__call__))
140 else:
--> 141 self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
142
143 loss_name = self.model.loss
c:\Users\ER180124\Code\LoadForecasting\Methods\LSTM_1HL.py in create_model(optimizer, hl1_nodes, input_shape)
19 # LSTM layers followed by other LSTM layer must have the parameter "return_sequences" set at True
20 model = Sequential()
---> 21 model.add(LSTM(units = hl1_nodes , input_shape=input_shape, return_sequences=False))
22 model.add(Dense(1, activation="linear")) # output layer
23 model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['accuracy'])
~\.conda\envs\PierEnv\lib\site-packages\keras\legacy\interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~\.conda\envs\PierEnv\lib\site-packages\keras\layers\recurrent.py in __init__(self, units, activation, recurrent_activation, use_bias, kernel_initializer, recurrent_initializer, bias_initializer, unit_forget_bias, kernel_regularizer, recurrent_regularizer, bias_regularizer, activity_regularizer, kernel_constraint, recurrent_constraint, bias_constraint, dropout, recurrent_dropout, implementation, return_sequences, return_state, go_backwards, stateful, unroll, **kwargs)
2183 stateful=stateful,
2184 unroll=unroll,
-> 2185 **kwargs)
2186 self.activity_regularizer = regularizers.get(activity_regularizer)
2187
~\.conda\envs\PierEnv\lib\site-packages\keras\layers\recurrent.py in __init__(self, cell, return_sequences, return_state, go_backwards, stateful, unroll, **kwargs)
406 '(tuple of integers, '
407 'one integer per RNN state).')
--> 408 super(RNN, self).__init__(**kwargs)
409 self.cell = cell
410 self.return_sequences = return_sequences
~\.conda\envs\PierEnv\lib\site-packages\keras\engine\base_layer.py in __init__(self, **kwargs)
145 batch_size = None
146 batch_input_shape = (
--> 147 batch_size,) + tuple(kwargs['input_shape'])
148 self.batch_input_shape = batch_input_shape
149
TypeError: 'int' object is not iterable
I do not understand why in the last part of the error message I get batch_size = None, while I define a batch size that is a tuple.
Well, I think I got your problem.
When you do a CV search, the parameter grid is generated from your param dictionary, essentially as a cross product of the candidate values. Your dictionary has input_shape=(time_steps, inputs_numb), which is a sequence of two integers, so each candidate configuration gets an input_shape of either time_steps or inputs_numb. In the final line of the stack trace this becomes (None,) + time_steps or (None,) + inputs_numb, a tuple + int operation, which is invalid. Instead, you want your configuration space to contain only one possible input_shape: the whole tuple.
What you should do is convert this line
input_shape=(time_steps, inputs_numb)
to this:
input_shape=[(time_steps, inputs_numb)]
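That way the grid holds a single candidate for input_shape, and the whole tuple reaches create_model intact. A minimal sketch of the corrected grid, using the same names as in the question:
input_shape = [(time_steps, inputs_numb)]  # one candidate value: the whole tuple
param_grid = dict(optimizer=optimizers, hl1_nodes=hl1_nodes,
                  input_shape=input_shape, nb_epoch=epochs, batch_size=btcsz)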
I am doing text classification using a linear SVC model from sklearn. Now I want to visualize which words/tokens have the highest impact on the classification decision by using SHAP (https://github.com/slundberg/shap).
Right now this does not work because I am getting an error that seems to originate from the vectorizer step in the pipeline I have defined - what's wrong here?
Is my general approach on how to use SHAP in this case correct?
x_Train, x_Test, y_Train, y_Test = train_test_split(df_all['PDFText'], df_all['class'], test_size = 0.2, random_state = 1234)
pipeline = Pipeline([
(
'tfidv',
TfidfVectorizer(
ngram_range=(1,3),
analyzer='word',
strip_accents = ascii,
use_idf = True,
sublinear_tf=True,
max_features=6000,
min_df=2,
max_df=1.0
)
),
(
'lin_svc',
svm.SVC(
C=1.0,
probability=True,
kernel='linear'
)
)
])
pipeline.fit(x_Train, y_Train)
shap.initjs()
explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
shap_values = explainer.shap_values(x_Test, nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_Test.iloc[0,:])
This is the error message I get:
Provided model function fails when applied to the provided data set.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-4bca63616b3b> in <module>
3
4 # use Kernel SHAP to explain test set predictions
----> 5 explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
6 shap_values = explainer.shap_values(x_Test, nsamples=100)
7
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\explainers\kernel.py in __init__(self, model, data, link, **kwargs)
95 self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
96 self.data = convert_to_data(data, keep_index=self.keep_index)
---> 97 model_null = match_model_to_data(self.model, self.data)
98
99 # enforce our current input type limitations
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\common.py in match_model_to_data(model, data)
80 out_val = model.f(data.convert_to_df())
81 else:
---> 82 out_val = model.f(data.data)
83 except:
84 print("Provided model function fails when applied to the provided data set.")
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py in predict_proba(self, X)
379 for name, transform in self.steps[:-1]:
380 if transform is not None:
--> 381 Xt = transform.transform(Xt)
382 return self.steps[-1][-1].predict_proba(Xt)
383
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents, copy)
1631 check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
1632
-> 1633 X = super(TfidfVectorizer, self).transform(raw_documents)
1634 return self._tfidf.transform(X, copy=False)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
1084
1085 # use the same matrix-building strategy as fit_transform
-> 1086 _, X = self._count_vocab(raw_documents, fixed_vocab=True)
1087 if self.binary:
1088 X.data.fill(1)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
940 for doc in raw_documents:
941 feature_counter = {}
--> 942 for feature in analyze(doc):
943 try:
944 feature_idx = vocabulary[feature]
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
326 tokenize)
327 return lambda doc: self._word_ngrams(
--> 328 tokenize(preprocess(self.decode(doc))), stop_words)
329
330 else:
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
KernelExplainer expects to receive a classification model as the first argument. Please check the use of Pipeline with Shap following the link.
In your case, you can use the Pipeline as follows:
x_Train = pipeline.named_steps['tfidv'].fit_transform(x_Train)
explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train)
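Continuing that idea, a minimal sketch of the rest of the workflow (illustrative only: it assumes you also transform x_Test with the fitted vectorizer before computing the SHAP values, and the variable names x_Train_vec / x_Test_vec are mine; depending on your SHAP version you may need to densify the matrices with .toarray() first):
x_Train_vec = pipeline.named_steps['tfidv'].fit_transform(x_Train)
x_Test_vec = pipeline.named_steps['tfidv'].transform(x_Test)

explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train_vec)
shap_values = explainer.shap_values(x_Test_vec, nsamples=100)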
I keep getting this error when performing this with GridSearchCV and the scoring value 'roc_auc' ('f1', 'precision' and 'recall' work fine).
# Construct a pipeline
pipe = Pipeline([
('reduce_dim',PCA()),
('rf',RandomForestClassifier(min_samples_leaf=5,random_state=123))
])
N_FEATURES_OPTIONS = [2] # for PCA [2, 4, 8]
# these below param is for RandomForestClassifier
N_ESTIMATORS = [10,50] # 10,50,100
MAX_DEPTH = [5,6] # 5,6,7,8,9
MIN_SAMPLE_LEAF = 5
param_grid = [
{
'reduce_dim': [PCA()],
'reduce_dim__n_components': N_FEATURES_OPTIONS,
'rf__n_estimators' : N_ESTIMATORS,
'rf__max_depth': MAX_DEPTH
},
{
'reduce_dim': [SelectKBest(f_classif)],
'reduce_dim__k': N_FEATURES_OPTIONS,
'rf__n_estimators' : N_ESTIMATORS,
'rf__max_depth': MAX_DEPTH
},
]
grid = GridSearchCV(pipe, param_grid= param_grid, cv =10,n_jobs=1,scoring = 'roc_auc')
grid.fit(X_train_s,y_train_s)
And I get this error
AttributeError Traceback (most recent call last)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
186 try:
--> 187 y_pred = clf.decision_function(X)
188
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in __get__(self, obj, type)
108 else:
--> 109 getattr(delegate, self.attribute_name)
110 break
AttributeError: 'RandomForestClassifier' object has no attribute 'decision_function'
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
<ipython-input-16-86491f3b6aa7> in <module>()
----> 1 grid.fit(X_train_s,y_train_s)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
195
196 if y_type == "binary":
--> 197 y_pred = y_pred[:, 1]
198 elif isinstance(y_pred, list):
199 y_pred = np.vstack([p[:, -1] for p in y_pred]).T
IndexError: index 1 is out of bounds for axis 1 with size 1
I have looked up this error and found a somewhat similar problem with KerasClassifier, but I have no idea how to fix it:
Keras Wrappers for Scikit Learn - AUC scorer is not working
Can anyone explain to me what is wrong?
The error can have several causes:
If you have only one target class: it fails.
If you have >= 3 target classes: it fails.
Maybe you have 2 classes, and in one fold of the CV the test labels happen to come from only one class.
When sklearn computes the AUC metric it must see exactly 2 classes, because the method for getting the AUC (computing tpr and fpr over all thresholds) only works with two classes.
Example of errors:
grid.fit(np.random.rand(100,2), np.random.randint(1, size=100)) #one class labels
grid.fit(np.random.rand(100,2), np.random.randint(3, size=100)) #3 class labels
#BOTH Throws same error when computing AUC
Example that should not throw an error, although it still could, depending on the folds of the CV:
grid.fit(np.random.rand(100,2), np.random.randint(2, size=100)) #two class labels
#This shouldnt throw an error
SOLUTION
If you have more than 2 classes: you have to compute the AUC manually (or maybe there are libraries for it, but I don't know of one), either one-vs-all, where you compute a binary AUC for each class against all the others and then take the mean, or all-vs-all (pairwise) AUC, where you compute the AUC for each pair of classes and average those.
If you have 2 classes:
grid = GridSearchCV(pipe, param_grid= param_grid, cv = StratifiedKFold(), n_jobs=1, scoring = 'roc_auc')
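If you do have more than 2 classes and a reasonably recent scikit-learn (0.22 or newer), one option is to build the one-vs-rest AUC scorer yourself with make_scorer; a minimal sketch, assuming the final estimator exposes predict_proba (as RandomForestClassifier does):
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# macro-averaged one-vs-rest AUC; needs_proba makes the search call predict_proba
ovr_auc = make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr')

grid = GridSearchCV(pipe, param_grid=param_grid, cv=StratifiedKFold(),
                    n_jobs=1, scoring=ovr_auc)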
When attempting to fit my training data using either GridSearchCV or RandomizedSearchCV, I keep getting the following error:
TypeError: no supported conversion for types: (dtype('O'), dtype('O'))
Here's a sample of the relevant code:
from xgboost.sklearn import XGBRegressor as XGR
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
xgbRegModel = XGR()
params = {'max_depth':[3, 6, 9], 'learning_rate':[.05, .1, .5], 'n_estimators': [50, 100, 200]}
rscv = RandomizedSearchCV(xgbRegModel, params)
rscv.fit(X, y)
rscv.best_model_
where X is a (39942, 112577) scipy.sparse.csr.csr_matrix and y is a (39942,) numpy.ndarray.
The dtypes are all either int64 or float64, and I've tried running it both with np.nan values and after filling the np.nan values with 0... (I thought that might be the problem, but no.)
Can anyone tell me what's going on here? It works just fine when I train the model without using GridSearchCV or RandomizedSearchCV.
Any ideas would be appreciated - thanks!
PS: the traceback for the error is really long, but here it is, in case it helps:
TypeError Traceback (most recent call last)
<ipython-input-54-63d54d4cd03e> in <module>()
3 xgbRegModel = XGR()
4 rscv = RandomizedSearchCV(xgbRegModel, params)
----> 5 rscv.fit(X, y)
6 rscv.best_model_
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
636 error_score=self.error_score)
637 for parameters, (train, test) in product(candidate_params,
--> 638 cv.split(X, y, groups)))
639
640 # if one choose to see train score, "out" will contain train score info
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
425 start_time = time.time()
426
--> 427 X_train, y_train = _safe_split(estimator, X, y, train)
428 X_test, y_test = _safe_split(estimator, X, y, test, train)
429
~\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198 X_subset = X[np.ix_(indices, train_indices)]
199 else:
--> 200 X_subset = safe_indexing(X, indices)
201
202 if y is not None:
~\Anaconda3\lib\site-packages\sklearn\utils\__init__.py in safe_indexing(X, indices)
160 return X.take(indices, axis=0)
161 else:
--> 162 return X[indices]
163 else:
164 return [X[idx] for idx in indices]
~\Anaconda3\lib\site-packages\scipy\sparse\csr.py in __getitem__(self, key)
315 if isintlike(col) or isinstance(col,slice):
316 P = extractor(row, self.shape[0]) # [[1,2],j] or [[1,2],1:2]
--> 317 extracted = P * self
318 if col == slice(None, None, None):
319 return extracted
~\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
367 if self.shape[1] != other.shape[0]:
368 raise ValueError('dimension mismatch')
--> 369 return self._mul_sparse_matrix(other)
370
371 # If it's a list or whatever, treat it like a matrix
~\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
539 indptr = np.asarray(indptr, dtype=idx_dtype)
540 indices = np.empty(nnz, dtype=idx_dtype)
--> 541 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
542
543 fn = getattr(_sparsetools, self.format + '_matmat_pass2')
~\Anaconda3\lib\site-packages\scipy\sparse\sputils.py in upcast(*args)
49 return t
50
---> 51 raise TypeError('no supported conversion for types: %r' % (args,))
52
53
TypeError: no supported conversion for types: (dtype('O'), dtype('O'))
That's because GridSearchCV doesn't support sparse matrices in the fit() method. Please have a look at the signature of the fit method here:
Parameters:
X : array-like, shape = [n_samples, n_features]
As you see, it's written that only array-like inputs are supported.
As for why it works normally without grid search, that's because XGBRegressor supports sparse matrices.
The actual error arises during cross-validation, when X is split into train and test folds, which doesn't work for sparse matrices the same way as for normal arrays.
Also, make sure that for XGBRegressor the sparse matrix is of type CSC and not CSR as you have now, because otherwise it can give you wrong results. It's described here: https://github.com/dmlc/xgboost/issues/1238
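A minimal sketch of what this suggests in code (illustrative only: densifying a matrix of this size may not fit in memory, in which case you would reduce the feature count first):
import numpy as np

# array-like input for the parameter search (memory permitting)
X_dense = X.toarray()
rscv.fit(X_dense, y)

# for a plain XGBRegressor fit on sparse data, prefer CSC over CSR
X_csc = X.tocsc()
xgbRegModel.fit(X_csc, y)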