xgb.train(): TypeError: float() argument must be a string or a number, not 'DMatrix' - python-3.x

When I look at the documentation, the argument is supposed to be a 'DMatrix' (xgboost version 1.5.0).
https://xgboost.readthedocs.io/en/latest/python/python_api.html#:~:text=Customized%20objective%20function.-,Learning%20API,num_boost_round%20(int)%20%E2%80%93%20Number%20of%20boosting%20iterations,-.
Indicates pretty much the same thing for the version I'm using (goto subheading '1.2.2 Python' in document link below):
https://xgboost.readthedocs.io/_/downloads/en/release_1.3.0/pdf/
I don't understand why it is asking for a float argument when it is supposed to be a DMatrix.
I've looked at all the Stack posts that have the string 'TypeError: float() argument must be a string or a number, not...', but none of them include 'DMatrix' and I have not been able to find a solution that I could adapt this particular issue.
The the following is the bit of code that elicits this error (go to 'clf - xgb.train(...)'):
def grid_search(timeout_seconds, cv_splits, num_boost_round):
# Read input data
X, y = preprocessing()
y.replace({1:0,2:1,3:2,4:3,5:4,6:5,7:6,8:7,9:8,10:9,11:10,12:11,13:12,14:13,
15:14,16:15,17:16,18:17,19:18,20:19,21:20,22:21}, inplace = True)
# Create dataframe to collect the results
tests_columns = ["test_nr", "cv_mean", "cv_min", "cv_max", "cv_median", "params"]
test_id = 0
tests = pd.DataFrame(columns=tests_columns)
# Cross validation number of splits
kf = KFold(n_splits=cv_splits)
# Execute until timeout occurs
with timeout(timeout_seconds, exception=RuntimeError):
# Get the grid
grid_iter, keys, length = get_grid_iterable()
try:
# For every element of the grid
for df_grid in grid_iter:
# Prepare a list to collect the scores
score = []
params = dict(zip(keys, df_grid))
# The objective function
params["objective"] = "multi:softprob"
params['num_class'] = 22
print('X.reason_action_converted: ', X.reason_action_converted)
# For each fold, train XGBoost and spit out the results
for train_index, test_index in kf.split(X.values):
# Get X train and X test
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
**# Get y train and y test**
y_train, y_test = y.iloc[train_index], y.iloc[test_index]
# Convert into DMatrix
d_train = xgb.DMatrix(X_train, label=y_train, missing=np.NaN)
d_valid = xgb.DMatrix(X_test, label=y_test, missing=np.NaN)
d_test = xgb.DMatrix(X_test, missing=np.NaN)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
# Create the classifier using the current grid params. Apply early stopping of 50 rounds
'''clf = xgb.train(params, d_train, boosting_rounds, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)'''
**clf = xgb.train(params, d_train, num_boost_round, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)**
y_hat = clf.predict(d_test)
# Append Scores on the fold kept out
score.append(r2_score(y_test, y_hat))
# Store the result into a dataframe
score_df = pd.DataFrame(columns=tests_columns, data=[
[test_id, np.mean(score), np.min(score), np.max(score), np.median(score),
json.dumps(dict(zip(keys, [str(g) for g in df_grid])))]])
test_id += 1
tests = pd.concat([tests, score_df])
except RuntimeError:
# When timeout occurs an exception is raised and the main cycle is broken
pass
# Spit out the results
tests.to_csv("grid-search.csv", index=False)
print(tests)
**if __name__ == "__main__":
grid_search(timeout_seconds=3600, cv_splits=4, num_boost_round=500)**
The error message:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<command-3902447645915365> in <module>
106
107 if __name__ == "__main__":
--> 108 grid_search(timeout_seconds=3600,
cv_splits=4, num_boost_round=500)
<command-3902447645915365> in grid_search(timeout_seconds, cv_splits, num_boost_round)
84 # Create the classifier using the current grid params. Apply early stopping of 50 rounds
85 '''clf = xgb.train(params,
d_train, boosting_rounds, watchlist,
early_stopping_rounds=50, feval=log_loss,
maximize=True, verbose_eval=10)'''
---> 86 clf = xgb.train(params,
d_train, num_boost_round, watchlist,
early_stopping_rounds=50, feval=log_loss,
maximize=True, verbose_eval=10)
87 y_hat = clf.predict(d_test)
88
/databricks/python/lib/python3.8/site-
packages/xgboost/training.py in train(params, dtrain,
num_boost_round, evals, obj, feval, maximize,
early_stopping_rounds, evals_result, verbose_eval,
xgb_model, callbacks)
204 Booster : a trained booster model
205 """
--> 206 bst = _train_internal(params, dtrain,
207
num_boost_round=num_boost_round,
208 evals=evals,
/databricks/python/lib/python3.8/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
107 nboost += 1
108 # check evaluation result.
--> 109 if callbacks.after_iteration(bst, i,
dtrain, evals):
110 break
111 # do checkpoint after evaluation, in
case evaluation also updates
/databricks/python/lib/python3.8/site-
packages/xgboost/callback.py in after_iteration(self,
model, epoch, dtrain, evals)
421 for _, name in evals:
422 assert name.find('-') == -1,
'Dataset name should not contain `-`'
--> 423 score = model.eval_set(evals,
epoch, self.metric)
424 score = score.split()[1:] # into
datasets
425 # split up `test-error:0.1234`
/databricks/python/lib/python3.8/site-
packages/xgboost/core.py in eval_set(self, evals,
iteration, feval)
1350 if feval is not None:
1351 for dmat, evname in evals:
-> 1352 feval_ret =
feval(self.predict(dmat, training=False,
1353
output_margin=True), dmat)
1354 if isinstance(feval_ret, list):
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in inner_f(*args,
**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in
zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/databricks/python/lib/python3.8/site-
packages/sklearn/metrics/_classification.py in
log_loss(y_true, y_pred, eps, normalize, sample_weight,
labels)
2184 The logarithm used is the natural logarithm
(base-e).
2185 """
-> 2186 y_pred = check_array(y_pred,
ensure_2d=False)
2187 check_consistent_length(y_pred, y_true,
sample_weight)
2188
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in inner_f(*args,
**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in
zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
/databricks/python/lib/python3.8/site-
packages/sklearn/utils/validation.py in
check_array(array, accept_sparse, accept_large_sparse,
dtype, order, copy, force_all_finite, ensure_2d,
allow_nd, ensure_min_samples, ensure_min_features,
estimator)
636 # make sure we actually converted to
numeric:
637 if dtype_numeric and array.dtype.kind
== "O":
--> 638 array = array.astype(np.float64)
639 if not allow_nd and array.ndim >= 3:
640 raise ValueError("Found array with
dim %d. %s expected <= 2."
TypeError: float() argument must be a string or a number, not 'DMatrix'
I'm using Databricks, Python 3.8.8, and xgboost 1.3.1.
I am trying to adapt code from the following tutorial: Effortless Hyperparameters Tuning with Apache Spark.

Related

XGBoost: While using the `eval_set` in .fit causing Error

I'm trying to train the model using Xgboost. The code is doing split using KFold. And for each fold, it's running the Xgboost model using fit. Within the fit function, I'm trying to evaluate both train and valid data to check if the errors. And then doing the prediction in test set.
I'm running the following code using Xgboost.
kf = GroupKFold(n_splits=4)
for trn_idx, test_idx in kf.split(X, groups=X.year) :
x_train, x_valid = X.iloc[trn_idx], X.iloc[test_idx]
y_train, y_valid = y.iloc[trn_idx], y.iloc[test_idx]
xgb_model = xgb.XGBRegressor(
booster = 'dart',
eta = 0.1,
gamma = 0,
colsample_bytree = 0.7,
n_estimators = 1200,
max_depth = 1,
reg_alpha = 1.1,
reg_lambda = 1.1,
subsample = 0.03,
eval_metric=my_smape)
xgb_model.fit(x_train, y_train,
eval_set=[(x_train, y_train), (x_valid,y_valid)], early_stopping_rounds=20,
verbose=True)
But I'm getting the following error. I checked this doc, and my code is according to the doc. Can someone please help me find the solution?
AttributeError Traceback (most recent call last)
<ipython-input-38-81b11a21472c> in <module>
23 eval_metric=my_smape)
24
---> 25 xgb_model.fit(x_train, y_train,
26 eval_set=[(x_valid,y_valid)], early_stopping_rounds=20,
27 verbose=True)
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
959 xgb_model, eval_metric, params, early_stopping_rounds, callbacks
960 )
--> 961 self._Booster = train(
962 params,
963 train_dmatrix,
D:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
573 for k, arg in zip(sig.parameters, args):
574 kwargs[k] = arg
--> 575 return f(**kwargs)
576
577 return inner_f
D:\Anaconda\lib\site-packages\xgboost\training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)
180 break
181 bst.update(dtrain, i, obj)
--> 182 if cb_container.after_iteration(bst, i, dtrain, evals):
183 break
184
D:\Anaconda\lib\site-packages\xgboost\callback.py in after_iteration(self, model, epoch, dtrain, evals)
237 for _, name in evals:
238 assert name.find('-') == -1, 'Dataset name should not contain `-`'
--> 239 score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
240 splited = score.split()[1:] # into datasets
241 # split up `test-error:0.1234`
D:\Anaconda\lib\site-packages\xgboost\core.py in eval_set(self, evals, iteration, feval, output_margin)
1860 if feval is not None:
1861 for dmat, evname in evals:
-> 1862 feval_ret = feval(
1863 self.predict(dmat, training=False, output_margin=output_margin), dmat
1864 )
D:\Anaconda\lib\site-packages\xgboost\sklearn.py in inner(y_score, dmatrix)
99 def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
100 y_true = dmatrix.get_label()
--> 101 return func.__name__, func(y_true, y_score)
102 return inner
103
AttributeError: '_PredictScorer' object has no attribute '__name__'
It looks like you've run make_scorer() on your custom metric. Try supplying the original function as eval_metric instead, this should fix the issue.

ValueError: Unable to create tensor, you should probably activate padding with 'padding=True'

I am trying to evaluate facebook/hubert-base-ls9601 Huggingface pre-trained model after fine-tuning on a private dataset.
I am using facebook/hubert-base-ls9601 pre-trained model, and Wav2vec2 feature extractor, and pooling mode set to mean.
Here's the evaluation code:
test_dataset = load_dataset("csv", data_files={"test": "/content/drive/MyDrive/freelancing/test.csv"}, delimiter="\t")["test"]
def speech_file_to_array_fn(batch):
speech_array, sampling_rate = torchaudio.load(batch["path"])
resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
speech = resampler(speech_array).squeeze().numpy()
batch["speech"] = speech_array
return batch
def predict(batch):
features = feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt", padding=True)
input_values = features.input_values.to(device)
with torch.no_grad():
logits = model(input_values).logits
pred_ids = torch.argmax(logits, dim=-1).detach().cpu().numpy()
batch["predicted"] = pred_ids
return batch
test_dataset = test_dataset.map(speech_file_to_array_fn)
result = test_dataset.map(predict, batched=True, batch_size=2)
On the last line of code, I encounter the following error block:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py in convert_to_tensors(self, tensor_type)
168 if not is_tensor(value):
--> 169 tensor = as_tensor(value)
170
ValueError: could not broadcast input array from shape (2,220683) into shape (2,)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
12 frames
<ipython-input-73-7bd88adad349> in <module>()
----> 1 result = test_dataset.map(predict, batched=True, batch_size=2)
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in map(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)
1970 new_fingerprint=new_fingerprint,
1971 disable_tqdm=disable_tqdm,
-> 1972 desc=desc,
1973 )
1974 else:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
517 self: "Dataset" = kwargs.pop("self")
518 # apply actual function
--> 519 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
520 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
521 for dataset in datasets:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in wrapper(*args, **kwargs)
484 }
485 # apply actual function
--> 486 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
487 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
488 # re-apply format to the output
/usr/local/lib/python3.7/dist-packages/datasets/fingerprint.py in wrapper(*args, **kwargs)
456 # Call actual function
457
--> 458 out = func(self, *args, **kwargs)
459
460 # Update fingerprint of in-place transforms + update in-place history of transforms
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in _map_single(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset, disable_tqdm, desc, cache_only)
2340 indices,
2341 check_same_num_examples=len(input_dataset.list_indexes()) > 0,
-> 2342 offset=offset,
2343 )
2344 except NumExamplesMismatchError:
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples, offset)
2217 if with_rank:
2218 additional_args += (rank,)
-> 2219 processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
2220 if update_data is None:
2221 # Check if the function returns updated examples
/usr/local/lib/python3.7/dist-packages/datasets/arrow_dataset.py in decorated(item, *args, **kwargs)
1912 )
1913 # Use the LazyDict internally, while mapping the function
-> 1914 result = f(decorated_item, *args, **kwargs)
1915 # Return a standard dict
1916 return result.data if isinstance(result, LazyDict) else result
<ipython-input-71-6f845da29c00> in predict(batch)
11
12 def predict(batch):
---> 13 features = feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt", padding=True)
14
15 input_values = features.input_values.to(device)
/usr/local/lib/python3.7/dist-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.py in __call__(self, raw_speech, padding, max_length, truncation, pad_to_multiple_of, return_attention_mask, return_tensors, sampling_rate, **kwargs)
200 truncation=truncation,
201 pad_to_multiple_of=pad_to_multiple_of,
--> 202 return_attention_mask=return_attention_mask,
203 )
204
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_sequence_utils.py in pad(self, processed_features, padding, max_length, truncation, pad_to_multiple_of, return_attention_mask, return_tensors)
230 batch_outputs[key].append(value)
231
--> 232 return BatchFeature(batch_outputs, tensor_type=return_tensors)
233
234 def _pad(
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py in __init__(self, data, tensor_type)
78 def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
79 super().__init__(data)
---> 80 self.convert_to_tensors(tensor_type=tensor_type)
81
82 def __getitem__(self, item: str) -> Union[Any]:
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py in convert_to_tensors(self, tensor_type)
174 raise ValueError("Unable to create tensor returning overflowing values of different lengths. ")
175 raise ValueError(
--> 176 "Unable to create tensor, you should probably activate padding "
177 "with 'padding=True' to have batched tensors with the same length."
178 )
ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.
I am working on Google Colab. Those are the environment variables:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
%env CUDA_LAUNCH_BLOCKING=1
The padding is already activated in the predict function.
Can you please help me fix it?

Problem in getting the accuracy 'train_score' and 'test_score' using Random Forest regressor

I tried to run the code below, but I have received the error below. I have a problem to get the 'train_score' and 'test_score'. I would be happy if you can advise me how to fix this error.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import forest
import sklearn.ensemble
mse_train1 = []
mse_test1 = []
num_trees1 = []
train_scores, test_scores = list(), list()
model = RandomForestRegressor (n_estimators = 1 ,min_samples_leaf = 7, n_jobs=-1, oob_score=True,random_state=0)
i = 0
for iter in range(10):
num_trees1.append(iter)
i=+1
model.fit(train_set_RF, train_set_pred)
y_train_predicted = model.predict(train_set_RF)
train_score = model.score(train_set_pred, y_train_predicted)
train_scores.append(train_score)
y_test_predicted = model.predict(test_set_RF)
test_score = model.score(test_set_pred, y_test_predicted)
test_scores.append(test_score)
mse_train = mean_squared_error(train_set_pred, y_train_predicted)
mse_train1.append(mse_train)
mse_test = mean_squared_error(test_set_pred, y_test_predicted)
mse_test1.append(mse_test)
print("Iteration: {} Train mse: {} Test mse: {}".format(iter, mse_train, mse_test))
model.n_estimators += 1
print (train_scores)
print (test_scores)
print (mse_train1)
print (mse_test1)
This more detail about the error I got:
ValueError Traceback (most recent call
last)
<ipython-input-17-ff545aa1896c> in <module>
19 y_train_predicted = model.predict(train_set_RF)
20 #y_train_predicted =
np.nan_to_num(y_train_predicted.astype(np.float32))
---> 21 train_score = model.score(train_set_pred, y_train_predicted)
22
23 #train_acc = accuracy_score(train_set_pred,
y_train_predicted)
~\anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y,
sample_weight)
549
550 from .metrics import r2_score
--> 551 y_pred = self.predict(X)
552 return r2_score(y, y_pred, sample_weight=sample_weight)
553
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in
predict(self, X)
781 check_is_fitted(self)
782 # Check data
--> 783 X = self._validate_X_predict(X)
784
785 # Assign chunk of trees to jobs
~\anaconda3\lib\site-packages\sklearn\ensemble\_forest.py in
_validate_X_predict(self, X)
419 check_is_fitted(self)
420
--> 421 return self.estimators_[0]._validate_X_predict(X,
check_input=True)
422
423 #property
~\anaconda3\lib\site-packages\sklearn\tree\_classes.py in
_validate_X_predict(self, X, check_input)
386 """Validate X whenever one tries to predict, apply,
predict_proba"""
387 if check_input:
--> 388 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
389 if issparse(X) and (X.indices.dtype != np.intc or
390 X.indptr.dtype != np.intc):
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters,
args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in
check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy,
force_all_finite, ensure_2d, allow_nd, ensure_min_samples,
ensure_min_features, estimator)
622 "Reshape your data either using
array.reshape(-1, 1) if "
623 "your data has a single feature or
array.reshape(1, -1) "
--> 624 "if it contains a single sample.".format(array))
625
626 # in the future np.flexible dtypes will be handled like
object dtypes
ValueError: Expected 2D array, got 1D array instead:
array=[0.3119313 0.29728386 0.29309732 ... 0.30558413 0.29317933
0.29755104].
Reshape your data either using array.reshape(-1, 1) if your data has a
single feature or array.reshape(1, -1) if it contains a single sample.

How can I use a keras callback in a sklearn pipeline?

I am trying to create a simple multy-layer perceptron (MLP) using Keras.
In order to avoid data leakage I am using a pipeline in a cross-validation routine.
To do that I have to use a keras wrapper; everything is working fine unless I do not put a TensorBoard callback into the wrapper.
I read tons of stackoverflow answers and it looks that my code is correct but I get the following error:
> RuntimeError: Cannot clone object <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x00000245DD5C2A60>, as the constructor either does not set or modifies parameter callbacks
Below my code:
#Network and training parameters
EPOCHS = 100
BATCH_SIZE = 16
VERBOSE = 0
INPUT_SHAPE = (Xtr.shape[1],)
OUTPUT_SHAPE = 1 #number of outputs
N_HIDDEN = 8
def build_mlp(n_hidden, input_shape, output_shape):
#Build the model
model = tf.keras.models.Sequential()
model.add(keras.layers.Dense(units = n_hidden,
input_shape = input_shape,
name = 'dense_layer_1',
activation = 'relu'))
model.add(keras.layers.Dense(units = output_shape,
name ='output_layer',
activation = 'sigmoid'))
model.compile(optimizer='Adam',
loss='binary_crossentropy',
metrics=['accuracy'])
return model
#TensorBoard
import datetime
LOG_DIR = "logs/MLP_anomaly/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
CALLBACKS = [tf.keras.callbacks.TensorBoard(log_dir = LOG_DIR)]
#create a wrapper to use sklearn pipelines
sk_model = tf.keras.wrappers.scikit_learn.KerasClassifier(build_fn=build_mlp,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
callbacks = CALLBACKS,
verbose=VERBOSE,
n_hidden = N_HIDDEN,
input_shape = INPUT_SHAPE,
output_shape = OUTPUT_SHAPE)
#use a pipeline
pipe = Pipeline([('scaler', MinMaxScaler()), ('mlp', sk_model)])
#cross-validation
n_splits, n_repeats = 3, 1
cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
cv_rslt = cross_validate(pipe, Xtrx, Ytr, cv=cv,
return_train_score = True,
scoring = 'accuracy',
return_estimator = True)
The full error I am getting is:
> ---------------------------------------------------------------------------
Empty Traceback (most recent call last)
~\.conda\envs\PrognosticEnv\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
819 try:
--> 820 tasks = self._ready_batches.get(block=False)
821 except queue.Empty:
~\.conda\envs\PrognosticEnv\lib\queue.py in get(self, block, timeout)
166 if not self._qsize():
--> 167 raise Empty
168 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call last)
<ipython-input-12-47de7339b00e> in <module>
2 n_splits, n_repeats = 3, 1
3 cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=seed)
----> 4 cv_rslt = cross_validate(pipe, Xtrx, Ytr, cv=cv,
5 return_train_score = True,
6 scoring = 'accuracy',
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
240 parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
241 pre_dispatch=pre_dispatch)
--> 242 scores = parallel(
243 delayed(_fit_and_score)(
244 clone(estimator), X, y, scorers, train, test, verbose, None,
~\.conda\envs\PrognosticEnv\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~\.conda\envs\PrognosticEnv\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
829 big_batch_size = batch_size * n_jobs
830
--> 831 islice = list(itertools.islice(iterator, big_batch_size))
832 if len(islice) == 0:
833 return False
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\model_selection\_validation.py in <genexpr>(.0)
242 scores = parallel(
243 delayed(_fit_and_score)(
--> 244 clone(estimator), X, y, scorers, train, test, verbose, None,
245 fit_params, return_train_score=return_train_score,
246 return_times=True, return_estimator=return_estimator,
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\base.py in clone(estimator, safe)
85 new_object_params = estimator.get_params(deep=False)
86 for name, param in new_object_params.items():
---> 87 new_object_params[name] = clone(param, safe=False)
88 new_object = klass(**new_object_params)
89 params_set = new_object.get_params(deep=False)
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\base.py in clone(estimator, safe)
66 # XXX: not handling dictionaries
67 if estimator_type in (list, tuple, set, frozenset):
---> 68 return estimator_type([clone(e, safe=safe) for e in estimator])
69 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
70 if not safe:
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\base.py in <listcomp>(.0)
66 # XXX: not handling dictionaries
67 if estimator_type in (list, tuple, set, frozenset):
---> 68 return estimator_type([clone(e, safe=safe) for e in estimator])
69 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
70 if not safe:
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\base.py in clone(estimator, safe)
66 # XXX: not handling dictionaries
67 if estimator_type in (list, tuple, set, frozenset):
---> 68 return estimator_type([clone(e, safe=safe) for e in estimator])
69 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
70 if not safe:
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\base.py in <listcomp>(.0)
66 # XXX: not handling dictionaries
67 if estimator_type in (list, tuple, set, frozenset):
---> 68 return estimator_type([clone(e, safe=safe) for e in estimator])
69 elif not hasattr(estimator, 'get_params') or isinstance(estimator, type):
70 if not safe:
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\.conda\envs\PrognosticEnv\lib\site-packages\sklearn\base.py in clone(estimator, safe)
94 param2 = params_set[name]
95 if param1 is not param2:
---> 96 raise RuntimeError('Cannot clone object %s, as the constructor '
97 'either does not set or modifies parameter %s' %
98 (estimator, name))
RuntimeError: Cannot clone object <tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x00000245DD5C2A60>, as the constructor either does not set or modifies parameter callbacks
I have already tried putting the callback like this:
pipe.set_params(mlp__callbacks=CALLBACKS);
or putting the callback in the fit_params attribute of the cross_validate function.
Nothing works for me.
Someone have some suggestion?
Thank you very much
So finally I found a solution, actually it is more a workaround.
I write it here wishing that it can be useful for some other ML practictioner.
The explanation of my problem is simple and can be explained in 3 steps:
sklearn do not provide a method to plot the training history of the model. I found something similar to the keras history only in the MLPclassifier that has an attribute loss_
tensorflow and keras do not provide crossvalidation and pipelines routines to avoid data-leakage (since usually in deep learning there is not room for CV)
wrapping a keras MLP using KerasClassifier and putting it in a sklearn pipeline is not useful because it is not possible to extrapolate the history of the classifier of the pipelin (when calling the fit function).
So finally I used the sklearn function plot_validation_curve to create a plot of the MLP loss function in function of the training epochs. In order to avoid data-leakage I used a pipeline and the cross validation method of sklearn.

How to use SHAP with a linear SVC model from sklearn using Pipeline?

I am doing text classification using a linear SVC model from sklearn. Now I want to visualize which words/tokens have the highest impact on the classification decision by using SHAP (https://github.com/slundberg/shap).
Right now this does not work because I am getting an error that seems to originate from the vectorizer step in the pipeline I have defined - whats wrong here?
Is my general approach on how to use SHAP in this case correct?
x_Train, x_Test, y_Train, y_Test = train_test_split(df_all['PDFText'], df_all['class'], test_size = 0.2, random_state = 1234)
pipeline = Pipeline([
(
'tfidv',
TfidfVectorizer(
ngram_range=(1,3),
analyzer='word',
strip_accents = ascii,
use_idf = True,
sublinear_tf=True,
max_features=6000,
min_df=2,
max_df=1.0
)
),
(
'lin_svc',
svm.SVC(
C=1.0,
probability=True,
kernel='linear'
)
)
])
pipeline.fit(x_Train, y_Train)
shap.initjs()
explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
shap_values = explainer.shap_values(x_Test, nsamples=100)
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_Test.iloc[0,:])
This is the error message I get:
Provided model function fails when applied to the provided data set.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-4bca63616b3b> in <module>
3
4 # use Kernel SHAP to explain test set predictions
----> 5 explainer = shap.KernelExplainer(pipeline.predict_proba, x_Train)
6 shap_values = explainer.shap_values(x_Test, nsamples=100)
7
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\explainers\kernel.py in __init__(self, model, data, link, **kwargs)
95 self.keep_index_ordered = kwargs.get("keep_index_ordered", False)
96 self.data = convert_to_data(data, keep_index=self.keep_index)
---> 97 model_null = match_model_to_data(self.model, self.data)
98
99 # enforce our current input type limitations
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\shap\common.py in match_model_to_data(model, data)
80 out_val = model.f(data.convert_to_df())
81 else:
---> 82 out_val = model.f(data.data)
83 except:
84 print("Provided model function fails when applied to the provided data set.")
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
116
117 # lambda, but not partial, allows help() to work with update_wrapper
--> 118 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
119 # update the docstring of the returned function
120 update_wrapper(out, self.fn)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\pipeline.py in predict_proba(self, X)
379 for name, transform in self.steps[:-1]:
380 if transform is not None:
--> 381 Xt = transform.transform(Xt)
382 return self.steps[-1][-1].predict_proba(Xt)
383
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents, copy)
1631 check_is_fitted(self, '_tfidf', 'The tfidf vector is not fitted')
1632
-> 1633 X = super(TfidfVectorizer, self).transform(raw_documents)
1634 return self._tfidf.transform(X, copy=False)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in transform(self, raw_documents)
1084
1085 # use the same matrix-building strategy as fit_transform
-> 1086 _, X = self._count_vocab(raw_documents, fixed_vocab=True)
1087 if self.binary:
1088 X.data.fill(1)
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
940 for doc in raw_documents:
941 feature_counter = {}
--> 942 for feature in analyze(doc):
943 try:
944 feature_idx = vocabulary[feature]
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(doc)
326 tokenize)
327 return lambda doc: self._word_ngrams(
--> 328 tokenize(preprocess(self.decode(doc))), stop_words)
329
330 else:
c:\users\s.p\appdata\local\programs\python\python37\lib\site-packages\sklearn\feature_extraction\text.py in <lambda>(x)
254
255 if self.lowercase:
--> 256 return lambda x: strip_accents(x.lower())
257 else:
258 return strip_accents
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
KernelExplainer expects to receive a classification model as the first argument. Please check the use of Pipeline with Shap following the link.
In your case, you can use the Pipeline as follows:
x_Train = pipeline.named_steps['tfidv'].fit_transform(x_Train)
explainer = shap.KernelExplainer(pipeline.named_steps['lin_svc'].predict_proba, x_Train)

Resources