Why does ndcg_score result in nan values? - python-3.x

Consider the following code:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ndcg_score, make_scorer
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import scipy.sparse as sp

X_data = pd.DataFrame(np.random.randint(0, 1, size=(100, 4)), columns=list('ABCD'))
X_data = sp.csr_matrix(X_data.to_numpy())
Y_data = pd.DataFrame(np.random.choice([0, 1, 5], 100), columns=['Y'])
# Set the parameters by cross-validation
param_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
              'C': [1, 10, 100, 1000]}
clf = GridSearchCV(SVC(), param_grid, scoring=ndcg_score, refit=True, verbose=3, n_jobs=-1, error_score='raise')
test = clf.fit(X_data, Y_data)
I am wondering why this would raise the following error:
Fitting 5 folds for each of 8 candidates, totalling 40 fits
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\externals\loky\process_executor.py", line 431, in _process_worker
r = call_item()
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\externals\loky\process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\metrics\_ranking.py", line 1564, in ndcg_score
y_true = check_array(y_true, ensure_2d=False)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 710, in check_array
array = array.astype(np.float64)
TypeError: float() argument must be a string or a number, not 'SVC'
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-45-93a8890b095c> in <module>
18
19 clf = GridSearchCV(SVC(), param_grid, scoring=ndcg_score, refit=True, verbose=3, n_jobs=-1, error_score='raise')
---> 20 test = clf.fit(X_data, Y_data)
21 #print(test.best_score_)
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\Anaconda3\envs\kaggleSVM\lib\concurrent\futures\_base.py in result(self, timeout)
442 raise CancelledError()
443 elif self._state == FINISHED:
--> 444 return self.__get_result()
445 else:
446 raise TimeoutError()
~\Anaconda3\envs\kaggleSVM\lib\concurrent\futures\_base.py in __get_result(self)
387 if self._exception:
388 try:
--> 389 raise self._exception
390 finally:
391 # Break a reference cycle with the exception in self._exception
TypeError: float() argument must be a string or a number, not 'SVC'
I am not quite sure why this would result in a TypeError.

I cannot recreate the error you are reporting, but note that passing the bare metric as scoring= is what produces your TypeError: GridSearchCV calls the scoring callable as scorer(estimator, X_test, y_test), so ndcg_score receives the fitted SVC as its y_true argument. Using error_score="raise" and n_jobs=1 (not strictly necessary, but the output is a little easier to read), and wrapping ndcg_score with make_scorer with needs_proba=True, I get this one:
Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead
which supports my first comment: NDCG assumes multilabel format. That suggests you need to consider whether NDCG is really appropriate for your task, and if so, either turn your problem into a multilabel one or write a custom scorer that converts the multiclass output into a multilabel (one-hot encoded) format before computing the score.
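As a minimal sketch of the second option (the helper name ndcg_from_proba is mine, not an established API; the labels (0, 1, 5) are the classes in your Y_data), one-hot encode y_true so it matches the shape of the probability matrix; note that SVC needs probability=True for a needs_proba scorer to work:
import numpy as np
from sklearn.metrics import ndcg_score, make_scorer
from sklearn.preprocessing import label_binarize

def ndcg_from_proba(y_true, y_proba, labels=(0, 1, 5)):
    # One-hot encode the multiclass labels so they match y_proba's shape.
    y_true_bin = label_binarize(np.ravel(y_true), classes=list(labels))
    return ndcg_score(y_true_bin, y_proba)

ndcg_scorer = make_scorer(ndcg_from_proba, needs_proba=True)
# e.g. GridSearchCV(SVC(probability=True), param_grid, scoring=ndcg_scorer, ...)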

Related

Result type Float can’t be cast to the desired output type Byte

I am trying to make a model to classify knee MRIs. I am using the torchio library, and I have successfully trained a classifier on the raw data. Now I am adding masks to the MRI object and I want to pass them to a model. I use the same code, but now I am getting this error:
RuntimeError Traceback (most recent call last)
<ipython-input-56-2a7d5ddf84fc> in <module>()
26 for epoch in range(1, num_epochs+1):
27
---> 28 train_loss, train_accuracy = training(epoch,model,train_loader,optimizer,criterion)
29 valid_loss, valid_accuracy = validation(epoch,model,valid_loader,criterion)
30
6 frames
<ipython-input-55-59a55a26f2b7> in training(epoch, model, train_loader, optimizer, criterion)
8 model.train()
9
---> 10 for i, batch in tqdm(enumerate(train_loader,0)):
11 images = batch['t1'][tio.DATA].cuda()
12 labels = batch['label']
/usr/local/lib/python3.7/dist-packages/tqdm/notebook.py in __iter__(self)
255 def __iter__(self):
256 try:
--> 257 for obj in super(tqdm_notebook, self).__iter__():
258 # return super(tqdm...) will not catch exception
259 yield obj
/usr/local/lib/python3.7/dist-packages/tqdm/std.py in __iter__(self)
1193
1194 try:
-> 1195 for obj in iterable:
1196 yield obj
1197 # Update and possibly print the progressbar.
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
1201 else:
1202 del self._task_info[idx]
-> 1203 return self._process_data(data)
1204
1205 def _try_put_index(self):
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _process_data(self, data)
1227 self._try_put_index()
1228 if isinstance(data, ExceptionWrapper):
-> 1229 data.reraise()
1230 return data
1231
/usr/local/lib/python3.7/dist-packages/torch/_utils.py in reraise(self)
432 # instantiate since we don't know how to
433 raise RuntimeError(msg) from None
--> 434 raise exception
435
436
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 74, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 56, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: result type Float can't be cast to the desired output type Byte
The training() function is this:
def training(epoch, model, train_loader, optimizer, criterion):
    "Training over an epoch"
    metric_monitor = MetricMonitor()
    model.train()
    for i, batch in tqdm(enumerate(train_loader, 0)):
        images = batch['t1'][tio.DATA].cuda()
        labels = batch['label'].cuda()
        if images.sum() != 0:
            output = F.softmax(model(images), dim=1)
            loss = criterion(output, labels)
            output = output.data.max(dim=1, keepdim=True)[1]
            output = output.view(-1)
            acc = calculate_acc(output, labels)
            metric_monitor.update("Loss", loss.item())
            metric_monitor.update("Accuracy", acc)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    print("[Epoch: {epoch:03d}] Train | {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor))
    return metric_monitor.metrics['Loss']['avg'], metric_monitor.metrics['Accuracy']['avg']
When I run the code, some batches pass but it stops on some batches and produces this error. I have checked my data and all of the images are torch.FloatTensor.
The labels are integers. Does anybody know how to fix this?

Error CUBLAS_STATUS_NOT_INITIALIZED with nn.Linear

import torch
from torch import nn
from transformers.models.bert.modeling_bert import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased")
self.x_head = nn.Linear(in_features=config.hidden_size, out_features=28)
word_embeddings = nn.Embedding(num_embeddings=979, embedding_dim=config.hidden_size, padding_idx=0)(ids_1).to(self.device)
vis_embeddings = nn.Embedding(num_embeddings=127, embedding_dim=config.hidden_size, padding_idx=0)(ids_2).to(self.device)
input_embeds = torch.cat([word_embeddings, vis_embeddings], dim=1).to(self.device)
In the forward() method I have the code below:
x_scores = self.x_head(input_embeds)
I am getting the error below at x_scores (the last line above):
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
166 return self.module(*inputs[0], **kwargs[0])
167 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 168 outputs = self.parallel_apply(replicas, inputs, kwargs)
169 return self.gather(outputs, self.output_device)
170
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
176
177 def parallel_apply(self, replicas, inputs, kwargs):
--> 178 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
179
180 def gather(self, outputs, output_device):
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
84 output = results[i]
85 if isinstance(output, ExceptionWrapper):
---> 86 output.reraise()
87 outputs.append(output)
88 return outputs
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
423 # have message field
424 raise self.exc_type(message=msg)
--> 425 raise self.exc_type(msg)
426
427
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "", line 56, in forward
x_scores = self.x_head(input_embeds)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 96, in forward
return F.linear(input, self.weight, self.bias)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/functional.py", line 1847, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling cublasCreate(handle)

Issue using an imbalanced dataset with logloss and RFECV

I am using an imbalanced dataset (54:38:7%) with RFECV for feature selection, like this:
# making a multi logloss metric
from sklearn.metrics import log_loss, make_scorer
from sklearn.feature_selection import RFECV
from lightgbm import LGBMClassifier

log_loss_rfe = make_scorer(score_func=log_loss, greater_is_better=False)
# initiating LightGBM classifier
lgb_rfe = LGBMClassifier(objective='multiclass', learning_rate=0.01, verbose=0, force_col_wise=True,
                         random_state=100, n_estimators=5_000, n_jobs=7)
# initiating RFECV
rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2, cv=3, scoring=log_loss_rfe)
# fitting it
rfe.fit(X=X_train, y=y_train)
And I got an error, presumably because the subsamples sklearn's RFECV has made don't contain all of the classes from my data. I had no issues fitting the very same data outside of RFECV.
Here's the complete error:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
r = call_item()
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/utils/fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 37, in _rfe_single_fit
return rfe._fit(
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 259, in _fit
self.scores_.append(step_score(estimator, features))
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py", line 39, in <lambda>
lambda estimator, features: _score(
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
scores = scorer(estimator, X_test, y_test)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 199, in __call__
return self._score(partial(_cached_call, None), estimator, X, y_true,
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 242, in _score
return self._sign * self._score_func(y_true, y_pred,
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/ubuntu/ds_jup_venv/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 2265, in log_loss
raise ValueError("y_true and y_pred contain different number of "
ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-9-5feb62a6f457> in <module>
1 rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2, cv=3, scoring=log_loss_rfe)
----> 2 rfe.fit(X=X_train, y=y_train)
~/ds_jup_venv/lib/python3.8/site-packages/sklearn/feature_selection/_rfe.py in fit(self, X, y, groups)
603 func = delayed(_rfe_single_fit)
604
--> 605 scores = parallel(
606 func(rfe, self.estimator, X, y, train, test, scorer)
607 for train, test in cv.split(X, y, groups))
~/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/ds_jup_venv/lib/python3.8/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/ds_jup_venv/lib/python3.8/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
1 frames
/usr/lib/python3.8/concurrent/futures/_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]
How to fix this to be able to select features recursively?
Log-loss needs the probability predictions, not the class predictions, so you should use
log_loss_rfe = make_scorer(score_func=log_loss, needs_proba=True, greater_is_better=False)
The error arises because, without that, the y_pred passed to log_loss is one-dimensional (the class labels 0, 1, 2), and sklearn assumes it is a binary problem with those values being probabilities of the positive class. It then stacks on the complementary probability of the negative class, leaving only two columns compared to your three classes.
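A tiny self-contained illustration (made-up numbers, not from the question) of why class predictions break a three-class log_loss:
from sklearn.metrics import log_loss

y_true = [0, 1, 2, 1]

# What predict() returns: 1-D class labels. log_loss treats a 1-D y_pred as
# probabilities of the positive class of a binary problem, stacks on a second
# column, and then sees 2 columns versus 3 classes:
# log_loss(y_true, [0, 2, 2, 1])  # ValueError: ... different number of classes 3, 2

# What predict_proba() returns: one probability column per class. This works.
y_proba = [[0.8, 0.1, 0.1],
           [0.1, 0.7, 0.2],
           [0.2, 0.2, 0.6],
           [0.1, 0.8, 0.1]]
print(log_loss(y_true, y_proba))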
Consider applying stratified cross-validation, which tries to preserve the fraction of samples for each class. Experiment with one of these scikit-learn cross-validators, replacing cv=3 in your RFECV with the chosen one (see the sketch below):
sklearn.model_selection.StratifiedKFold,
StratifiedShuffleSplit,
RepeatedStratifiedKFold.
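A minimal sketch of making the stratification explicit, reusing lgb_rfe and log_loss_rfe from above (note the Edit below: for a classifier, an integer cv already uses StratifiedKFold):
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

rfe = RFECV(estimator=lgb_rfe, min_features_to_select=2, verbose=3, n_jobs=2,
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=100),
            scoring=log_loss_rfe)
rfe.fit(X=X_train, y=y_train)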
Edit
I had missed the fact that StratifiedKFold is the default cross-validator in RFECV. Actually, the error is related to log_loss_rfe, which was defined with needs_proba=False (the default). Credit to @BenReiniger!

Hugging face - RuntimeError: Caught RuntimeError in replica 0 on device 0 on Azure Databricks

How do I run the run_language_modeling.py script from Hugging Face, using the pretrained roberta-base model, to fine-tune on my own data on Azure Databricks with a GPU cluster?
Using transformers versions 2.9.1 and 3.0.
Python 3.6
Torch 1.5.0
torchvision 0.6
This is the command I ran on Azure Databricks:
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm
This is the error I get after running the above command:
/dbfs/FileStore/tables/dev/run_language_modeling.py in <module>
279
280 if __name__ == "__main__":
--> 281 main()
/dbfs/FileStore/tables/dev/run_language_modeling.py in main()
243 else None
244 )
--> 245 trainer.train(model_path=model_path)
246 trainer.save_model()
247 # For convenience, we also re-save the tokenizer to the same directory,
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in train(self, model_path)
497 continue
498
--> 499 tr_loss += self._training_step(model, inputs, optimizer)
500
501 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in _training_step(self, model, inputs, optimizer)
620 inputs["mems"] = self._past
621
--> 622 outputs = model(**inputs)
623 loss = outputs[0] # model outputs are always tuple in transformers (see doc)
624
/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
153 return self.module(*inputs[0], **kwargs[0])
154 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 155 outputs = self.parallel_apply(replicas, inputs, kwargs)
156 return self.gather(outputs, self.output_device)
157
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
163
164 def parallel_apply(self, replicas, inputs, kwargs):
--> 165 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
166
167 def gather(self, outputs, output_device):
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
/databricks/python/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
393 # (https://bugs.python.org/issue2651), so we work around it.
394 msg = KeyErrorMessage(msg)
--> 395 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_roberta.py", line 239, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 762, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 439, in forward
output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 371, in forward
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 315, in forward
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 240, in forward
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 11.17 GiB total capacity; 10.68 GiB already allocated; 95.31 MiB free; 10.77 GiB reserved in total by PyTorch)
Please, how do I resolve this?
The out-of-memory error is likely caused by not cleaning up the session and/or not freeing up the GPU.
From a similar GitHub issue:
It is because the mini-batch of data does not fit onto GPU memory. Just decrease the batch size. When I set batch size = 256 for the cifar10 dataset I got the same error; then I set the batch size = 128, and it was solved.
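For this script, the batch size is a Trainer command-line flag. A hedged sketch of the same command with a smaller batch size (in transformers 2.x the flag is --per_gpu_train_batch_size, later renamed --per_device_train_batch_size; --gradient_accumulation_steps is an optional way to keep the effective batch size; all other arguments are unchanged from above):
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--per_gpu_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm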

TypeError: Issue with GridSearchCV

I am trying to use GridSearchCV and Pipeline to tune the parameters of an SVM. The code is as follows:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import joblib

parameter={'svm_C':(0.1, 1, 10, 100), 'svm_gamma':(0.001, 0.01, 0.1, 10)}
pipe=Pipeline([("scaler", StandardScaler), ("svm", SVC())])
print(parameter)
print(pipe)
print(xtrain.shape)
print(ytrain1.shape)
grid=GridSearchCV(pipe, parameter, cv=3, n_jobs=-1)
grid.fit(xtrain,ytrain1)
print("Best set score:{}".format(grid.best_score_))
print("Test set Score:{}".format(grid.score(xtest,ytest1)))
print("Best parameters:{}".format(grid.best_params_))
filename='finalized_svm.sav'
filename1='gridfinal_svm.sav'
joblib.dump(grid.best_estimator_, filename)
joblib.dump(grid, filename1)
pred=grid.predict(xtest)
confusion=confusion_matrix(ytest1,pred)
print(confusion)
I loaded some .mat files for xtrain and ytrain. The error is raised at the line grid.fit(xtrain, ytrain1).
The error generated is as follows:
Traceback (most recent call last):
File "C:/code net/mysvm.py", line 44, in <module>
grid.fit(xtrain,ytrain1)
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\model_selection\_search.py", line 626, in fit
base_estimator = clone(self.estimator)
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 62, in clone
new_object_params[name] = clone(param, safe=False)
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in clone
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 50, in <listcomp>
return estimator_type([clone(e, safe=safe) for e in estimator])
File "C:\Users\Manisha\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\base.py", line 60, in clone
new_object_params = estimator.get_params(deep=False)
TypeError: get_params() missing 1 required positional argument: 'self'
Keeping parameter={'svm_C':(0.1, 1, 10, 100), 'svm_gamma':(0.001, 0.01, 0.1, 10)} but instantiating the scaler,
pipe=Pipeline([("scaler", StandardScaler()), ("svm", SVC())]), the error becomes:
Traceback (most recent call last):
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\externals\loky\process_executor.py", line 420, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 563, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 261, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 261, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_validation.py", line 514, in _fit_and_score
estimator.set_params(**parameters)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\pipeline.py", line 147, in set_params
self._set_params('steps', **kwargs)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\utils\metaestimators.py", line 52, in _set_params
super(_BaseComposition, self).set_params(**params)
File "C:\Users\Manisha\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\base.py", line 213, in set_params
(key, self))
ValueError: Invalid parameter svm_C for estimator Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-7-83184f586b10> in <module>
2 print(ytrain1.shape)
3 grid1=GridSearchCV(pipe, parameter, cv=3, n_jobs=-1)
----> 4 grid1.fit(xtrain,ytrain1)
5 print("Best set score:{}".format(grid.best_score_))
6 print("Test set Score:{}".format(grid.score(xtest,ytest1)))
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
994
995 with self._backend.retrieval_context():
--> 996 self.retrieve()
997 # Make sure that we get a last message telling us we are done
998 elapsed_time = time.time() - self._start_time
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
897 try:
898 if getattr(self._backend, 'supports_timeout', False):
--> 899 self._output.extend(job.get(timeout=self.timeout))
900 else:
901 self._output.extend(job.get())
~\AppData\Local\conda\conda\envs\tfp36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
515 AsyncResults.get from multiprocessing."""
516 try:
--> 517 return future.result(timeout=timeout)
518 except LokyTimeoutError:
519 raise TimeoutError()
~\AppData\Local\conda\conda\envs\tfp36\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~\AppData\Local\conda\conda\envs\tfp36\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Invalid parameter svm_C for estimator Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1, probability=False, random_state=None,
shrinking=True, tol=0.001, verbose=False))]). Check the list of available parameters with `estimator.get_params().keys()`.
I have done what you want to do in this way, and it is working:
# Setup the pipeline
steps = [('scaler', StandardScaler()),
         ('SVM', SVC())]
pipeline = Pipeline(steps)

# Specify the hyperparameter space
parameters = {'SVM__C': [1, 10, 100],
              'SVM__gamma': [0.1, 0.01]}
I see two differences with respect to your code:
First, when defining the steps in the pipeline instantiation, you write StandardScaler without parentheses.
Your code is:
pipe=Pipeline([("scaler", StandardScaler), ("svm", SVC())])
In my code I write StandardScaler(), look:
steps = [('scaler', StandardScaler()), ('SVM', SVC())]
pipeline = Pipeline(steps)
The second difference is in the keys of your parameter grid: you provide 'svm_C' and 'svm_gamma' instead of 'SVM__C' and 'SVM__gamma'. You need a double underscore (__) between the step name and the parameter name, as the first person who answered your question said.
So, instead of writing:
parameter={'svm_C':(0.1, 1, 10, 100), 'svm_gamma':(0.001, 0.01, 0.1, 10)}
you should write:
parameters = {'SVM__C':[1, 10, 100],'SVM__gamma':[0.1, 0.01]}
I even got an error writing 'svm__c' instead of 'SVM__C' and 'svm__gamma' instead of 'SVM__gamma'. That is because the names are case-sensitive: the prefix must match the step name exactly ('SVM' in my pipeline), and C and gamma are the exact, case-sensitive parameter names of SVC.
I hope it helps.
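Putting both fixes together, a minimal sketch that keeps your original step name "svm" and data (xtrain/ytrain1 are your variables):
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Instantiate the scaler (with parentheses) and keep the step name "svm".
pipe = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])
# Prefix each hyperparameter with the step name plus a double underscore.
parameter = {"svm__C": (0.1, 1, 10, 100), "svm__gamma": (0.001, 0.01, 0.1, 10)}
grid = GridSearchCV(pipe, parameter, cv=3, n_jobs=-1)
grid.fit(xtrain, ytrain1)
print(grid.best_params_)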
