Error CUBLAS_STATUS_NOT_INITIALIZED with nn.Linear - pytorch

import torch
from torch import nn
from transformers.models.bert.modeling_bert import BertConfig

config = BertConfig.from_pretrained("bert-base-uncased")
self.x_head = nn.Linear(in_features=config.hidden_size, out_features=28)
word_embeddings = nn.Embedding(num_embeddings=979, embedding_dim=config.hidden_size, padding_idx=0)(ids_1).to(self.device)
vis_embeddings = nn.Embedding(num_embeddings=127, embedding_dim=config.hidden_size, padding_idx=0)(ids_2).to(self.device)
input_embeds = torch.cat([word_embeddings, vis_embeddings], dim=1).to(self.device)
In the forward() method I have the following code:
x_scores = self.x_head(input_embeds)
I get the error below at the x_scores assignment (the last line above):
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
166 return self.module(*inputs[0], **kwargs[0])
167 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 168 outputs = self.parallel_apply(replicas, inputs, kwargs)
169 return self.gather(outputs, self.output_device)
170
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
176
177 def parallel_apply(self, replicas, inputs, kwargs):
--> 178 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
179
180 def gather(self, outputs, output_device):
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
84 output = results[i]
85 if isinstance(output, ExceptionWrapper):
---> 86 output.reraise()
87 outputs.append(output)
88 return outputs
~/pytorch-env_py3.7/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
423 # have message field
424 raise self.exc_type(message=msg)
--> 425 raise self.exc_type(msg)
426
427
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "", line 56, in forward
x_scores = self.x_head(input_embeds)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 96, in forward
return F.linear(input, self.weight, self.bias)
File "/home/gems/pytorch-env_py3.7/lib/python3.7/site-packages/torch/nn/functional.py", line 1847, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling cublasCreate(handle)
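For context, here is a hedged sketch of how the pieces described in the question are usually wired together, with the embedding layers registered as sub-modules in __init__ so that .to(device) and nn.DataParallel place their weights on the same GPU as x_head; the class name and forward signature are illustrative, not the asker's actual code, and this is not presented as the confirmed fix:
import torch
from torch import nn
from transformers.models.bert.modeling_bert import BertConfig

class XHeadModel(nn.Module):  # hypothetical name, for illustration only
    def __init__(self):
        super().__init__()
        config = BertConfig.from_pretrained("bert-base-uncased")
        # Registering the embeddings as sub-modules lets .to(device) /
        # nn.DataParallel move their weights together with x_head.
        self.word_embeddings = nn.Embedding(979, config.hidden_size, padding_idx=0)
        self.vis_embeddings = nn.Embedding(127, config.hidden_size, padding_idx=0)
        self.x_head = nn.Linear(in_features=config.hidden_size, out_features=28)

    def forward(self, ids_1, ids_2):
        # ids_1 / ids_2 must be int64 index tensors within range of num_embeddings
        word_embeddings = self.word_embeddings(ids_1)
        vis_embeddings = self.vis_embeddings(ids_2)
        input_embeds = torch.cat([word_embeddings, vis_embeddings], dim=1)
        return self.x_head(input_embeds)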

Related

How to run torchinfo on BertClassifier?

I want to run torchinfo on BertClassifier but can't do it without errors:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        vEmbeddingToken, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

torchinfo.summary(BertClassifier(), ((4, 512), (4, 1, 512)))
Getting error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torchinfo/torchinfo.py:272, in forward_pass(model, x, batch_dim, cache_forward_pass, device, **kwargs)
271 if isinstance(x, (list, tuple)):
--> 272 _ = model.to(device)(*x, **kwargs)
273 elif isinstance(x, dict):
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torch/nn/modules/module.py:1110, in Module._call_impl(self, *input, **kwargs)
1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1109 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110 return forward_call(*input, **kwargs)
1111 # Do not call functions when jit is used
Input In [24], in BertClassifier.forward(self, input_id, mask)
12 def forward(self, input_id, mask):
13
14 #
15 # pooled_output - embedding vector of [CLS] token
16 #
---> 17 vEmbeddingToken, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
18 dropout_output = self.dropout(pooled_output)
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torch/nn/modules/module.py:1128, in Module._call_impl(self, *input, **kwargs)
1126 input = bw_hook.setup_input_hook(input)
-> 1128 result = forward_call(*input, **kwargs)
1129 if _global_forward_hooks or self._forward_hooks:
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py:1010, in BertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1008 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-> 1010 embedding_output = self.embeddings(
1011 input_ids=input_ids,
1012 position_ids=position_ids,
1013 token_type_ids=token_type_ids,
1014 inputs_embeds=inputs_embeds,
1015 past_key_values_length=past_key_values_length,
1016 )
1017 encoder_outputs = self.encoder(
1018 embedding_output,
1019 attention_mask=extended_attention_mask,
(...)
1027 return_dict=return_dict,
1028 )
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torch/nn/modules/module.py:1128, in Module._call_impl(self, *input, **kwargs)
1126 input = bw_hook.setup_input_hook(input)
-> 1128 result = forward_call(*input, **kwargs)
1129 if _global_forward_hooks or self._forward_hooks:
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py:235, in BertEmbeddings.forward(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)
234 if inputs_embeds is None:
--> 235 inputs_embeds = self.word_embeddings(input_ids)
236 token_type_embeddings = self.token_type_embeddings(token_type_ids)
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torch/nn/modules/module.py:1128, in Module._call_impl(self, *input, **kwargs)
1126 input = bw_hook.setup_input_hook(input)
-> 1128 result = forward_call(*input, **kwargs)
1129 if _global_forward_hooks or self._forward_hooks:
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torch/nn/modules/sparse.py:158, in Embedding.forward(self, input)
157 def forward(self, input: Tensor) -> Tensor:
--> 158 return F.embedding(
159 input, self.weight, self.padding_idx, self.max_norm,
160 self.norm_type, self.scale_grad_by_freq, self.sparse)
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torch/nn/functional.py:2183, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2182 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2183 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
Input In [25], in <cell line: 1>()
----> 1 torchinfo.summary(BertClassifier(), ((4, 512),(4, 1, 512)))
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torchinfo/torchinfo.py:201, in summary(model, input_size, input_data, batch_dim, cache_forward_pass, col_names, col_width, depth, device, dtypes, row_settings, verbose, **kwargs)
196 validate_user_params(input_data, input_size, columns, col_width, verbose)
198 x, correct_input_size = process_input(
199 input_data, input_size, batch_dim, device, dtypes
200 )
--> 201 summary_list = forward_pass(
202 model, x, batch_dim, cache_forward_pass, device, **kwargs
203 )
204 formatting = FormattingOptions(depth, verbose, columns, col_width, rows)
205 results = ModelStatistics(
206 summary_list, correct_input_size, get_total_memory_used(x), formatting
207 )
File ~/abWrk/abWrkVenv/lib/python3.8/site-packages/torchinfo/torchinfo.py:281, in forward_pass(model, x, batch_dim, cache_forward_pass, device, **kwargs)
279 except Exception as e:
280 executed_layers = [layer for layer in summary_list if layer.executed]
--> 281 raise RuntimeError(
282 "Failed to run torchinfo. See above stack traces for more details. "
283 f"Executed layers up to: {executed_layers}"
284 ) from e
285 finally:
286 if hooks is not None:
RuntimeError: Failed to run torchinfo. See above stack traces for more details. Executed layers up to: []
How can I use torchinfo on BertClassifier?
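One hedged sketch that addresses the failure above: the traceback ends in the embedding lookup, which receives the random float tensors torchinfo generates from input_size by default. The summary() signature shown in the traceback accepts a dtypes argument, so forcing integer inputs (the values below are illustrative) should let the forward pass run:
import torch
import torchinfo

model = BertClassifier()
torchinfo.summary(
    model,
    input_size=((4, 512), (4, 1, 512)),
    dtypes=[torch.long, torch.long],  # token ids and mask must be integer tensors
)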

Why does ndcg_score result in nan values?

Consider the following code:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ndcg_score, make_scorer
from sklearn.svm import SVC

X_data = pd.DataFrame(np.random.randint(0, 1, size=(100, 4)), columns=list('ABCD'))
X_data = sp.csr_matrix(X_data.to_numpy())
Y_data = pd.DataFrame(np.random.choice([0, 1, 5], 100), columns=['Y'])

# Set the parameters by cross-validation
param_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
              'C': [1, 10, 100, 1000]}

clf = GridSearchCV(SVC(), param_grid, scoring=ndcg_score, refit=True, verbose=3, n_jobs=-1, error_score='raise')
test = clf.fit(X_data, Y_data)
I am wondering why this would raise the following error:
Fitting 5 folds for each of 8 candidates, totalling 40 fits
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\externals\loky\process_executor.py", line 431, in _process_worker
r = call_item()
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\externals\loky\process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\metrics\_ranking.py", line 1564, in ndcg_score
y_true = check_array(y_true, ensure_2d=False)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\test\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py", line 710, in check_array
array = array.astype(np.float64)
TypeError: float() argument must be a string or a number, not 'SVC'
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-45-93a8890b095c> in <module>
18
19 clf = GridSearchCV(SVC(), param_grid, scoring=ndcg_score, refit=True, verbose=3, n_jobs=-1, error_score='raise')
---> 20 test = clf.fit(X_data, Y_data)
21 #print(test.best_score_)
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~\Anaconda3\envs\kaggleSVM\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~\Anaconda3\envs\kaggleSVM\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~\Anaconda3\envs\kaggleSVM\lib\concurrent\futures\_base.py in result(self, timeout)
442 raise CancelledError()
443 elif self._state == FINISHED:
--> 444 return self.__get_result()
445 else:
446 raise TimeoutError()
~\Anaconda3\envs\kaggleSVM\lib\concurrent\futures\_base.py in __get_result(self)
387 if self._exception:
388 try:
--> 389 raise self._exception
390 finally:
391 # Break a reference cycle with the exception in self._exception
TypeError: float() argument must be a string or a number, not 'SVC'
I am not quite sure why this would result in a TypeError.
I cannot reproduce the error you are reporting, but using error_score="raise" and n_jobs=1 (not strictly necessary, but it makes the output a little easier to read), and wrapping ndcg_score with make_scorer(..., needs_proba=True), I get this one:
Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead
which supports my first comment: NDCG assumes multilabel format. That suggests you need to understand whether NDCG is really appropriate for your task, and if so either turn your problem into a multilabel one or write a custom scorer that converts the multiclass output into a multilabel (one-hot encoded) one before computing the score.
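To make that last suggestion concrete, here is a hedged sketch of such a custom scorer; names are illustrative, and it assumes needs_proba=True so the scorer receives per-class probabilities, which in turn requires SVC(probability=True):
import numpy as np
from sklearn.metrics import make_scorer, ndcg_score
from sklearn.preprocessing import label_binarize

def ndcg_multiclass(y_true, y_proba, classes=(0, 1, 5)):
    # One-hot encode the multiclass ground truth so ndcg_score sees a
    # multilabel-indicator matrix of the same shape as the probabilities.
    y_true_onehot = label_binarize(np.ravel(y_true), classes=list(classes))
    return ndcg_score(y_true_onehot, y_proba)

ndcg_scorer = make_scorer(ndcg_multiclass, needs_proba=True)
# e.g. GridSearchCV(SVC(probability=True), param_grid, scoring=ndcg_scorer, ...)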

Hugging face - RuntimeError: Caught RuntimeError in replica 0 on device 0 on Azure Databricks

How do I run the run_language_modeling.py script from Hugging Face, using the pretrained RoBERTa base model, to fine-tune on my own data on Azure Databricks with a GPU cluster?
Using Transformers versions 2.9.1 and 3.0.
Python 3.6
Torch 1.5.0
torchvision 0.6
This is the command I ran on Azure Databricks:
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm
This is the error I get after running the above command.
/dbfs/FileStore/tables/dev/run_language_modeling.py in <module>
279
280 if __name__ == "__main__":
--> 281 main()
/dbfs/FileStore/tables/dev/run_language_modeling.py in main()
243 else None
244 )
--> 245 trainer.train(model_path=model_path)
246 trainer.save_model()
247 # For convenience, we also re-save the tokenizer to the same directory,
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in train(self, model_path)
497 continue
498
--> 499 tr_loss += self._training_step(model, inputs, optimizer)
500
501 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
/databricks/python/lib/python3.7/site-packages/transformers/trainer.py in _training_step(self, model, inputs, optimizer)
620 inputs["mems"] = self._past
621
--> 622 outputs = model(**inputs)
623 loss = outputs[0] # model outputs are always tuple in transformers (see doc)
624
/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
548 result = self._slow_forward(*input, **kwargs)
549 else:
--> 550 result = self.forward(*input, **kwargs)
551 for hook in self._forward_hooks.values():
552 hook_result = hook(self, input, result)
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
153 return self.module(*inputs[0], **kwargs[0])
154 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 155 outputs = self.parallel_apply(replicas, inputs, kwargs)
156 return self.gather(outputs, self.output_device)
157
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
163
164 def parallel_apply(self, replicas, inputs, kwargs):
--> 165 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
166
167 def gather(self, outputs, output_device):
/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
83 output = results[i]
84 if isinstance(output, ExceptionWrapper):
---> 85 output.reraise()
86 outputs.append(output)
87 return outputs
/databricks/python/lib/python3.7/site-packages/torch/_utils.py in reraise(self)
393 # (https://bugs.python.org/issue2651), so we work around it.
394 msg = KeyErrorMessage(msg)
--> 395 raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/databricks/python/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_roberta.py", line 239, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 762, in forward
output_hidden_states=output_hidden_states,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 439, in forward
output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 371, in forward
hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 315, in forward
hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
File "/databricks/python/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/transformers/modeling_bert.py", line 240, in forward
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 11.17 GiB total capacity; 10.68 GiB already allocated; 95.31 MiB free; 10.77 GiB reserved in total by PyTorch)
How do I resolve this?
The out-of-memory error is likely caused by not cleaning up the session and/or not freeing the GPU.
From a similar GitHub issue:
It happens because a mini-batch of data does not fit in GPU memory. Just decrease the batch size. When I set batch size = 256 for the CIFAR-10 dataset I got the same error; when I then set the batch size to 128, it was solved.
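As a hedged illustration, the same command with an explicit, smaller batch size and gradient accumulation; the exact flag names depend on the Transformers version (per_gpu_train_batch_size was the Trainer argument name in the 2.x/3.0 releases), and the values here are only examples:
%run '/dbfs/FileStore/tables/dev/run_language_modeling.py' \
--output_dir='/dbfs/FileStore/tables/final_train/models/roberta_base_reduce_n' \
--model_type=roberta \
--model_name_or_path=roberta-base \
--do_train \
--num_train_epochs 5 \
--per_gpu_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--train_data_file='/dbfs/FileStore/tables/final_train/train_data/all_data_desc_list_full.txt' \
--mlm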

Getting AttributeError while training a DNN classifier

Here is the code:
continous=['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Creating feat_cols
I have automated the process with a for loop:
feat_cols = []
for col in census.columns:
    ## Continous - Unchanged
    if col in continous:
        feat_cols.append(tf.feature_column.numeric_column(col))
    ## Categorical - Trick, no need to count
    elif col != 'income_bracket':
        print('Embedded {}'.format(col))
        feat_cols.append(tf.feature_column.embedding_column(categorical_column=col, dimension=X_train[col].nunique()))
Creating the model
After importing TensorFlow, I've created this model:
dnnmodel = tf.estimator.DNNClassifier(hidden_units=[7,7,7], feature_columns=feat_cols, n_classes=2)
ip_dnn = tf.estimator.inputs.pandas_input_fn(X_train, y_train, num_epochs=None,shuffle=True)
dnnmodel.train(input_fn=ip_dnn, steps=5000)
Error:
Earlier with LinearClassifier everything worked fine.
INFO:tensorflow:Calling model_fn.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-67-751f0a94d0d2> in <module>()
1 # ip_func created earliar
2
----> 3 dnnmodel.train(input_fn=ip_dnn, steps=5000)
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in train(self, input_fn, hooks, steps, max_steps, saving_listeners)
356
357 saving_listeners = _check_listeners_type(saving_listeners)
--> 358 loss = self._train_model(input_fn, hooks, saving_listeners)
359 logging.info('Loss for final step: %s.', loss)
360 return self
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _train_model(self, input_fn, hooks, saving_listeners)
1122 return self._train_model_distributed(input_fn, hooks, saving_listeners)
1123 else:
-> 1124 return self._train_model_default(input_fn, hooks, saving_listeners)
1125
1126 def _train_model_default(self, input_fn, hooks, saving_listeners):
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _train_model_default(self, input_fn, hooks, saving_listeners)
1152 worker_hooks.extend(input_hooks)
1153 estimator_spec = self._call_model_fn(
-> 1154 features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
1155 global_step_tensor = training_util.get_global_step(g)
1156 return self._train_with_estimator_spec(estimator_spec, worker_hooks,
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py in _call_model_fn(self, features, labels, mode, config)
1110
1111 logging.info('Calling model_fn.')
-> 1112 model_fn_results = self._model_fn(features=features, **kwargs)
1113 logging.info('Done calling model_fn.')
1114
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in _model_fn(features, labels, mode, config)
520 input_layer_partitioner=input_layer_partitioner,
521 config=config,
--> 522 batch_norm=batch_norm)
523
524 super(DNNClassifier, self).__init__(
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in _dnn_model_fn(features, labels, mode, head, hidden_units, feature_columns, optimizer, activation_fn, dropout, input_layer_partitioner, config, use_tpu, batch_norm)
285 input_layer_partitioner=input_layer_partitioner,
286 batch_norm=batch_norm)
--> 287 logits = logit_fn(features=features, mode=mode)
288
289 if use_tpu:
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in dnn_logit_fn(features, mode)
101 batch_norm,
102 name='dnn')
--> 103 return dnn_model(features, mode)
104
105 return dnn_logit_fn
~\Anaconda3\lib\site-packages\tensorflow\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
552 # In graph mode, failure to build the layer's graph
553 # implies a user-side bug. We don't catch exceptions.
--> 554 outputs = self.call(inputs, *args, **kwargs)
555 else:
556 try:
~\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py in call(self, features, mode)
193 'input_from_feature_columns',
194 partitioner=self._input_layer_partitioner):
--> 195 net = self._input_layer(features)
196 for i in range(len(self._hidden_layers)):
197 net = self._hidden_layers[i](net)
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in __call__(self, features)
335 trainable=self._trainable,
336 cols_to_vars=None,
--> 337 from_template=True)
338
339 @property
~\Anaconda3\lib\site-packages\tensorflow\python\ops\template.py in __call__(self, *args, **kwargs)
366 custom_getter=self._custom_getter) as vs:
367 self._variable_scope = vs
--> 368 return self._call_func(args, kwargs)
369
370 @property
~\Anaconda3\lib\site-packages\tensorflow\python\ops\template.py in _call_func(self, args, kwargs)
309 # Checkpointable).
310 with checkpointable_util.capture_dependencies(template=self):
--> 311 result = self._func(*args, **kwargs)
312
313 if self._variables_created:
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _internal_input_layer(features, feature_columns, weight_collections, trainable, cols_to_vars, scope, cols_to_output_tensors, from_template)
179 """See input_layer. `scope` is a name or variable scope to use."""
180
--> 181 feature_columns = _normalize_feature_columns(feature_columns)
182 for column in feature_columns:
183 if not isinstance(column, _DenseColumn):
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py in _normalize_feature_columns(feature_columns)
2266 name_to_column = dict()
2267 for column in feature_columns:
-> 2268 if column.name in name_to_column:
2269 raise ValueError('Duplicate feature column name found for columns: {} '
2270 'and {}. This usually means that these columns refer to '
~\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column_v2.py in name(self)
2960 def name(self):
2961 """See `FeatureColumn` base class."""
-> 2962 return '{}_embedding'.format(self.categorical_column.name)
2963
2964 @property
AttributeError: 'str' object has no attribute 'name'
originally defined at:
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py", line 102, in dnn_logit_fn
name='dnn')
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\dnn.py", line 134, in __init__
create_scope_now=False)
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow\python\feature_column\feature_column.py", line 327, in __init__
self._name, _internal_input_layer, create_scope_now_=create_scope_now)
File "C:\Users\Subham\Anaconda3\lib\site-packages\tensorflow\python\ops\template.py", line 154, in make_template
**kwargs)
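The traceback ends with embedding_column trying to read .name from a plain string, which matches passing the column name col rather than a categorical-column object. Here is a hedged sketch of the usual construction, assuming the same TF 1.x feature-column API as in the question; the hash bucket size is illustrative:
import tensorflow as tf

feat_cols = []
for col in census.columns:
    if col in continous:
        feat_cols.append(tf.feature_column.numeric_column(col))
    elif col != 'income_bracket':
        # embedding_column expects a categorical column object, not a string
        cat_col = tf.feature_column.categorical_column_with_hash_bucket(
            col, hash_bucket_size=1000)
        feat_cols.append(tf.feature_column.embedding_column(
            categorical_column=cat_col, dimension=X_train[col].nunique()))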

Dask failing with/due to Tornado error 'too many files open'

I am running Jupyter notebook launched from Anaconda. When trying to initialize a distributed Dask environment the following Tornado package error is thrown:
tornado.application - ERROR - Multiple exceptions in yield list
Traceback (most recent call last):
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 883, in callback
result_list.append(f.result())
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1141, in run
yielded = self.gen.throw(*exc_info)
File "/anaconda3/lib/python3.7/site-packages/distributed/deploy/local.py", line 208, in _start_worker
yield w._start()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1141, in run
yielded = self.gen.throw(*exc_info)
File "/anaconda3/lib/python3.7/site-packages/distributed/nanny.py", line 157, in _start
response = yield self.instantiate()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1141, in run
yielded = self.gen.throw(*exc_info)
File "/anaconda3/lib/python3.7/site-packages/distributed/nanny.py", line 226, in instantiate
self.process.start()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/anaconda3/lib/python3.7/site-packages/distributed/nanny.py", line 351, in start
self.init_result_q = init_q = mp_context.Queue()
File "/anaconda3/lib/python3.7/multiprocessing/context.py", line 102, in Queue
return Queue(maxsize, ctx=self.get_context())
File "/anaconda3/lib/python3.7/multiprocessing/queues.py", line 41, in __init__
self._reader, self._writer = connection.Pipe(duplex=False)
File "/anaconda3/lib/python3.7/multiprocessing/connection.py", line 517, in Pipe
fd1, fd2 = os.pipe()
OSError: [Errno 24] Too many open files
tornado.application - ERROR - Multiple exceptions in yield list
Traceback (most recent call last):
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 883, in callback
result_list.append(f.result())
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1141, in run
yielded = self.gen.throw(*exc_info)
File "/anaconda3/lib/python3.7/site-packages/distributed/deploy/local.py", line 208, in _start_worker
yield w._start()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 1133, in run
value = future.result()
File "/anaconda3/lib/python3.7/site-packages/tornado/gen.py", line 326, in wrapper
yielded = next(result)
File "/anaconda3/lib/python3.7/site-packages/distributed/nanny.py", line 143, in _start
listen_args=self.listen_args)
File "/anaconda3/lib/python3.7/site-packages/distributed/core.py", line 272, in listen
self.listener.start()
File "/anaconda3/lib/python3.7/site-packages/distributed/comm/tcp.py", line 396, in start
backlog=backlog)
File "/anaconda3/lib/python3.7/site-packages/tornado/netutil.py", line 134, in bind_sockets
sock = socket.socket(af, socktype, proto)
File "/anaconda3/lib/python3.7/socket.py", line 151, in __init__
_socket.socket.__init__(self, family, type, proto, fileno)
OSError: [Errno 24] Too many open files
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<timed exec> in <module>
/anaconda3/lib/python3.7/site-packages/distributed/client.py in __init__(self, address, loop, timeout, set_as_default, scheduler_file, security, asynchronous, name, heartbeat_interval, serializers, deserializers, extensions, direct_to_workers, **kwargs)
634 ext(self)
635
--> 636 self.start(timeout=timeout)
637
638 from distributed.recreate_exceptions import ReplayExceptionClient
/anaconda3/lib/python3.7/site-packages/distributed/client.py in start(self, **kwargs)
757 self._started = self._start(**kwargs)
758 else:
--> 759 sync(self.loop, self._start, **kwargs)
760
761 def __await__(self):
/anaconda3/lib/python3.7/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
/anaconda3/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
/anaconda3/lib/python3.7/site-packages/distributed/utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
/anaconda3/lib/python3.7/site-packages/distributed/client.py in _start(self, timeout, **kwargs)
820 self.cluster = LocalCluster(loop=self.loop, asynchronous=True,
821 **self._startup_kwargs)
--> 822 yield self.cluster
823 except (OSError, socket.error) as e:
824 if e.errno != errno.EADDRINUSE:
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
/anaconda3/lib/python3.7/asyncio/tasks.py in _wrap_awaitable(awaitable)
601 that will later be wrapped in a Task by ensure_future().
602 """
--> 603 return (yield from awaitable.__await__())
604
605
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
/anaconda3/lib/python3.7/site-packages/distributed/deploy/local.py in _start(self, ip, n_workers)
189 self.scheduler.start(scheduler_address)
190
--> 191 yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)]
192
193 self.status = 'running'
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in callback(f)
881 for f in children:
882 try:
--> 883 result_list.append(f.result())
884 except Exception as e:
885 if future.done():
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
/anaconda3/lib/python3.7/site-packages/distributed/deploy/local.py in _start_worker(self, death_timeout, **kwargs)
206 death_timeout=death_timeout,
207 silence_logs=self.silence_logs, **kwargs)
--> 208 yield w._start()
209
210 self.workers.append(w)
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
/anaconda3/lib/python3.7/site-packages/distributed/nanny.py in _start(self, addr_or_port)
155
156 logger.info(' Start Nanny at: %r', self.address)
--> 157 response = yield self.instantiate()
158 if response == 'running':
159 assert self.worker_address
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
/anaconda3/lib/python3.7/site-packages/distributed/nanny.py in instantiate(self, comm)
224 result = yield gen.with_timeout(
225 timedelta(seconds=self.death_timeout),
--> 226 self.process.start()
227 )
228 except gen.TimeoutError:
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
/anaconda3/lib/python3.7/site-packages/tornado/gen.py in wrapper(*args, **kwargs)
324 try:
325 orig_stack_contexts = stack_context._state.contexts
--> 326 yielded = next(result)
327 if stack_context._state.contexts is not orig_stack_contexts:
328 yielded = _create_future()
/anaconda3/lib/python3.7/site-packages/distributed/nanny.py in start(self)
350
351 self.init_result_q = init_q = mp_context.Queue()
--> 352 self.child_stop_q = mp_context.Queue()
353 uid = uuid.uuid4().hex
354
/anaconda3/lib/python3.7/multiprocessing/context.py in Queue(self, maxsize)
100 '''Returns a queue object'''
101 from .queues import Queue
--> 102 return Queue(maxsize, ctx=self.get_context())
103
104 def JoinableQueue(self, maxsize=0):
/anaconda3/lib/python3.7/multiprocessing/queues.py in __init__(self, maxsize, ctx)
39 from .synchronize import SEM_VALUE_MAX as maxsize
40 self._maxsize = maxsize
---> 41 self._reader, self._writer = connection.Pipe(duplex=False)
42 self._rlock = ctx.Lock()
43 self._opid = os.getpid()
/anaconda3/lib/python3.7/multiprocessing/connection.py in Pipe(duplex)
515 c2 = Connection(s2.detach())
516 else:
--> 517 fd1, fd2 = os.pipe()
518 c1 = Connection(fd1, writable=False)
519 c2 = Connection(fd2, readable=False)
OSError: [Errno 24] Too many open files
The problem seems to be with Tornado, as alluded to here: https://github.com/dask/distributed/issues/1941. My version of Anaconda has Tornado 5.1.1 with Python 3.7.3 and Dask 1.25.1.
This is the code that is being run:
%%time
import pandas as pd
import dask.dataframe as dd
import dask.distributed as dist
client = dist.Client()
Several weeks ago I was able to run some small distributed Dask examples, and I can still run Dask successfully without invoking a Client. If the problem is Tornado, is there a workaround?
In case anyone was wondering about macOS process file limits:
I ran into the same issue with the master branch yesterday (5/20/2019) and found this: https://github.com/dask/distributed/issues/733. For me, I simply looked at the underlying script for dask_scheduler and replicated it in PyCharm:
from distributed.cli.dask_scheduler import go
if __name__ == '__main__':
    go()
It starts up, is stable, and I've attached a worker to it from the command line.
I found a workaround. On macOS it looks like changing file limits from the command line only affects launches from that terminal; furthermore, if you restart, the limits are reset to their original values (256 was the default on my machine). To set the limits properly you have to create a file 'limit.maxfiles.plist' in /Library/LaunchDaemons and restart. I got this from here. This works around the 'too many files' error but likely only postpones the problem with Tornado.
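As a hedged, Python-side complement to the plist approach, the per-process descriptor limit can be checked and (up to the hard limit) raised before starting the Client; whether that is enough depends on how many workers and connections the local cluster opens, and the target value below is illustrative:
import resource

# Inspect the current per-process open-file limits.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open-file limit: soft={soft}, hard={hard}")

# Raise the soft limit toward the hard limit before creating dist.Client().
resource.setrlimit(resource.RLIMIT_NOFILE, (min(4096, hard), hard))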
