Could not set Native Handle Error Tensorflow - python-3.x

I'm in the middle of training my net on tensorflow and I get this error at some part of the training process, though I can not tell exactly what is causing it because it will always happen at a seemingly random training sample at a more or less random epoch of training. I'm at my wits end with this and would like it if someone could shed light on it below.
Traceback (most recent call last):
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1361, in _do_call
return fn(*args)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1340, in _run_fn
target_list, status, run_metadata)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not set native handle, in file tensorflow/core/kernels/mkl_conv_ops.cc:652
[[Node: SRGAN_g/Conv4_4/Conv2D = _MklConv2D[T=DT_FLOAT, _kernel="MklOp", data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SRGAN_g/pixelshufflerx2/1/4/Relu, SRGAN_g/Conv4/W_conv2d/read, SRGAN_g/pixelshufflerx2/1/4/Relu:1, DMT/_132)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 289, in <module>
train()
File "main.py", line 149, in train
errM, _ = sess.run([mse_loss, g_optim_init], {t_image: b_imgs_96, t_target_image: b_imgs_384})
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 905, in run
run_metadata_ptr)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1137, in _run
feed_dict_tensor, options, run_metadata)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1355, in _do_run
options, run_metadata)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py", line 1374, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.AbortedError: Operation received an exception:Status: 3, message: could not set native handle, in file tensorflow/core/kernels/mkl_conv_ops.cc:652
[[Node: SRGAN_g/Conv4_4/Conv2D = _MklConv2D[T=DT_FLOAT, _kernel="MklOp", data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SRGAN_g/pixelshufflerx2/1/4/Relu, SRGAN_g/Conv4/W_conv2d/read, SRGAN_g/pixelshufflerx2/1/4/Relu:1, DMT/_132)]]
Caused by op 'SRGAN_g/Conv4_4/Conv2D', defined at:
File "main.py", line 289, in <module>
train()
File "main.py", line 53, in train
net_g = SRGAN_g(t_image, is_train=True, reuse=False)
File "/global/home/hpc4142/srgan-master_TF/model.py", line 51, in SRGAN_g
recursive_layers[i] = Conv2dReuse(recursive_layers[i], 256, (3, 3), (1, 1), act=None, padding='SAME', W_init=w_init, name='n256s1/2/%s' %i, group = "Conv4")
File "/global/home/hpc4142/srgan-master_TF/nu_layers.py", line 461, in Conv2dReuse
group = group)
File "/global/home/hpc4142/srgan-master_TF/nu_layers.py", line 270, in __init__
self.outputs = act( tf.nn.conv2d(self.inputs, W, strides=strides, padding=padding, use_cudnn_on_gpu=use_cudnn_on_gpu, data_format=data_format) + b )
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 631, in conv2d
data_format=data_format, dilations=dilations, name=name)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3271, in create_op
op_def=op_def)
File "/global/home/hpc4142/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1650, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
AbortedError (see above for traceback): Operation received an exception:Status: 3, message: could not set native handle, in file tensorflow/core/kernels/mkl_conv_ops.cc:652
[[Node: SRGAN_g/Conv4_4/Conv2D = _MklConv2D[T=DT_FLOAT, _kernel="MklOp", data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](SRGAN_g/pixelshufflerx2/1/4/Relu, SRGAN_g/Conv4/W_conv2d/read, SRGAN_g/pixelshufflerx2/1/4/Relu:1, DMT/_132)]]

Related

Error using RandomizedSearchCV on a LSTM model

I want to find the optimal number of neurons and layers for a LSTM model and I have created the following:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=[20,11]):
model = keras.models.Sequential()
options = {"input_shape": input_shape}
for layer in range(n_hidden):
model.add(keras.layers.Dense(n_neurons, activation="swish", **options))
options = {}
model.add(keras.layers.Dense(10, **options))
#optimizer = keras.optimizers.SGD(learning_rate)
model.compile(loss="mse", optimizer='adam')
return model
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV
param_distribs = {
"n_hidden": [0, 1, 2, 3],
"n_neurons": np.arange(1, 100),
#"learning_rate": reciprocal(3e-4, 3e-2),
}
rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs, n_iter=10)
rnd_search_cv.fit(x_tr, y_tr , epochs=100,
callbacks=[keras.callbacks.EarlyStopping(patience=10)])
The dataset used is composed of 11 features and in this model it looks for the 20-in previous times steps and aims to predict the following 10 time steps. When I run the following code I have the following error:
> /home/use/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning:
50 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
> --------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
File "/home/use/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py", line 175, in fit
history = self.model.fit(x, y, **fit_args)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/home/use/anaconda3/lib/python3.9/site-packages/tensorflow/python/eager/execute.py", line 52, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.InvalidArgumentError: Graph execution error:
Detected at node 'gradient_tape/mean_squared_error/BroadcastGradientArgs' defined at (most recent call last):
File "/home/use/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/use/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
app.launch_new_instance()
File "/home/use/anaconda3/lib/python3.9/site-packages/traitlets/config/application.py", line 846, in launch_instance
app.start()
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 712, in start
self.io_loop.start()
File "/home/use/anaconda3/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "/home/use/anaconda3/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
self._run_once()
File "/home/use/anaconda3/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
handle._run()
File "/home/use/anaconda3/lib/python3.9/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
await self.process_one()
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 499, in process_one
await dispatch(*args)
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
await result
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
reply_content = await reply_content
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 390, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/home/use/anaconda3/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
return super().run_cell(*args, **kwargs)
File "/home/use/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2914, in run_cell
result = self._run_cell(
File "/home/use/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2960, in _run_cell
return runner(coro)
File "/home/use/anaconda3/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
coro.send(None)
File "/home/use/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3185, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "/home/use/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3377, in run_ast_nodes
if (await self.run_code(code, result, async_=asy)):
File "/home/use/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3457, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/tmp/ipykernel_10055/1251208469.py", line 9, in <module>
rnd_search_cv.fit(x_tr, y_tr , epochs=100,
File "/home/use/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py", line 891, in fit
self._run_search(evaluate_candidates)
File "/home/use/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py", line 1766, in _run_search
evaluate_candidates(
File "/home/use/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py", line 838, in evaluate_candidates
out = parallel(
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
if self.dispatch_one_batch(iterator):
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
self._dispatch(tasks)
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 779, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/home/use/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/home/use/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "/home/use/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/wrappers/scikit_learn.py", line 175, in fit
history = self.model.fit(x, y, **fit_args)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
return fn(*args, **kwargs)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1650, in fit
tmp_logs = self.train_function(iterator)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function
return step_function(self, iterator)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step
outputs = model.train_step(data)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1027, in train_step
self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 526, in minimize
grads_and_vars = self.compute_gradients(loss, var_list, tape)
File "/home/use/anaconda3/lib/python3.9/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 259, in compute_gradients
grads = tape.gradient(loss, var_list)
Node: 'gradient_tape/mean_squared_error/BroadcastGradientArgs'
Incompatible shapes: [32,20,10] vs. [32,10]
[[{{node gradient_tape/mean_squared_error/BroadcastGradientArgs}}]] [Op:__inference_train_function_1033752]

self._traceback = tf_stack.extract_stack()

I am training a custom ssd_mobilenet_v2_quantized_300x300 TensorFlow model for object detection using Google Colab with the downgraded version of TensorFlow 1.15.2 because I use to train my model on previous version of TensorFlow i.e. 1.14.0 but due to the latest update to version 2.2.0, I get the strange errors and therefore I can't use the latest version.
Using 1.15.2 version and selection even batch size of 8 I successfully starts the training process but after some time, the training process stops with the following errors.
TypeError: 'numpy.float64' object cannot be interpreted as an integer
self._traceback = tf_stack.extract_stack()
My complete training log is as follows;
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/evaluation.py", line 272, in _evaluate_once
session.run(eval_ops, feed_dict)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1259, in run
run_metadata=run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1360, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python3.6/dist-packages/six.py", line 693, in reraise
raise value
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1345, in run
return self._sess.run(*args, **kwargs)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1418, in run
run_metadata=run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1176, in run
return self._sess.run(*args, **kwargs)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1180, in _run
feed_dict_tensor, options, run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1359, in _do_run
run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.OutOfRangeError: 2 root error(s) found.
(0) Out of range: End of sequence
[[node IteratorGetNext (defined at tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py:1748) ]]
(1) Out of range: End of sequence
[[node IteratorGetNext (defined at tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py:1748) ]]
[[Postprocessor/BatchMultiClassNonMaxSuppression/MultiClassNonMaxSuppression/non_max_suppression_with_scores_1/NonMaxSuppressionV5/_4683]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'IteratorGetNext':
File "content/models/research/object_detection/model_main.py", line 114, in <module>
tf.app.run()
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "usr/local/lib/python3.6/dist-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "usr/local/lib/python3.6/dist-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "content/models/research/object_detection/model_main.py", line 110, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1195, in _train_model_default
saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1494, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1259, in run
run_metadata=run_metadata)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1345, in run
return self._sess.run(*args, **kwargs)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1426, in run
run_metadata=run_metadata))
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 594, in after_run
if self._save(run_context.session, global_step):
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 619, in _save
if l.after_save(session, step):
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 519, in after_save
self._evaluate(global_step_value) # updates self.eval_result
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 539, in _evaluate
self._evaluator.evaluate_and_export())
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 920, in evaluate_and_export
hooks=self._eval_spec.hooks)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 480, in evaluate
name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 522, in _actual_eval
return _evaluate()
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 504, in _evaluate
self._evaluate_build_graph(input_fn, hooks, checkpoint_path))
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1511, in _evaluate_build_graph
self._call_model_fn_eval(input_fn, self.config))
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1544, in _call_model_fn_eval
input_fn, ModeKeys.EVAL)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1025, in _get_features_and_labels_from_input_fn
self._call_input_fn(input_fn, mode))
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/util.py", line 65, in parse_input_fn_result
result = iterator.get_next()
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/data/ops/iterator_ops.py", line 426, in get_next
name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/gen_dataset_ops.py", line 2518, in iterator_get_next
output_shapes=output_shapes, name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1365, in _do_call
return fn(*args)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1350, in _run_fn
target_list, run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 117, in linspace
num = operator.index(num)
TypeError: 'numpy.float64' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 235, in __call__
ret = func(*args)
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 416, in first_value_func
self._metrics = self.evaluate()
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 247, in evaluate
coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False)
File "/content/models/research/object_detection/metrics/coco_tools.py", line 178, in __init__
cocoeval.COCOeval.__init__(self, groundtruth, detections, iouType=iou_type)
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 76, in __init__
self.params = Params(iouType=iouType) # parameters
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 527, in __init__
self.setDetParams()
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 507, in setDetParams
self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
File "<__array_function__ internals>", line 6, in linspace
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 121, in linspace
.format(type(num)))
TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
[[{{node PyFunc_3}}]]
[[cond/Detections_Left_Groundtruth_Right/0/_4927]]
(1) Invalid argument: TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 117, in linspace
num = operator.index(num)
TypeError: 'numpy.float64' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 235, in __call__
ret = func(*args)
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 416, in first_value_func
self._metrics = self.evaluate()
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 247, in evaluate
coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False)
File "/content/models/research/object_detection/metrics/coco_tools.py", line 178, in __init__
cocoeval.COCOeval.__init__(self, groundtruth, detections, iouType=iou_type)
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 76, in __init__
self.params = Params(iouType=iouType) # parameters
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 527, in __init__
self.setDetParams()
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 507, in setDetParams
self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
File "<__array_function__ internals>", line 6, in linspace
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 121, in linspace
.format(type(num)))
TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
[[{{node PyFunc_3}}]]
0 successful operations.
0 derived errors ignored.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/content/models/research/object_detection/model_main.py", line 114, in <module>
tf.app.run()
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "/content/models/research/object_detection/model_main.py", line 110, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1195, in _train_model_default
saving_listeners)
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1494, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1259, in run
run_metadata=run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1360, in run
raise six.reraise(*original_exc_info)
File "/usr/local/lib/python3.6/dist-packages/six.py", line 693, in reraise
raise value
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1345, in run
return self._sess.run(*args, **kwargs)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1426, in run
run_metadata=run_metadata))
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 594, in after_run
if self._save(run_context.session, global_step):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 619, in _save
if l.after_save(session, step):
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 519, in after_save
self._evaluate(global_step_value) # updates self.eval_result
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 539, in _evaluate
self._evaluator.evaluate_and_export())
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 920, in evaluate_and_export
hooks=self._eval_spec.hooks)
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 480, in evaluate
name=name)
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 522, in _actual_eval
return _evaluate()
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 511, in _evaluate
output_dir=self.eval_dir(name))
File "/tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1619, in _evaluate_run
config=self._session_config)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/evaluation.py", line 272, in _evaluate_once
session.run(eval_ops, feed_dict)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 861, in __exit__
self._close_internal(exception_type)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 894, in _close_internal
h.end(self._coordinated_creator.tf_sess)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 951, in end
self._final_ops, feed_dict=self._final_ops_feed_dict)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1180, in _run
feed_dict_tensor, options, run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1359, in _do_run
run_metadata)
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/client/session.py", line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: 2 root error(s) found.
(0) Invalid argument: TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 117, in linspace
num = operator.index(num)
TypeError: 'numpy.float64' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 235, in __call__
ret = func(*args)
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 416, in first_value_func
self._metrics = self.evaluate()
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 247, in evaluate
coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False)
File "/content/models/research/object_detection/metrics/coco_tools.py", line 178, in __init__
cocoeval.COCOeval.__init__(self, groundtruth, detections, iouType=iou_type)
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 76, in __init__
self.params = Params(iouType=iouType) # parameters
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 527, in __init__
self.setDetParams()
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 507, in setDetParams
self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
File "<__array_function__ internals>", line 6, in linspace
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 121, in linspace
.format(type(num)))
TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
[[node PyFunc_3 (defined at tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py:1748) ]]
[[cond/Detections_Left_Groundtruth_Right/0/_4927]]
(1) Invalid argument: TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 117, in linspace
num = operator.index(num)
TypeError: 'numpy.float64' object cannot be interpreted as an integer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 235, in __call__
ret = func(*args)
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 416, in first_value_func
self._metrics = self.evaluate()
File "/content/models/research/object_detection/metrics/coco_evaluation.py", line 247, in evaluate
coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False)
File "/content/models/research/object_detection/metrics/coco_tools.py", line 178, in __init__
cocoeval.COCOeval.__init__(self, groundtruth, detections, iouType=iou_type)
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 76, in __init__
self.params = Params(iouType=iouType) # parameters
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 527, in __init__
self.setDetParams()
File "/usr/local/lib/python3.6/dist-packages/pycocotools/cocoeval.py", line 507, in setDetParams
self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
File "<__array_function__ internals>", line 6, in linspace
File "/usr/local/lib/python3.6/dist-packages/numpy/core/function_base.py", line 121, in linspace
.format(type(num)))
TypeError: object of type <class 'numpy.float64'> cannot be safely interpreted as an integer.
[[node PyFunc_3 (defined at tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py:1748) ]]
0 successful operations.
0 derived errors ignored.
Original stack trace for 'PyFunc_3':
File "content/models/research/object_detection/model_main.py", line 114, in <module>
tf.app.run()
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "usr/local/lib/python3.6/dist-packages/absl/app.py", line 299, in run
_run_main(main, args)
File "usr/local/lib/python3.6/dist-packages/absl/app.py", line 250, in _run_main
sys.exit(main(argv))
File "content/models/research/object_detection/model_main.py", line 110, in main
tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 473, in train_and_evaluate
return executor.run()
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 613, in run
return self.run_local()
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 714, in run_local
saving_listeners=saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1195, in _train_model_default
saving_listeners)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1494, in _train_with_estimator_spec
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 754, in run
run_metadata=run_metadata)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1259, in run
run_metadata=run_metadata)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1345, in run
return self._sess.run(*args, **kwargs)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/monitored_session.py", line 1426, in run
run_metadata=run_metadata))
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 594, in after_run
if self._save(run_context.session, global_step):
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/training/basic_session_run_hooks.py", line 619, in _save
if l.after_save(session, step):
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 519, in after_save
self._evaluate(global_step_value) # updates self.eval_result
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 539, in _evaluate
self._evaluator.evaluate_and_export())
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/training.py", line 920, in evaluate_and_export
hooks=self._eval_spec.hooks)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 480, in evaluate
name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 522, in _actual_eval
return _evaluate()
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 504, in _evaluate
self._evaluate_build_graph(input_fn, hooks, checkpoint_path))
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1511, in _evaluate_build_graph
self._call_model_fn_eval(input_fn, self.config))
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1547, in _call_model_fn_eval
features, labels, ModeKeys.EVAL, config)
File "tensorflow-1.15.2/python3.6/tensorflow_estimator/python/estimator/estimator.py", line 1149, in _call_model_fn
model_fn_results = self._model_fn(features=features, **kwargs)
File "content/models/research/object_detection/model_lib.py", line 570, in model_fn
eval_config, list(category_index.values()), eval_dict)
File "content/models/research/object_detection/eval_util.py", line 1045, in get_eval_metric_ops_for_evaluators
eval_dict))
File "content/models/research/object_detection/metrics/coco_evaluation.py", line 426, in get_estimator_eval_metric_ops
first_value_op = tf.py_func(first_value_func, [], tf.float32)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 513, in py_func
return py_func_common(func, inp, Tout, stateful, name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 495, in py_func_common
func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/script_ops.py", line 318, in _internal_py_func
input=inp, token=token, Tout=Tout, name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/gen_script_ops.py", line 170, in py_func
"PyFunc", input=input, token=token, Tout=Tout, name=name)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "tensorflow-1.15.2/python3.6/tensorflow_core/python/framework/ops.py", line 1748, in __init__
self._traceback = tf_stack.extract_stack()
What are the possibilites of getting over this issue, any kind of recommendations?
Try adding these lines of code immediately after importing Tensorflow in your train.py
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

Do I have to restart colaboratory runtime every time?

I cannot run my code in Google Colaboratory twice without restarting runtime. Is there a way to run it without restarting runtime.
My code takes TensorD libraries and compute aproximation of a random 2x4x4 tensor using CP ALS algorithm. (this is an example taken from https://github.com/Large-Scale-Tensor-Decomposition/tensorD)
!git clone https://github.com/Large-Scale-Tensor-Decomposition/tensorD.git
import sys
import time
sys.path.append("/content/tensorD")
from tensorD.factorization.env import Environment
from tensorD.dataproc.provider import Provider
from tensorD.demo.DataGenerator import *
from tensorD.factorization.cp import CP_ALS
# generate a random tensor with shape 3x4x4
t = time.time()
X = synthetic_data_cp([3, 4, 4], 7)
data_provider = Provider()
data_provider.full_tensor = lambda: X
env = Environment(data_provider, summary_path='/tmp/cp_' + '7')
cp = CP_ALS(env)
args = CP_ALS.CP_Args(rank=7, validation_internal=1)
# build CP model with arguments
cp.build_model(args)
# train CP model with the maximum iteration of 100
cp.train(50)
# obtain factor matrices from trained model
factor_matrices = cp.factors
# obtain scaling vector from trained model
lambdas = cp.lambdas
for matrix in factor_matrices:
print(matrix)
elapsed = time.time() - t
print(elapsed)
when I run it first time I have no problem. When I run it again (without restart of runtime) I obtain:
CP model initial finish
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1333 try:
-> 1334 return fn(*args)
1335 except errors.OpError as e:
7 frames
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [3,4,4]
[[{{node Placeholder}}]]
During handling of the above exception, another exception occurred:
InvalidArgumentError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1346 pass
1347 message = error_interpolation.interpolate(message, self._graph)
-> 1348 raise type(e)(node_def, op, message)
1349
1350 def _extend_graph(self):
InvalidArgumentError: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [3,4,4]
[[node Placeholder (defined at /content/tensorD/tensorD/factorization/cp.py:69) ]]
Caused by op 'Placeholder', defined at:
File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-4-4d2250ec007b>", line 21, in <module>
cp.build_model(args)
File "/content/tensorD/tensorD/factorization/cp.py", line 69, in build_model
input_data = tf.placeholder(tf.float32, shape=self._env.full_shape())
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py", line 2077, in placeholder
return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 5791, in placeholder
"Placeholder", dtype=dtype, shape=shape, name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
op_def=op_def)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [3,4,4]
[[node Placeholder (defined at /content/tensorD/tensorD/factorization/cp.py:69) ]]
Any help will be appreciated!
This seems to mostly be a question about TensorFlow. Do you get what you want outside of Colab? I'm not totally clear about what you are expecting, but import tensorflow as tf; tf.reset_default_graph() at the top of your snippet seems sensible and squelches the error.

Tensorflow error while using SavedModelBuilder

I am trying to create a SavedModel(*.pb) file in tensorflow python. I will use this protobuf file in Java. However running the following code gives me a permission denied error. I am running tensorflow-gpu on a windows machine.
import cv2
import tensorflow as tf
import os
import numpy as np
from imutils import paths
from keras.preprocessing.image import img_to_array
def load_data(data_dir):
imagePaths = sorted(list(paths.list_images(data_dir)))
print("[INFO] loading images...")
data = []
labels = []
for imagePath in imagePaths:
# load the image, pre-process it, and store it in the data list
image = cv2.imread(imagePath)
reimage = cv2.resize(image, (64,16))
reimage=reimage.astype(np.float32)
reimage=np.multiply(reimage,1.0/255.0)
data.append(reimage)
label = imagePath.split(os.path.sep)[-2]
if label=="classa":
label=0
if label=="classb":
label=1
if label=="classc":
label=2
ll=int(label)
labels.append(ll)
data=np.array(data)
print(data.shape)
labels=np.array(labels)
print(labels.shape)
return data, labels
images_a,labels_a=load_data("ttt")
graph=tf.Graph()
with graph.as_default():
images_ph=tf.placeholder(tf.float32,[None,16,64,3],name='input')
labels_ph=tf.placeholder(tf.int32,[None],name='labels')
dataset=tf.data.Dataset.from_tensor_slices((images_a,labels_a)).repeat().batch(100)
iter=dataset.make_one_shot_iterator()
next_element=iter.get_next()
conv1 = tf.layers.conv2d(
inputs=images_ph,
filters=32,
kernel_size=[5, 5],
padding="same",
activation=tf.nn.relu)
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
pool2_flat = tf.reshape(pool1, [-1,32*8*32])
dense = tf.layers.dense(inputs=pool2_flat, units=1000, activation=tf.nn.relu)
dropout = tf.layers.dropout(inputs=dense, rate=0.4)
logits = tf.layers.dense(inputs=dropout, units=3)
predicted_labels=tf.argmax(logits,1)
loss=tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_ph,logits=logits))
train=tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
init=tf.global_variables_initializer()
with tf.Session(graph=graph) as session:
session.run(init)
for i in range(101):
x,y=session.run(next_element)
loss_values=session.run([train,loss],feed_dict={images_ph:x,labels_ph:y})
if i%10==0:
print("Loss: ",loss_values)
builder=tf.saved_model.builder.SavedModelBuilder("/pModel")
builder.add_meta_graph_and_variables(session,[tf.saved_model.tag_constants.SERVING],
signature_def_map={
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:tf.saved_model.signature_def_utils.build_signature_def(
inputs={"input":tf.saved_model.utils.build_tensor_info(images_ph)},
outputs={"output":tf.saved_model.utils.build_tensor_info(logits)})
})
builder.save()
test_images,test_labels=load_data("test")
predicted=session.run(predicted_labels,feed_dict={images_ph:test_images})
match_count=sum([int(y==y_) for y,y_ in zip (test_labels, predicted)])
accuracy=match_count/len(test_labels)
print("Accuracy: {:.3f}".format(accuracy))
print(tf.saved_model.tag_constants.SERVING)
When this is run it gives me the following error:
2018-04-19 13:32:10.043715: W C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\35\tensorflow\core\framework\op_kernel.cc:1202] OP_REQUIRES failed at save_restore_v2_ops.cc:220 : Permission denied: Failed to create a directory: /; Permission denied
Traceback (most recent call last):
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1361, in _do_call
return fn(*args)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _run_fn
target_list, status, run_metadata)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 516, in __exit__
c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.PermissionDeniedError: Failed to create a directory: /; Permission denied
[[Node: save/MergeV2Checkpoints = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _arg_save/Const_0_0)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "tensorv4.py", line 94, in <module>
outputs={"output":tf.saved_model.utils.build_tensor_info(logits)})
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\saved_model\builder_impl.py", line 392, in add_meta_graph_and_variables
saver.save(sess, variables_path, write_meta_graph=False, write_state=False)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1654, in save
{self.saver_def.filename_tensor_name: checkpoint_file})
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 905, in run
run_metadata_ptr)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1137, in _run
feed_dict_tensor, options, run_metadata)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1355, in _do_run
options, run_metadata)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1374, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.PermissionDeniedError: Failed to create a directory: /; Permission denied
[[Node: save/MergeV2Checkpoints = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _arg_save/Const_0_0)]]
Caused by op 'save/MergeV2Checkpoints', defined at:
File "tensorv4.py", line 94, in <module>
outputs={"output":tf.saved_model.utils.build_tensor_info(logits)})
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\saved_model\builder_impl.py", line 386, in add_meta_graph_and_variables
allow_empty=True)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1293, in __init__
self.build()
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1302, in build
self._build(self._filename, build_save=True, build_restore=True)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 1339, in _build
build_save=build_save, build_restore=build_restore)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 787, in _build_internal
save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 411, in _AddShardedSaveOps
return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\training\saver.py", line 393, in _AddShardedSaveOpsForV2
sharded_prefixes, checkpoint_prefix, delete_old_dirs=True)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 379, in merge_v2_checkpoints
delete_old_dirs=delete_old_dirs, name=name)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 3271, in create_op
op_def=op_def)
File "C:\Users\MoWHS\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1650, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
PermissionDeniedError (see above for traceback): Failed to create a directory: /; Permission denied
[[Node: save/MergeV2Checkpoints = MergeV2Checkpoints[delete_old_dirs=true, _device="/job:localhost/replica:0/task:0/device:CPU:0"](save/MergeV2Checkpoints/checkpoint_prefixes, _arg_save/Const_0_0)]]
Any idea on what is going wrong. I tried running with administrator privileges but to no avail

Keras stateful LSTM multi-gpu error Incompatible shapes: [2540] vs. [508] batch size multiplied buy nGPU

I had the same issue but when I tried to apply the same fix I have run into another error. I am however running on 5 gpus. I have read that you need to make sure that your samples are divisible by both the batch sive and number of gpus but I have done that. I have scoured the internet for days and am unable to find anything that has been able to fix the issue I am having. I am running keras v2.0.9 and tensor flow v1.1.0
VARIABLES:
attributeTables[0] is a numpy array shape (35560, 700)
y is a numpy array shape (35560, ) I have also tried using shape (35560, 1) for y but all that happens is the "Incompatible shapes: [2540] vs. [508]" changes from that to "Incompatible shapes: [2540, 1] vs. [508, 1]"
So this says to me that the issue is only with the targetsand that the expected batch size is getting multiplied somewhere in the middle of the process only for the targets and not for attributes causing a mismatch or at least only durring validation I'm not sure.
Here is the code and error in question.
import numpy as np
from keras.models import Sequential
from keras.utils import multi_gpu_model
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
def baseline_model():
# create model
print("Building Layers")
model = Sequential()
model.add(LSTM(700, batch_input_shape=(batchSize, X.shape[1], X.shape[2]), activation='tanh', return_sequences=False, stateful=True))
model.add(Dense(1))
print("Building Parallel model")
parallel_model = multi_gpu_model(model, gpus=nGPU)
# Compile model
#model.compile(loss='mean_squared_error', optimizer='adam')
print("Compiling Model")
parallel_model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
return parallel_model
def buildModel():
print("Bulding Model")
mlp = baseline_model()
print("Fitting Model")
return mlp.fit(X_train, y_train, epochs=1, batch_size=batchSize, shuffle=False, validation_data=(X_test, y_test))
print("Scaling")
scaler = StandardScaler()
X_Scaled = scaler.fit_transform(attributeTables[0])
print("Finding Batch Size")
nGPU = 5
batchSize = 500
while len(X_Scaled) % (batchSize * nGPU) != 0:
batchSize += 1
print("Filling Arrays")
X = X_Scaled.reshape((X_Scaled.shape[0], X_Scaled.shape[1], 1))
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)
print("Calling buildModel()")
model = buildModel()
print("Ploting History")
plt.plot(model.history['loss'], label='train')
plt.plot(model.history['val_loss'], label='test')
plt.legend()
plt.show()
Here is my complete output.
Beginning OHLC Load
Time took : 7.571000099182129
Making gloabal copies
Time took : 0.0
Using TensorFlow backend.
Scaling
Finding Batch Size
Filling Arrays
Calling buildModel()
Bulding Model
Building Layers
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_split.py:2010: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.
FutureWarning)
Building Parallel model
Compiling Model
Fitting Model
Train on 28448 samples, validate on 7112 samples
Epoch 1/1
Traceback (most recent call last):
File "<ipython-input-2-74c49f05bfbc>", line 1, in <module>
runfile('C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py', wdir='C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor')
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 77, in <module>
model = buildModel()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 57, in buildModel
return mlp.fit(X_train, y_train, epochs=1, batch_size=batchSize, shuffle=False, validation_data=(X_test, y_test))
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1631, in fit
validation_steps=validation_steps)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1213, in _fit_loop
outs = f(ins_batch)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2332, in __call__
**self.session_kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 778, in run
run_metadata_ptr)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 982, in _run
feed_dict_string, options, run_metadata)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1032, in _do_run
target_list, options, run_metadata)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1052, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: Incompatible shapes: [2540,1] vs. [508,1]
[[Node: training/Adam/gradients/loss/concatenate_1_loss/sub_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:#loss/concatenate_1_loss/sub"], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape, training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape_1)]]
[[Node: replica_1/sequential_1/dense_1/BiasAdd/_313 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:1", send_device_incarnation=1, tensor_name="edge_1355_replica_1/sequential_1/dense_1/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Caused by op 'training/Adam/gradients/loss/concatenate_1_loss/sub_grad/BroadcastGradientArgs', defined at:
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 245, in <module>
main()
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 241, in main
kernel.start()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
super(ZMQIOLoop, self).start()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 832, in start
self._run_callback(self._callbacks.popleft())
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 605, in _run_callback
ret = callback()
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 265, in enter_eventloop
self.eventloop(self)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 106, in loop_qt5
return loop_qt4(kernel)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 99, in loop_qt4
_loop_qt(kernel.app)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 83, in _loop_qt
app.exec_()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\eventloops.py", line 39, in process_stream_events
kernel.do_one_iteration()
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 298, in do_one_iteration
stream.flush(zmq.POLLIN, 1)
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 352, in flush
self._handle_recv()
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
self._run_callback(callback, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
callback(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2808, in run_ast_nodes
if self.run_code(code, result):
File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-2-74c49f05bfbc>", line 1, in <module>
runfile('C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py', wdir='C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor')
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 710, in runfile
execfile(filename, namespace)
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 101, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 77, in <module>
model = buildModel()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 57, in buildModel
return mlp.fit(X_train, y_train, epochs=1, batch_size=batchSize, shuffle=False, validation_data=(X_test, y_test))
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1608, in fit
self._make_train_function()
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 990, in _make_train_function
loss=self.total_loss)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\legacy\interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py", line 415, in get_updates
grads = self.get_gradients(loss, params)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py", line 73, in get_gradients
grads = K.gradients(loss, params)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py", line 2369, in gradients
return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in gradients
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 368, in _MaybeCompile
return grad_fn() # Exit early
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gradients_impl.py", line 560, in <lambda>
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py", line 609, in _SubGrad
rx, ry = gen_array_ops._broadcast_gradient_args(sx, sy)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 411, in _broadcast_gradient_args
name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
self._traceback = _extract_stack()
...which was originally created as op 'loss/concatenate_1_loss/sub', defined at:
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\ipython\start_kernel.py", line 245, in <module>
main()
[elided 27 identical lines from previous traceback]
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 77, in <module>
model = buildModel()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 55, in buildModel
mlp = baseline_model()
File "C:/Users/BeeAndTurtle/Documents/Programming/Python/Kraken_API_Market_Prediction/predictor/test.py", line 29, in baseline_model
parallel_model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 860, in compile
sample_weight, mask)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 460, in weighted
score_array = fn(y_true, y_pred)
File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 13, in mean_absolute_error
return K.mean(K.abs(y_pred - y_true), axis=-1)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 821, in binary_op_wrapper
return func(x, y, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2627, in _sub
result = _op_def_lib.apply_op("Sub", x=x, y=y, name=name)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 768, in apply_op
op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2336, in create_op
original_op=self._default_original_op, op_def=op_def)
File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1228, in __init__
self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): Incompatible shapes: [2540,1] vs. [508,1]
[[Node: training/Adam/gradients/loss/concatenate_1_loss/sub_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:#loss/concatenate_1_loss/sub"], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape, training/Adam/gradients/loss/concatenate_1_loss/sub_grad/Shape_1)]]
[[Node: replica_1/sequential_1/dense_1/BiasAdd/_313 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:1", send_device_incarnation=1, tensor_name="edge_1355_replica_1/sequential_1/dense_1/BiasAdd", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Daniel Moller's link was right when i disabled the parallel model and put it on one GPU the stateful worked no ptoblem. Currently waiting on it to train. Will post results.
I just published an experimental utility, stateful_multi_gpu, to handle stateful model training of multiple GPUs. I'm interested to know if it is of use to you.
Please also see my answer for the same question Daniel Möller referred to.

Resources