Could not create cudnn handle: CUDNN_STATUS_ALLOC_FAILED - python-3.x

HERE IS THE MODEL I AM USING:
#import tensorflow as tf
def create_model():
return tf.keras.models.Sequential([
#tf.keras.layers.Flatten(input_shape=(2,)),
tf.keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1,1),input_shape=(156,256,3),padding='valid',data_format='channels_last',
activation='relu',kernel_initializer=tf.keras.initializers.he_normal(seed=0),name='Conv1'),
tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2),padding='valid',data_format='channels_last',name='Pool1'),
tf.keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1,1),padding='valid',data_format='channels_last',
activation='relu',kernel_initializer=tf.keras.initializers.he_normal(seed=3),name='Conv2'),
tf.keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(2,2),padding='valid',data_format='channels_last',
activation='relu',kernel_initializer=tf.keras.initializers.he_normal(seed=5),name='Conv3'),
tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(1,1),padding='valid',data_format='channels_last',name='Pool2'),
tf.keras.layers.Conv2D(filters=128,kernel_size=(3,3),strides=(2,2),padding='valid',data_format='channels_last',
activation='relu',kernel_initializer=tf.keras.initializers.he_normal(seed=9),name='Conv4'),
tf.keras.layers.MaxPool2D(pool_size=(2,2),strides=(2,2),padding='valid',data_format='channels_last',name='Pool3'),
tf.keras.layers.Flatten(data_format='channels_last',name='Flatten'),
tf.keras.layers.Dense(units=30,activation='relu',kernel_initializer=tf.keras.initializers.glorot_normal(seed=32),name='FC1'),
tf.keras.layers.Dense(units=15,activation='relu',kernel_initializer=tf.keras.initializers.glorot_normal(seed=33),name='FC2'),
tf.keras.layers.Dense(units=8,activation='softmax',kernel_initializer=tf.keras.initializers.glorot_normal(seed=3),name='Output'),
])
HERE IS THE ERROR I AM GETTING :
UnknownError Traceback (most recent call last)
<ipython-input-47-264c0fcc37e1> in <module>
1 ##fitting generator
----> 2 model.fit_generator(ImageGenerator,steps_per_epoch=216,epochs=3)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1295 shuffle=shuffle,
1296 initial_epoch=initial_epoch,
-> 1297 steps_name='steps_per_epoch')
1298
1299 def evaluate_generator(self,
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_generator.py in model_iteration(model, data, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch, mode, batch_size, steps_name, **kwargs)
263
264 is_deferred = not model._is_compiled
--> 265 batch_outs = batch_function(*batch_data)
266 if not isinstance(batch_outs, list):
267 batch_outs = [batch_outs]
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py in train_on_batch(self, x, y, sample_weight, class_weight, reset_metrics)
971 outputs = training_v2_utils.train_on_batch(
972 self, x, y=y, sample_weight=sample_weight,
--> 973 class_weight=class_weight, reset_metrics=reset_metrics)
974 outputs = (outputs['total_loss'] + outputs['output_losses'] +
975 outputs['metrics'])
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in train_on_batch(model, x, y, sample_weight, class_weight, reset_metrics)
262 y,
263 sample_weights=sample_weights,
--> 264 output_loss_metrics=model._output_loss_metrics)
265
266 if reset_metrics:
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in train_on_batch(model, inputs, targets, sample_weights, output_loss_metrics)
309 sample_weights=sample_weights,
310 training=True,
--> 311 output_loss_metrics=output_loss_metrics))
312 if not isinstance(outs, list):
313 outs = [outs]
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in _process_single_batch(model, inputs, targets, output_loss_metrics, sample_weights, training)
250 output_loss_metrics=output_loss_metrics,
251 sample_weights=sample_weights,
--> 252 training=training))
253 if total_loss is None:
254 raise ValueError('The model cannot be run '
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in _model_loss(model, inputs, targets, output_loss_metrics, sample_weights, training)
125 inputs = nest.map_structure(ops.convert_to_tensor, inputs)
126
--> 127 outs = model(inputs, **kwargs)
128 outs = nest.flatten(outs)
129
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
889 with base_layer_utils.autocast_context_manager(
890 self._compute_dtype):
--> 891 outputs = self.call(cast_inputs, *args, **kwargs)
892 self._handle_activity_regularization(inputs, outputs)
893 self._set_mask_metadata(inputs, outputs, input_masks)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py in call(self, inputs, training, mask)
254 if not self.built:
255 self._init_graph_network(self.inputs, self.outputs, name=self.name)
--> 256 return super(Sequential, self).call(inputs, training=training, mask=mask)
257
258 outputs = inputs # handle the corner case where self.layers is empty
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\network.py in call(self, inputs, training, mask)
706 return self._run_internal_graph(
707 inputs, training=training, mask=mask,
--> 708 convert_kwargs_to_constants=base_layer_utils.call_context().saving)
709
710 def compute_output_shape(self, input_shape):
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\network.py in _run_internal_graph(self, inputs, training, mask, convert_kwargs_to_constants)
858
859 # Compute outputs.
--> 860 output_tensors = layer(computed_tensors, **kwargs)
861
862 # Update tensor_dict.
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
889 with base_layer_utils.autocast_context_manager(
890 self._compute_dtype):
--> 891 outputs = self.call(cast_inputs, *args, **kwargs)
892 self._handle_activity_regularization(inputs, outputs)
893 self._set_mask_metadata(inputs, outputs, input_masks)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\layers\convolutional.py in call(self, inputs)
195
196 def call(self, inputs):
--> 197 outputs = self._convolution_op(inputs, self.kernel)
198
199 if self.use_bias:
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in __call__(self, inp, filter)
1132 call_from_convolution=False)
1133 else:
-> 1134 return self.conv_op(inp, filter)
1135 # copybara:strip_end
1136 # copybara:insert return self.conv_op(inp, filter)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in __call__(self, inp, filter)
637
638 def __call__(self, inp, filter): # pylint: disable=redefined-builtin
--> 639 return self.call(inp, filter)
640
641
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in __call__(self, inp, filter)
236 padding=self.padding,
237 data_format=self.data_format,
--> 238 name=self.name)
239
240
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in conv2d(input, filter, strides, padding, use_cudnn_on_gpu, data_format, dilations, name, filters)
2008 data_format=data_format,
2009 dilations=dilations,
-> 2010 name=name)
2011
2012
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\gen_nn_ops.py in conv2d(input, filter, strides, padding, use_cudnn_on_gpu, explicit_paddings, data_format, dilations, name)
1029 input, filter, strides=strides, use_cudnn_on_gpu=use_cudnn_on_gpu,
1030 padding=padding, explicit_paddings=explicit_paddings,
-> 1031 data_format=data_format, dilations=dilations, name=name, ctx=_ctx)
1032 except _core._SymbolicException:
1033 pass # Add nodes to the TensorFlow graph.
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\gen_nn_ops.py in conv2d_eager_fallback(input, filter, strides, padding, use_cudnn_on_gpu, explicit_paddings, data_format, dilations, name, ctx)
1128 explicit_paddings, "data_format", data_format, "dilations", dilations)
1129 _result = _execute.execute(b"Conv2D", 1, inputs=_inputs_flat, attrs=_attrs,
-> 1130 ctx=_ctx, name=name)
1131 _execute.record_gradient(
1132 "Conv2D", _inputs_flat, _attrs, _result, name)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
D:\anaconda\envs\tf_gpu\lib\site-packages\six.py in raise_from(value, from_value)
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above. [Op:Conv2D]
I am using tensorflow 2.0 installed with anaconda cuda version 10.2.
Can anyone please help me with this same installation works fine when i am not using cnn.
Is it because of i am using CONV2d or is it because i am using generator ?
I am on a windows 10 machine with 16 gb ram and 4gb nvidia 1650 graphics card.

Got the same error and resolved by below:
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(gpus[0],
[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
(with GTX 1660, 6G memory)

Related

M1 MacBook Apple ML compute Tensorflow2.4 compatibility issue with Numpy

I am running the new apple native tensorflow package 2.4, and ran into a problem I did not have before. This jupyter notebook code works in old intel based environment where an older tensorflow version was used. But with M1 apple MLcomputer TensorFlow2.4 it is not compatible
with Numpy 1.20 or 1.18(I downgraded numpy to try). The error log:
NotImplementedError: Cannot convert a symbolic Tensor (lstm_1/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-20-73358e637fe3> in <module>
4 model = Sequential()
5 model.add(Embedding(vocab_size+1, W2V_SIZE, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
----> 6 model.add(LSTM(500, dropout=0.2, recurrent_dropout=0.2))
7 model.add(Dense(units = 10000, kernel_initializer = 'glorot_uniform', activation = 'relu'))
8 model.add(Dropout(0.35))
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/training/tracking/base.py in _method_wrapper(self, *args, **kwargs)
515 self._self_setattr_tracking = False # pylint: disable=protected-access
516 try:
--> 517 result = method(self, *args, **kwargs)
518 finally:
519 self._self_setattr_tracking = previous_value # pylint: disable=protected-access
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py in add(self, layer)
221 # If the model is being built continuously on top of an input layer:
222 # refresh its output.
--> 223 output_tensor = layer(self.outputs[0])
224 if len(nest.flatten(output_tensor)) != 1:
225 raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG)
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in __call__(self, inputs, initial_state, constants, **kwargs)
658
659 if initial_state is None and constants is None:
--> 660 return super(RNN, self).__call__(inputs, **kwargs)
661
662 # If any of `initial_state` or `constants` are specified and are Keras
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
944 # >> model = tf.keras.Model(inputs, outputs)
945 if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
--> 946 return self._functional_construction_call(inputs, args, kwargs,
947 input_list)
948
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in _functional_construction_call(self, inputs, args, kwargs, input_list)
1083 layer=self, inputs=inputs, build_graph=True, training=training_value):
1084 # Check input assumptions set after layer building, e.g. input shape.
-> 1085 outputs = self._keras_tensor_symbolic_call(
1086 inputs, input_masks, args, kwargs)
1087
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in _keras_tensor_symbolic_call(self, inputs, input_masks, args, kwargs)
815 return nest.map_structure(keras_tensor.KerasTensor, output_signature)
816 else:
--> 817 return self._infer_output_signature(inputs, args, kwargs, input_masks)
818
819 def _infer_output_signature(self, inputs, args, kwargs, input_masks):
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py in _infer_output_signature(self, inputs, args, kwargs, input_masks)
856 # TODO(kaftan): do we maybe_build here, or have we already done it?
857 self._maybe_build(inputs)
--> 858 outputs = call_fn(inputs, *args, **kwargs)
859
860 self._handle_activity_regularization(inputs, outputs)
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent_v2.py in call(self, inputs, mask, training, initial_state)
1161 # LSTM does not support constants. Ignore it during process.
1162 orig_initial_state = initial_state
-> 1163 inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
1164
1165 if isinstance(mask, list):
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in _process_inputs(self, inputs, initial_state, constants)
857 initial_state = self.states
858 elif initial_state is None:
--> 859 initial_state = self.get_initial_state(inputs)
860
861 if len(initial_state) != len(self.states):
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in get_initial_state(self, inputs)
640 dtype = inputs.dtype
641 if get_initial_state_fn:
--> 642 init_state = get_initial_state_fn(
643 inputs=None, batch_size=batch_size, dtype=dtype)
644 else:
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in get_initial_state(self, inputs, batch_size, dtype)
2504
2505 def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
-> 2506 return list(_generate_zero_filled_state_for_cell(
2507 self, inputs, batch_size, dtype))
2508
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in _generate_zero_filled_state_for_cell(cell, inputs, batch_size, dtype)
2985 batch_size = array_ops.shape(inputs)[0]
2986 dtype = inputs.dtype
-> 2987 return _generate_zero_filled_state(batch_size, cell.state_size, dtype)
2988
2989
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in _generate_zero_filled_state(batch_size_tensor, state_size, dtype)
3001
3002 if nest.is_nested(state_size):
-> 3003 return nest.map_structure(create_zeros, state_size)
3004 else:
3005 return create_zeros(state_size)
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/util/nest.py in map_structure(func, *structure, **kwargs)
657
658 return pack_sequence_as(
--> 659 structure[0], [func(*x) for x in entries],
660 expand_composites=expand_composites)
661
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/util/nest.py in <listcomp>(.0)
657
658 return pack_sequence_as(
--> 659 structure[0], [func(*x) for x in entries],
660 expand_composites=expand_composites)
661
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/keras/layers/recurrent.py in create_zeros(unnested_state_size)
2998 flat_dims = tensor_shape.TensorShape(unnested_state_size).as_list()
2999 init_state_size = [batch_size_tensor] + flat_dims
-> 3000 return array_ops.zeros(init_state_size, dtype=dtype)
3001
3002 if nest.is_nested(state_size):
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py in wrapper(*args, **kwargs)
199 """Call target, and fall back on dispatchers if there is a TypeError."""
200 try:
--> 201 return target(*args, **kwargs)
202 except (TypeError, ValueError):
203 # Note: convert_to_eager_tensor currently raises a ValueError, not a
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py in wrapped(*args, **kwargs)
2817
2818 def wrapped(*args, **kwargs):
-> 2819 tensor = fun(*args, **kwargs)
2820 tensor._is_zeros_tensor = True
2821 return tensor
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py in zeros(shape, dtype, name)
2866 # Create a constant if it won't be very big. Otherwise create a fill
2867 # op to prevent serialized GraphDefs from becoming too large.
-> 2868 output = _constant_if_small(zero, shape, dtype, name)
2869 if output is not None:
2870 return output
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py in _constant_if_small(value, shape, dtype, name)
2802 def _constant_if_small(value, shape, dtype, name):
2803 try:
-> 2804 if np.prod(shape) < 1000:
2805 return constant(value, shape=shape, dtype=dtype, name=name)
2806 except TypeError:
<__array_function__ internals> in prod(*args, **kwargs)
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/numpy/core/fromnumeric.py in prod(a, axis, dtype, out, keepdims, initial, where)
3028 10
3029 """
-> 3030 return _wrapreduction(a, np.multiply, 'prod', axis, dtype, out,
3031 keepdims=keepdims, initial=initial, where=where)
3032
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/numpy/core/fromnumeric.py in _wrapreduction(obj, ufunc, method, axis, dtype, out, **kwargs)
85 return reduction(axis=axis, out=out, **passkwargs)
86
---> 87 return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
88
89
~/miniforge3/envs/tf2.4/lib/python3.8/site-packages/tensorflow/python/framework/ops.py in __array__(self)
850
851 def __array__(self):
--> 852 raise NotImplementedError(
853 "Cannot convert a symbolic Tensor ({}) to a numpy array."
854 " This error may indicate that you're trying to pass a Tensor to"
NotImplementedError: Cannot convert a symbolic Tensor (lstm_1/strided_slice:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported

Code crashes with message : Failed to get convolution algorithm

I am clueless as to why I keep getting this error. I used the same CNN model to train MNIST dataset, but I did not face any issue previously. Out of nowhere, I start getting this issue. I haven't installed any libraries during that time frame, my gpu drivers are up to date.
I also did a fresh install of CUDA 10.1 with cuDNN v8.0.4 (for cuda 10.1), using tensorflow version 2.3.0 and Anaconda version 2020.07
This is the model:
model=Sequential()
model.add(Conv2D(64,filter_size1,strides=(1,1),input_shape=(None,None,1), data_format='channels_last'))
model.add(Conv2D(43,filter_size2,input_shape=(None,None,64), data_format='channels_last'))
model.add(Conv2D(29,filter_size2,input_shape=(None,None,43), data_format='channels_last'))
model.add(Conv2D(19,filter_size2,input_shape=(None,None,29), data_format='channels_last'))
model.add(Conv2D(10,filter_size2, input_shape=(None,None,19), data_format='channels_last'))
model.add(GlobalAveragePooling2D())
model.add(Activation(activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train, to_categorical(y_train), epochs=5)
This is the error I've been getting:
UnknownError Traceback (most recent call last)
<ipython-input-23-8765eb732021> in <module>
----> 1 model.fit(X_train, to_categorical(y_train), epochs=5)
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\keras\engine\training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
838 # Lifting succeeded, so variables are initialized and we can run the
839 # stateless function.
--> 840 return self._stateless_fn(*args, **kwds)
841 else:
842 canon_args, canon_kwds = \
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2827 with self._lock:
2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2830
2831 #property
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in _filtered_call(self, args, kwargs, cancellation_manager)
1841 `args` and `kwargs`.
1842
-> 1843 return self._call_flat(
1844 [t for t in nest.flatten((args, kwargs), expand_composites=True)
1845 if isinstance(t, (ops.Tensor,
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1921 and executing_eagerly):
1922 # No tape is watching; skip to running the function.
-> 1923 return self._build_call_outputs(self._inference_function.call(
1924 ctx, args, cancellation_manager=cancellation_manager))
1925 forward_backward = self._select_forward_and_backward_functions(
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
543 with _InterpolateFunctionError(self):
544 if cancellation_manager is None:
--> 545 outputs = execute.execute(
546 str(self.signature.name),
547 num_outputs=self._num_outputs,
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node sequential_2/conv2d_9/Conv2D (defined at <ipython-input-23-8765eb732021>:1) ]] [Op:__inference_train_function_4211]
Function call stack:
train_function
Any help would be greatly appreciated!
Edit:
I loaded a model that I previously saved, and it seems to be working fine. But besides that model, no model is being executed.
I searched for quite a while on GitHub pages, and found out this chunk of code:
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
As far as I can understand, it is limiting the amount of GPU memory being used. Please do correct me if I'm wrong, might help someone else facing the same issue.

Function call stack: train_function -> train_function -> train_function

My training data is a list of different length numpy arrays. For example,
x_train[0] = [[ 0.67836523 0.39823654 0.9661108 ... 0.19785157 0.1766675
0.6182588 ]
[-1.664766 -0.360997 0.096446 ... -0.635498 0.300886
-0.045028 ]
[-0.615297 -0.190688 -0.226994 ... 1.648792 -1.691676
-0.411259 ]
...
[-1.380328 -0.231574 -0.078576 ... 1.54852 -1.323094
1.493816 ]
[-2.35968 -4.016114 1.077576 ... -1.23973 -0.65608
1.095033 ]
[ 0.551824 0.115759 -0.163607 ... -0.285045 0.472944
-0.664072 ]]
Here is examples of x_train y_train dimension:
x_train[0].shape = (1136, 512) x_train[1].shape = (650, 512)...etc
y_train[0].shape = (1136, 19) y_train[1] = (650, 19)...etc
Here is my model:
model = Sequential()
model.add(GRU(128, return_sequences=True, input_shape=(None, 512)))
model.add(GRU(128, return_sequences=True))
model.add(TimeDistributed(Dense(19, activation='softmax')))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
plot_model(
model,
to_file='model.png',
show_shapes=True,
show_layer_names=False,
rankdir='LR')
And a data generator for creating batch of similar sequence length:
def train_generator(x, y):
while True:
index = np.random.randint(len(x))
Xb = np.expand_dims(x[index], axis=0)
yb = np.expand_dims(y[index], axis=0)
for i in range(len(x)):
if (0 <= len(x[i]) - len(x[index]) <= 200) & (i != index):
x_tmp = np.expand_dims(x[i][:len(x[index])], axis=0)
y_tmp = np.expand_dims(y[i][:len(y[index])], axis=0)
Xb = np.append(x_tmp, Xb, 0)
yb = np.append(y_tmp, yb, 0)
yield (Xb, yb)
Then fit:
model.fit_generator(train_generator(x_train, y_train), epochs=10, verbose=1)
Here is the error I'm getting:
Epoch 1/10
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-324-9e3ade034201> in <module>
----> 1 model.fit_generator(train_generator(x_train, y_train), epochs=10, verbose=1)
~\miniconda3\lib\site-packages\tensorflow\python\util\deprecation.py in new_func(*args, **kwargs)
322 'in a future version' if date is None else ('after %s' % date),
323 instructions)
--> 324 return func(*args, **kwargs)
325 return tf_decorator.make_decorator(
326 func, new_func, 'deprecated',
~\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1827 use_multiprocessing=use_multiprocessing,
1828 shuffle=shuffle,
-> 1829 initial_epoch=initial_epoch)
1830
1831 #deprecation.deprecated(
~\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
838 # Lifting succeeded, so variables are initialized and we can run the
839 # stateless function.
--> 840 return self._stateless_fn(*args, **kwds)
841 else:
842 canon_args, canon_kwds = \
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2827 with self._lock:
2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2830
2831 #property
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in _filtered_call(self, args, kwargs, cancellation_manager)
1846 resource_variable_ops.BaseResourceVariable))],
1847 captured_inputs=self.captured_inputs,
-> 1848 cancellation_manager=cancellation_manager)
1849
1850 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1922 # No tape is watching; skip to running the function.
1923 return self._build_call_outputs(self._inference_function.call(
-> 1924 ctx, args, cancellation_manager=cancellation_manager))
1925 forward_backward = self._select_forward_and_backward_functions(
1926 args,
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
548 inputs=args,
549 attrs=attrs,
--> 550 ctx=ctx)
551 else:
552 outputs = execute.execute_with_cancellation(
~\miniconda3\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
UnknownError: Fail to find the dnn implementation.
[[{{node CudnnRNN}}]]
[[sequential_18/gru_36/PartitionedCall]] [Op:__inference_train_function_54574]
Function call stack:
train_function -> train_function -> train_function
I've tried googling the error myself, but didn't found any solution.
Any help would be massively appreciated :)

IndexError: list index out of range in model.fit_generator of keras

I try to train my network and confront this error in model.fit_generator.
model.fit_generator(
train_gen, validation_data=valid_gen,
epochs=NUM_EPOCHS, steps_per_epoch=len(train_gen),
validation_steps=len(valid_gen)
)
An Error IndexError: list index out of range is reported, I will attach the Traceback at the end since it is too long.
train_gen and valid_gen is created by flow_from_directory. I've inspected the len(train_gen) and len(valid_gen), but I can't see where the problem resides.
If any further information needed, please let me know, thanks a lot!
Traceback attached
IndexError Traceback (most recent call last)
<ipython-input-21-d09c3e50a215> in <module>
1 history = model.fit_generator(train_gen, validation_data=valid_gen,
2 epochs=NUM_EPOCHS, steps_per_epoch=len(train_gen),
----> 3 validation_steps=len(valid_gen))
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1416 use_multiprocessing=use_multiprocessing,
1417 shuffle=shuffle,
-> 1418 initial_epoch=initial_epoch)
1419
1420 #interfaces.legacy_generator_methods_support
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
179 batch_index = 0
180 while steps_done < steps_per_epoch:
--> 181 generator_output = next(output_generator)
182
183 if not hasattr(generator_output, '__len__'):
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras/utils/data_utils.py in get(self)
707 "`use_multiprocessing=False, workers > 1`."
708 "For more information see issue #1638.")
--> 709 six.reraise(*sys.exc_info())
~/anaconda3/envs/fastai/lib/python3.7/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras/utils/data_utils.py in get(self)
683 try:
684 while self.is_running():
--> 685 inputs = self.queue.get(block=True).get()
686 self.queue.task_done()
687 if inputs is not None:
~/anaconda3/envs/fastai/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
681 return self._value
682 else:
--> 683 raise self._value
684
685 def _set(self, i, obj):
~/anaconda3/envs/fastai/lib/python3.7/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras/utils/data_utils.py in next_sample(uid)
624 The next value of generator `uid`.
625 """
--> 626 return six.next(_SHARED_SEQUENCES[uid])
627
628
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in __next__(self, *args, **kwargs)
102
103 def __next__(self, *args, **kwargs):
--> 104 return self.next(*args, **kwargs)
105
106 def next(self):
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in next(self)
114 # The transformation of images is not under thread lock
115 # so it can be done in parallel
--> 116 return self._get_batches_of_transformed_samples(index_array)
117
118 def _get_batches_of_transformed_samples(self, index_array):
~/anaconda3/envs/fastai/lib/python3.7/site-packages/keras_preprocessing/image/iterator.py in _get_batches_of_transformed_samples(self, index_array)
225 filepaths = self.filepaths
226 for i, j in enumerate(index_array):
--> 227 img = load_img(filepaths[j],
228 color_mode=self.color_mode,
229 target_size=self.target_size,
IndexError: list index out of range

CUDA runtime implicit initialization on GPU:0 failed. Status: unknown error

For ImageDataGenerator using keras library i use the following code.Number of training set is 8000 and number of test set is 2000.
classifier.fit_generator(
generator = training_set,
steps_per_epoch=8000,
validation_data = test_set,
validation_steps = 2000,
epochs=25)
But when i run the code i get the following errors.Is there any problem in my tensorflow version or keras version? Currently i am using keras version = 2.2.4, python=3.6, tensorflow version = 1.11.
InternalError Traceback (most recent call last)
<ipython-input-88-16fbb44d18e3> in <module>()
4 validation_data = test_set,
5 validation_steps = 2000,
----> 6 epochs=25)
~\Anaconda3\lib\site-packages\keras\legacy\interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~\Anaconda3\lib\site-packages\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1416 use_multiprocessing=use_multiprocessing,
1417 shuffle=shuffle,
-> 1418 initial_epoch=initial_epoch)
1419
1420 #interfaces.legacy_generator_methods_support
~\Anaconda3\lib\site-packages\keras\engine\training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
215 outs = model.train_on_batch(x, y,
216 sample_weight=sample_weight,
--> 217 class_weight=class_weight)
218
219 outs = to_list(outs)
~\Anaconda3\lib\site-packages\keras\engine\training.py in train_on_batch(self, x, y, sample_weight, class_weight)
1215 ins = x + y + sample_weights
1216 self._make_train_function()
-> 1217 outputs = self.train_function(ins)
1218 return unpack_singleton(outputs)
1219
~\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py in __call__(self, inputs)
2695
2696 def __call__(self, inputs):
-> 2697 if hasattr(get_session(), '_make_callable_from_options'):
2698 if py_any(is_sparse(x) for x in self.inputs):
2699 if py_any(is_tensor(x) for x in inputs):
~\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py in get_session()
184 config = tf.ConfigProto(intra_op_parallelism_threads=num_thread,
185 allow_soft_placement=True)
--> 186 _SESSION = tf.Session(config=config)
187 session = _SESSION
188 if not _MANUAL_VAR_INIT:
~\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in __init__(self, target, graph, config)
1509
1510 """
-> 1511 super(Session, self).__init__(target, graph, config=config)
1512 # NOTE(mrry): Create these on first `__enter__` to avoid a reference cycle.
1513 self._default_graph_context_manager = None
~\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in __init__(self, target, graph, config)
632 try:
633 # pylint: disable=protected-access
--> 634 self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
635 # pylint: enable=protected-access
636 finally:
InternalError: CUDA runtime implicit initialization on GPU:0 failed. Status: unknown error

Resources