Function call stack: train_function -> train_function -> train_function - python-3.x

My training data is a list of different length numpy arrays. For example,
x_train[0] = [[ 0.67836523 0.39823654 0.9661108 ... 0.19785157 0.1766675
0.6182588 ]
[-1.664766 -0.360997 0.096446 ... -0.635498 0.300886
-0.045028 ]
[-0.615297 -0.190688 -0.226994 ... 1.648792 -1.691676
-0.411259 ]
...
[-1.380328 -0.231574 -0.078576 ... 1.54852 -1.323094
1.493816 ]
[-2.35968 -4.016114 1.077576 ... -1.23973 -0.65608
1.095033 ]
[ 0.551824 0.115759 -0.163607 ... -0.285045 0.472944
-0.664072 ]]
Here are examples of the x_train and y_train dimensions:
x_train[0].shape = (1136, 512) x_train[1].shape = (650, 512)...etc
y_train[0].shape = (1136, 19) y_train[1].shape = (650, 19)...etc
Here is my model:
model = Sequential()
model.add(GRU(128, return_sequences=True, input_shape=(None, 512)))
model.add(GRU(128, return_sequences=True))
model.add(TimeDistributed(Dense(19, activation='softmax')))
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
model.summary()
plot_model(
model,
to_file='model.png',
show_shapes=True,
show_layer_names=False,
rankdir='LR')
And a data generator that creates batches of sequences with similar lengths:
def train_generator(x, y, max_extra_len=200):
    """Yield an endless stream of (inputs, targets) batches of equal length.

    Each batch is built around one randomly chosen reference sequence.
    Every other sequence that is at least as long as the reference, and at
    most ``max_extra_len`` steps longer, is truncated to the reference
    length and added to the batch.

    Args:
        x: Sequence of 2-D arrays, one per sample, indexed as
            ``x[i][time_step]``.
        y: Sequence of target arrays parallel to ``x``.
        max_extra_len: Largest allowed length surplus (in time steps) of a
            sequence over the reference for it to join the batch.

    Yields:
        Tuple ``(Xb, yb)`` of arrays with a new leading batch axis. The
        randomly chosen reference sample is always the last batch entry.
    """
    while True:
        index = np.random.randint(len(x))
        ref_len = len(x[index])
        target_len = len(y[index])

        batch_x = [x[index]]
        batch_y = [y[index]]
        for i, seq in enumerate(x):
            if i != index and 0 <= len(seq) - ref_len <= max_extra_len:
                batch_x.append(seq[:ref_len])
                batch_y.append(y[i][:target_len])

        # Collect first and stack once instead of re-copying the whole batch
        # with np.append on every match. Reversing keeps the historical batch
        # order (later matches first, reference sample last).
        yield (np.stack(batch_x[::-1], axis=0), np.stack(batch_y[::-1], axis=0))
Then fit:
model.fit_generator(train_generator(x_train, y_train), epochs=10, verbose=1)
Here is the error I'm getting:
Epoch 1/10
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-324-9e3ade034201> in <module>
----> 1 model.fit_generator(train_generator(x_train, y_train), epochs=10, verbose=1)
~\miniconda3\lib\site-packages\tensorflow\python\util\deprecation.py in new_func(*args, **kwargs)
322 'in a future version' if date is None else ('after %s' % date),
323 instructions)
--> 324 return func(*args, **kwargs)
325 return tf_decorator.make_decorator(
326 func, new_func, 'deprecated',
~\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1827 use_multiprocessing=use_multiprocessing,
1828 shuffle=shuffle,
-> 1829 initial_epoch=initial_epoch)
1830
1831 #deprecation.deprecated(
~\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~\miniconda3\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~\miniconda3\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
838 # Lifting succeeded, so variables are initialized and we can run the
839 # stateless function.
--> 840 return self._stateless_fn(*args, **kwds)
841 else:
842 canon_args, canon_kwds = \
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2827 with self._lock:
2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2830
2831 #property
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in _filtered_call(self, args, kwargs, cancellation_manager)
1846 resource_variable_ops.BaseResourceVariable))],
1847 captured_inputs=self.captured_inputs,
-> 1848 cancellation_manager=cancellation_manager)
1849
1850 def _call_flat(self, args, captured_inputs, cancellation_manager=None):
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1922 # No tape is watching; skip to running the function.
1923 return self._build_call_outputs(self._inference_function.call(
-> 1924 ctx, args, cancellation_manager=cancellation_manager))
1925 forward_backward = self._select_forward_and_backward_functions(
1926 args,
~\miniconda3\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
548 inputs=args,
549 attrs=attrs,
--> 550 ctx=ctx)
551 else:
552 outputs = execute.execute_with_cancellation(
~\miniconda3\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
58 ctx.ensure_initialized()
59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
---> 60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
62 if name is not None:
UnknownError: Fail to find the dnn implementation.
[[{{node CudnnRNN}}]]
[[sequential_18/gru_36/PartitionedCall]] [Op:__inference_train_function_54574]
Function call stack:
train_function -> train_function -> train_function
I've tried googling the error myself, but didn't find any solution.
Any help would be massively appreciated :)

Related

ValueError: The channel dimension of the inputs should be defined. Found `None`

I am extremely new to TensorFlow, so I am not sure exactly what you will need to solve my issue. Do let me know if you need any additional information.
Basically, I'm trying to run images through a Sequential model. Based on the tutorial at https://www.tensorflow.org/tutorials/images/classification, I am trying to adapt the code to my own dataset.
I'm currently stuck at running my model with model.fit(), which gives me the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-90-85c03bda7f8f> in <module>
16
17 epochs=1
---> 18 history = model.fit(
19 train_data,
20 validation_data=test_data,
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1132 _r=1):
1133 callbacks.on_train_batch_begin(step)
-> 1134 tmp_logs = self.train_function(iterator)
1135 if data_handler.should_sync:
1136 context.async_wait()
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
816 tracing_count = self.experimental_get_tracing_count()
817 with trace.Trace(self._name) as tm:
--> 818 result = self._call(*args, **kwds)
819 compiler = "xla" if self._jit_compile else "nonXla"
820 new_tracing_count = self.experimental_get_tracing_count()
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
860 # This is the first call of __call__, so we have to initialize.
861 initializers = []
--> 862 self._initialize(args, kwds, add_initializers_to=initializers)
863 finally:
864 # At this point we know that the initialization is complete (or less
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
701 self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
702 self._concrete_stateful_fn = (
--> 703 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
704 *args, **kwds))
705
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
3018 args, kwargs = None, None
3019 with self._lock:
-> 3020 graph_function, _ = self._maybe_define_function(args, kwargs)
3021 return graph_function
3022
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
3412
3413 self._function_cache.missed.add(call_context_key)
-> 3414 graph_function = self._create_graph_function(args, kwargs)
3415 self._function_cache.primary[cache_key] = graph_function
3416
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
3247 arg_names = base_arg_names + missing_arg_names
3248 graph_function = ConcreteFunction(
-> 3249 func_graph_module.func_graph_from_py_func(
3250 self._name,
3251 self._python_function,
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
996 _, original_func = tf_decorator.unwrap(python_func)
997
--> 998 func_outputs = python_func(*func_args, **func_kwargs)
999
1000 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
610 xla_context.Exit()
611 else:
--> 612 out = weak_wrapped_fn().__wrapped__(*args, **kwds)
613 return out
614
~/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
983 except Exception as e: # pylint:disable=broad-except
984 if hasattr(e, "ag_error_metadata"):
--> 985 raise e.ag_error_metadata.to_exception(e)
986 else:
987 raise
ValueError: in user code:
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:839 train_function *
return step_function(self, iterator)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:829 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1262 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2734 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:3423 _call_for_each_replica
return fn(*args, **kwargs)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:822 run_step **
outputs = model.train_step(data)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:788 train_step
y_pred = self(x, training=True)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1032 __call__
outputs = call_fn(inputs, *args, **kwargs)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/sequential.py:398 call
outputs = layer(inputs, **kwargs)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:1028 __call__
self._maybe_build(inputs)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:2722 _maybe_build
self.build(input_shapes) # pylint:disable=not-callable
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/layers/convolutional.py:188 build
input_channel = self._get_input_channel(input_shape)
/Users/mongchanghsi/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/layers/convolutional.py:367 _get_input_channel
raise ValueError('The channel dimension of the inputs '
ValueError: The channel dimension of the inputs should be defined. Found `None`.
Here is my code for the model:
model = Sequential([
layers.Conv2D(16, 3, padding='same', activation='relu'),
layers.MaxPooling2D(),
layers.Conv2D(32, 3, padding='same', activation='relu'),
layers.MaxPooling2D(),
layers.Conv2D(64, 3, padding='same', activation='relu'),
layers.MaxPooling2D(),
layers.Flatten(),
layers.Dense(128, activation='relu'),
layers.Dense(4)
])
model.compile(optimizer='adam',
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=['accuracy'])
epochs=10
history = model.fit(
train_data,
validation_data=test_data,
epochs=epochs
)
I understand that the tutorial uses a built-in preprocessing function; however, I tried to build my own preprocessing function to facilitate my learning as well.
def preprocessing(image, target_size):
    """Load one image file and derive its label from its parent directory.

    Args:
        image: Path of the image file (a string tensor when mapped over a
            ``tf.data.Dataset`` of file names).
        target_size: Height and width, in pixels, of the square crop.

    Returns:
        Tuple ``(image, label)``: the cropped image divided by 255, and the
        second-to-last path component of the file name.
    """
    # Extracting labels
    parts = tf.strings.split(image, os.sep)
    label = parts[-2]
    # Decoding image file
    path = tf.io.read_file(image)
    # NOTE(review): no `channels=` argument is given here, so the channel
    # dimension is presumably not statically known after decoding - confirm
    # against the tf.image.decode_jpeg documentation.
    image = tf.image.decode_jpeg(path)
    # Cropping
    # Use the `target_size` argument; the crop size was previously read from
    # the global `image_size`, silently ignoring the parameter.
    image = tf.image.crop_to_bounding_box(
        image,
        offset_height=25,
        offset_width=25,
        target_height=target_size,
        target_width=target_size,
    )
    # Normalizing
    image = image / 255
    return image, label
list_ds = tf.data.Dataset.list_files(DATA_DIR + '/*/*')
preprocess_function = partial(preprocessing, target_size=image_size)
processed_data = list_ds.map(preprocess_function)
train_data = processed_data.take(8000).batch(batch_size)
test_data = processed_data.skip(8000).batch(batch_size)
Other information I can provide: the images are grayscale (hence 1 channel), I have normalized them by dividing by 255 in my preprocessing function, image_size is 300, and batch_size is 100.
Try this:
image = tf.image.decode_jpeg(path, channels=1)

Code crashes with message : Failed to get convolution algorithm

I am clueless as to why I keep getting this error. I used the same CNN model to train on the MNIST dataset and did not face any issue previously. Out of nowhere, I started getting this issue. I haven't installed any libraries during that time frame, and my GPU drivers are up to date.
I also did a fresh install of CUDA 10.1 with cuDNN v8.0.4 (for CUDA 10.1), using TensorFlow version 2.3.0 and Anaconda version 2020.07.
This is the model:
model=Sequential()
model.add(Conv2D(64,filter_size1,strides=(1,1),input_shape=(None,None,1), data_format='channels_last'))
model.add(Conv2D(43,filter_size2,input_shape=(None,None,64), data_format='channels_last'))
model.add(Conv2D(29,filter_size2,input_shape=(None,None,43), data_format='channels_last'))
model.add(Conv2D(19,filter_size2,input_shape=(None,None,29), data_format='channels_last'))
model.add(Conv2D(10,filter_size2, input_shape=(None,None,19), data_format='channels_last'))
model.add(GlobalAveragePooling2D())
model.add(Activation(activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(X_train, to_categorical(y_train), epochs=5)
This is the error I've been getting:
UnknownError Traceback (most recent call last)
<ipython-input-23-8765eb732021> in <module>
----> 1 model.fit(X_train, to_categorical(y_train), epochs=5)
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\keras\engine\training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
838 # Lifting succeeded, so variables are initialized and we can run the
839 # stateless function.
--> 840 return self._stateless_fn(*args, **kwds)
841 else:
842 canon_args, canon_kwds = \
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in __call__(self, *args, **kwargs)
2827 with self._lock:
2828 graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
-> 2829 return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
2830
2831 #property
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in _filtered_call(self, args, kwargs, cancellation_manager)
1841 `args` and `kwargs`.
1842
-> 1843 return self._call_flat(
1844 [t for t in nest.flatten((args, kwargs), expand_composites=True)
1845 if isinstance(t, (ops.Tensor,
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1921 and executing_eagerly):
1922 # No tape is watching; skip to running the function.
-> 1923 return self._build_call_outputs(self._inference_function.call(
1924 ctx, args, cancellation_manager=cancellation_manager))
1925 forward_backward = self._select_forward_and_backward_functions(
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\function.py in call(self, ctx, args, cancellation_manager)
543 with _InterpolateFunctionError(self):
544 if cancellation_manager is None:
--> 545 outputs = execute.execute(
546 str(self.signature.name),
547 num_outputs=self._num_outputs,
~\.conda\envs\GPUEnv\lib\site-packages\tensorflow\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
[[node sequential_2/conv2d_9/Conv2D (defined at <ipython-input-23-8765eb732021>:1) ]] [Op:__inference_train_function_4211]
Function call stack:
train_function
Any help would be greatly appreciated!
Edit:
I loaded a model that I previously saved, and it seems to be working fine. But besides that model, no other model will run.
I searched GitHub pages for quite a while and found this chunk of code:
import tensorflow as tf
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
As far as I can understand, it is limiting the amount of GPU memory being used. Please do correct me if I'm wrong, might help someone else facing the same issue.

How can I implement VGG-net on a dataset of different shape?

I am trying to use a part of the VGG16 model for transfer learning using the Fashion MNIST dataset. The data is processed and the model is specified as per below:
data = keras.datasets.fashion_mnist
(train_img, train_labels), (test_img, test_labels) = data.load_data()
train_img.shape, train_labels.shape, test_img.shape, test_labels.shape
#((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))
# Convert the single-channel images to 3-channel RGB, as required by VGG.
train_img = tf.image.grayscale_to_rgb(tf.expand_dims(train_img, axis=3))
test_img = tf.image.grayscale_to_rgb(tf.expand_dims(test_img, axis=3))
# Resize (with padding) to 32x32, matching the input_shape given to VGG16.
train_img = tf.image.resize_with_pad(train_img, 32, 32)
# Fix: resize the test images themselves; this line previously resized
# train_img again, overwriting the test set with the training images.
test_img = tf.image.resize_with_pad(test_img, 32, 32)
# Scale pixel values by 1/255.
train_img = train_img / 255.
test_img = test_img / 255.
from keras.applications.vgg16 import preprocess_input
train_img = tf.expand_dims(train_img, axis=0)
test_img = tf.expand_dims(test_img, axis=0)
#preprocessing as required by VGG16
train_img=preprocess_input(train_img)
test_img=preprocess_input(test_img)
# Use VGG16 without its top (classifier) layers.
vgg16 = tf.keras.applications.VGG16(include_top=False, weights='imagenet', input_shape=(32, 32, 3))
layer_dict = {layer.name: layer for layer in vgg16.layers}
# Stop at block3_pool and take its output as the feature tensor.
output = layer_dict['block3_pool'].output
x = keras.layers.Flatten()(output)
# ...add some fully connected layers here...
x = keras.layers.Dense(10, activation='softmax')(x)
# Fix: the model output is the tensor `x` built above; `model` was never
# defined in this snippet.
final = keras.models.Model(inputs=vgg16.input, outputs=x)
# Freeze the first seven layers of the combined model.
for layer in final.layers[:7]:
    layer.trainable = False
final.fit(train_img, train_labels, epochs=50, validation_split=0.2)
When I try to fit the model I get the following error:
UnboundLocalError Traceback (most recent call last)
<ipython-input-65-6a0b99b56337> in <module>()
1 early_stopping_cb=keras.callbacks.EarlyStopping(patience=3, verbose=1,restore_best_weights=True)
----> 2 vgg16_1.fit(train_img, train_labels, epochs=50, validation_split=0.2, callbacks=[early_stopping_cb])
1 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
857 logs = tmp_logs # No error, now safe to assign to logs.
858 callbacks.on_train_batch_end(step, logs)
--> 859 epoch_logs = copy.copy(logs)
860
861 # Run validation.
UnboundLocalError: local variable 'logs' referenced before assignment
I thought this might be due to the training set shape being faulty, but then if I use train_img[0] instead, which has shape (60000,32,32,3), then I get the following error instead:
ValueError Traceback (most recent call last)
<ipython-input-66-2b893ccd9ac9> in <module>()
1 early_stopping_cb=keras.callbacks.EarlyStopping(patience=3, verbose=1,restore_best_weights=True)
----> 2 vgg16_1.fit(train_img[0], train_labels, epochs=50, validation_split=0.2, callbacks=[early_stopping_cb])
10 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
64 def _method_wrapper(self, *args, **kwargs):
65 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
---> 66 return method(self, *args, **kwargs)
67
68 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
849 batch_size=batch_size):
850 callbacks.on_train_batch_begin(step)
--> 851 tmp_logs = train_function(iterator)
852 # Catch OutOfRangeError for Datasets of unknown size.
853 # This blocks until the batch has finished executing.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
578 xla_context.Exit()
579 else:
--> 580 result = self._call(*args, **kwds)
581
582 if tracing_count == self._get_tracing_count():
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
625 # This is the first call of __call__, so we have to initialize.
626 initializers = []
--> 627 self._initialize(args, kwds, add_initializers_to=initializers)
628 finally:
629 # At this point we know that the initialization is complete (or less
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
504 self._concrete_stateful_fn = (
505 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 506 *args, **kwds))
507
508 def invalid_creator_scope(*unused_args, **unused_kwds):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2444 args, kwargs = None, None
2445 with self._lock:
-> 2446 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2447 return graph_function
2448
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2775
2776 self._function_cache.missed.add(call_context_key)
-> 2777 graph_function = self._create_graph_function(args, kwargs)
2778 self._function_cache.primary[cache_key] = graph_function
2779 return graph_function, args, kwargs
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
2665 arg_names=arg_names,
2666 override_flat_arg_shapes=override_flat_arg_shapes,
-> 2667 capture_by_value=self._capture_by_value),
2668 self._function_attributes,
2669 # Tell the ConcreteFunction to clean up its graph once it goes out of
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
979 _, original_func = tf_decorator.unwrap(python_func)
980
--> 981 func_outputs = python_func(*func_args, **func_kwargs)
982
983 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
439 # __wrapped__ allows AutoGraph to swap in a converted function. We give
440 # the function a weak reference to itself to avoid a reference cycle.
--> 441 return weak_wrapped_fn().__wrapped__(*args, **kwds)
442 weak_wrapped_fn = weakref.ref(wrapped_fn)
443
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
966 except Exception as e: # pylint:disable=broad-except
967 if hasattr(e, "ag_error_metadata"):
--> 968 raise e.ag_error_metadata.to_exception(e)
969 else:
970 raise
ValueError: in user code:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:571 train_function *
outputs = self.distribute_strategy.run(
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:951 run **
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
return fn(*args, **kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py:533 train_step **
y, y_pred, sample_weight, regularization_losses=self.losses)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/compile_utils.py:204 __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/losses.py:143 __call__
losses = self.call(y_true, y_pred)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/losses.py:246 call
return self.fn(y_true, y_pred, **self._fn_kwargs)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy
return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy
target.shape.assert_is_compatible_with(output.shape)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with
raise ValueError("Shapes %s and %s are incompatible" % (self, other))
ValueError: Shapes (32, 1) and (32, 10) are incompatible
Any clues where these errors come from and what I am doing wrong? It feels like I might have missed something obvious, but being a Keras novice I can't get my head around what it is. Help much appreciated.
You need to comment out the two lines that expand the dims, as shown below. They change the shape of train_img to (1,60000,32,32,3), and model.fit complains that you are using a single image for training.
#train_img = tf.expand_dims(train_img, axis=0)
#test_img = tf.expand_dims(test_img, axis=0)
I updated your code and shared it here. You need to update the architecture to get better accuracy. Follow the transfer learning approach mentioned here and update your code for better accuracy. Thanks!
It seems the issue was that I had a dense output layer of size 10, while the labels have size 1. The solution was to use the sparse categorical cross-entropy loss function instead of the plain categorical one.

Could not create cudnn handle: CUDNN_STATUS_ALLOC_FAILED

HERE IS THE MODEL I AM USING:
#import tensorflow as tf
def create_model():
    """Build and return the (uncompiled) convolutional Sequential model.

    The network takes channels-last inputs of shape (156, 256, 3), applies
    four 3x3 ReLU convolutions interleaved with three max-pooling layers,
    flattens, and ends in Dense layers of 30, 15 and 8 units, the last one
    with a softmax activation.
    """
    layers = tf.keras.layers
    initializers = tf.keras.initializers

    # Settings shared by every convolution / pooling layer in the stack.
    conv_common = dict(
        kernel_size=(3, 3),
        padding='valid',
        data_format='channels_last',
        activation='relu',
    )
    pool_common = dict(pool_size=(2, 2), padding='valid', data_format='channels_last')

    stack = [
        #tf.keras.layers.Flatten(input_shape=(2,)),
        layers.Conv2D(filters=32, strides=(1, 1), input_shape=(156, 256, 3),
                      kernel_initializer=initializers.he_normal(seed=0),
                      name='Conv1', **conv_common),
        layers.MaxPool2D(strides=(2, 2), name='Pool1', **pool_common),
        layers.Conv2D(filters=64, strides=(1, 1),
                      kernel_initializer=initializers.he_normal(seed=3),
                      name='Conv2', **conv_common),
        layers.Conv2D(filters=64, strides=(2, 2),
                      kernel_initializer=initializers.he_normal(seed=5),
                      name='Conv3', **conv_common),
        layers.MaxPool2D(strides=(1, 1), name='Pool2', **pool_common),
        layers.Conv2D(filters=128, strides=(2, 2),
                      kernel_initializer=initializers.he_normal(seed=9),
                      name='Conv4', **conv_common),
        layers.MaxPool2D(strides=(2, 2), name='Pool3', **pool_common),
        layers.Flatten(data_format='channels_last', name='Flatten'),
        layers.Dense(units=30, activation='relu',
                     kernel_initializer=initializers.glorot_normal(seed=32),
                     name='FC1'),
        layers.Dense(units=15, activation='relu',
                     kernel_initializer=initializers.glorot_normal(seed=33),
                     name='FC2'),
        layers.Dense(units=8, activation='softmax',
                     kernel_initializer=initializers.glorot_normal(seed=3),
                     name='Output'),
    ]
    return tf.keras.models.Sequential(stack)
HERE IS THE ERROR I AM GETTING :
UnknownError Traceback (most recent call last)
<ipython-input-47-264c0fcc37e1> in <module>
1 ##fitting generator
----> 2 model.fit_generator(ImageGenerator,steps_per_epoch=216,epochs=3)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1295 shuffle=shuffle,
1296 initial_epoch=initial_epoch,
-> 1297 steps_name='steps_per_epoch')
1298
1299 def evaluate_generator(self,
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_generator.py in model_iteration(model, data, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, validation_freq, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch, mode, batch_size, steps_name, **kwargs)
263
264 is_deferred = not model._is_compiled
--> 265 batch_outs = batch_function(*batch_data)
266 if not isinstance(batch_outs, list):
267 batch_outs = [batch_outs]
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training.py in train_on_batch(self, x, y, sample_weight, class_weight, reset_metrics)
971 outputs = training_v2_utils.train_on_batch(
972 self, x, y=y, sample_weight=sample_weight,
--> 973 class_weight=class_weight, reset_metrics=reset_metrics)
974 outputs = (outputs['total_loss'] + outputs['output_losses'] +
975 outputs['metrics'])
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_v2_utils.py in train_on_batch(model, x, y, sample_weight, class_weight, reset_metrics)
262 y,
263 sample_weights=sample_weights,
--> 264 output_loss_metrics=model._output_loss_metrics)
265
266 if reset_metrics:
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in train_on_batch(model, inputs, targets, sample_weights, output_loss_metrics)
309 sample_weights=sample_weights,
310 training=True,
--> 311 output_loss_metrics=output_loss_metrics))
312 if not isinstance(outs, list):
313 outs = [outs]
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in _process_single_batch(model, inputs, targets, output_loss_metrics, sample_weights, training)
250 output_loss_metrics=output_loss_metrics,
251 sample_weights=sample_weights,
--> 252 training=training))
253 if total_loss is None:
254 raise ValueError('The model cannot be run '
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\training_eager.py in _model_loss(model, inputs, targets, output_loss_metrics, sample_weights, training)
125 inputs = nest.map_structure(ops.convert_to_tensor, inputs)
126
--> 127 outs = model(inputs, **kwargs)
128 outs = nest.flatten(outs)
129
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
889 with base_layer_utils.autocast_context_manager(
890 self._compute_dtype):
--> 891 outputs = self.call(cast_inputs, *args, **kwargs)
892 self._handle_activity_regularization(inputs, outputs)
893 self._set_mask_metadata(inputs, outputs, input_masks)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\sequential.py in call(self, inputs, training, mask)
254 if not self.built:
255 self._init_graph_network(self.inputs, self.outputs, name=self.name)
--> 256 return super(Sequential, self).call(inputs, training=training, mask=mask)
257
258 outputs = inputs # handle the corner case where self.layers is empty
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\network.py in call(self, inputs, training, mask)
706 return self._run_internal_graph(
707 inputs, training=training, mask=mask,
--> 708 convert_kwargs_to_constants=base_layer_utils.call_context().saving)
709
710 def compute_output_shape(self, input_shape):
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\network.py in _run_internal_graph(self, inputs, training, mask, convert_kwargs_to_constants)
858
859 # Compute outputs.
--> 860 output_tensors = layer(computed_tensors, **kwargs)
861
862 # Update tensor_dict.
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py in __call__(self, inputs, *args, **kwargs)
889 with base_layer_utils.autocast_context_manager(
890 self._compute_dtype):
--> 891 outputs = self.call(cast_inputs, *args, **kwargs)
892 self._handle_activity_regularization(inputs, outputs)
893 self._set_mask_metadata(inputs, outputs, input_masks)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\keras\layers\convolutional.py in call(self, inputs)
195
196 def call(self, inputs):
--> 197 outputs = self._convolution_op(inputs, self.kernel)
198
199 if self.use_bias:
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in __call__(self, inp, filter)
1132 call_from_convolution=False)
1133 else:
-> 1134 return self.conv_op(inp, filter)
1135 # copybara:strip_end
1136 # copybara:insert return self.conv_op(inp, filter)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in __call__(self, inp, filter)
637
638 def __call__(self, inp, filter): # pylint: disable=redefined-builtin
--> 639 return self.call(inp, filter)
640
641
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in __call__(self, inp, filter)
236 padding=self.padding,
237 data_format=self.data_format,
--> 238 name=self.name)
239
240
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\nn_ops.py in conv2d(input, filter, strides, padding, use_cudnn_on_gpu, data_format, dilations, name, filters)
2008 data_format=data_format,
2009 dilations=dilations,
-> 2010 name=name)
2011
2012
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\gen_nn_ops.py in conv2d(input, filter, strides, padding, use_cudnn_on_gpu, explicit_paddings, data_format, dilations, name)
1029 input, filter, strides=strides, use_cudnn_on_gpu=use_cudnn_on_gpu,
1030 padding=padding, explicit_paddings=explicit_paddings,
-> 1031 data_format=data_format, dilations=dilations, name=name, ctx=_ctx)
1032 except _core._SymbolicException:
1033 pass # Add nodes to the TensorFlow graph.
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\ops\gen_nn_ops.py in conv2d_eager_fallback(input, filter, strides, padding, use_cudnn_on_gpu, explicit_paddings, data_format, dilations, name, ctx)
1128 explicit_paddings, "data_format", data_format, "dilations", dilations)
1129 _result = _execute.execute(b"Conv2D", 1, inputs=_inputs_flat, attrs=_attrs,
-> 1130 ctx=_ctx, name=name)
1131 _execute.record_gradient(
1132 "Conv2D", _inputs_flat, _attrs, _result, name)
D:\anaconda\envs\tf_gpu\lib\site-packages\tensorflow_core\python\eager\execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
65 else:
66 message = e.message
---> 67 six.raise_from(core._status_to_exception(e.code, message), None)
68 except TypeError as e:
69 keras_symbolic_tensors = [
D:\anaconda\envs\tf_gpu\lib\site-packages\six.py in raise_from(value, from_value)
UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above. [Op:Conv2D]
I am using TensorFlow 2.0 installed with Anaconda, CUDA version 10.2.
Can anyone please help me with this? The same installation works fine when I am not using a CNN.
Is it because I am using Conv2D, or because I am using a generator?
I am on a Windows 10 machine with 16 GB RAM and a 4 GB NVIDIA 1650 graphics card.
I got the same error and resolved it with the following:
# Ask TensorFlow for the physical GPU devices it can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
# Configure the first GPU with a single virtual device capped by memory_limit.
# NOTE(review): memory_limit is presumably in MB (i.e. ~4 GB here) -- confirm
# against the TensorFlow docs for the installed version.
# NOTE(review): presumably this must run before the GPU is first used -- confirm.
tf.config.experimental.set_virtual_device_configuration(gpus[0],
[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
(with GTX 1660, 6G memory)

CUDA runtime implicit initialization on GPU:0 failed. Status: unknown error

With ImageDataGenerator from the Keras library I use the following code. The training set has 8000 samples and the test set has 2000.
# Train from the generator for 25 epochs, validating on test_set.
# NOTE(review): steps_per_epoch and validation_steps presumably count batches,
# not samples; 8000 and 2000 match the sample counts stated in the question --
# confirm whether samples // batch_size was intended.
classifier.fit_generator(
generator = training_set,
steps_per_epoch=8000,
validation_data = test_set,
validation_steps = 2000,
epochs=25)
But when I run the code I get the following error. Is there a problem with my TensorFlow or Keras version? Currently I am using Keras version 2.2.4, Python 3.6, and TensorFlow version 1.11.
InternalError Traceback (most recent call last)
<ipython-input-88-16fbb44d18e3> in <module>()
4 validation_data = test_set,
5 validation_steps = 2000,
----> 6 epochs=25)
~\Anaconda3\lib\site-packages\keras\legacy\interfaces.py in wrapper(*args, **kwargs)
89 warnings.warn('Update your `' + object_name + '` call to the ' +
90 'Keras 2 API: ' + signature, stacklevel=2)
---> 91 return func(*args, **kwargs)
92 wrapper._original_function = func
93 return wrapper
~\Anaconda3\lib\site-packages\keras\engine\training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
1416 use_multiprocessing=use_multiprocessing,
1417 shuffle=shuffle,
-> 1418 initial_epoch=initial_epoch)
1419
1420 #interfaces.legacy_generator_methods_support
~\Anaconda3\lib\site-packages\keras\engine\training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
215 outs = model.train_on_batch(x, y,
216 sample_weight=sample_weight,
--> 217 class_weight=class_weight)
218
219 outs = to_list(outs)
~\Anaconda3\lib\site-packages\keras\engine\training.py in train_on_batch(self, x, y, sample_weight, class_weight)
1215 ins = x + y + sample_weights
1216 self._make_train_function()
-> 1217 outputs = self.train_function(ins)
1218 return unpack_singleton(outputs)
1219
~\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py in __call__(self, inputs)
2695
2696 def __call__(self, inputs):
-> 2697 if hasattr(get_session(), '_make_callable_from_options'):
2698 if py_any(is_sparse(x) for x in self.inputs):
2699 if py_any(is_tensor(x) for x in inputs):
~\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py in get_session()
184 config = tf.ConfigProto(intra_op_parallelism_threads=num_thread,
185 allow_soft_placement=True)
--> 186 _SESSION = tf.Session(config=config)
187 session = _SESSION
188 if not _MANUAL_VAR_INIT:
~\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in __init__(self, target, graph, config)
1509
1510 """
-> 1511 super(Session, self).__init__(target, graph, config=config)
1512 # NOTE(mrry): Create these on first `__enter__` to avoid a reference cycle.
1513 self._default_graph_context_manager = None
~\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in __init__(self, target, graph, config)
632 try:
633 # pylint: disable=protected-access
--> 634 self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
635 # pylint: enable=protected-access
636 finally:
InternalError: CUDA runtime implicit initialization on GPU:0 failed. Status: unknown error

Resources