I have recently started learning about Image Segmentation and UNet. I am trying to do a multi class Image Segmentation where I have 7 classes and input is a (256, 256, 3) rgb image and output is (256, 256, 1) grayscale image where each intensity value corresponds to one class. I am doing pixel wise softmax. I am using sparse categorical cross entropy so as to avoid doing One Hot Encoding.
def soft1(x):
    """Apply a softmax to ``x`` along its last axis.

    Thin wrapper around ``keras.activations.softmax`` with ``axis=-1`` so it
    can be handed to an ``Activation`` layer as a plain callable.
    """
    last_axis = -1
    return keras.activations.softmax(x, axis=last_axis)
def conv2d_block(input_tensor, n_filters, kernel_size = 3, batchnorm = True):
    """Apply two chained Conv2D -> (BatchNormalization) -> ReLU stages.

    Args:
        input_tensor: Tensor fed to the first convolution.
        n_filters: Number of filters used by both convolutions.
        kernel_size: Side length of the square convolution kernel.
        batchnorm: When true, insert BatchNormalization after each convolution.

    Returns:
        The output tensor of the second ReLU activation.
    """
    x = input_tensor
    # Each stage must consume the previous stage's output. Feeding
    # `input_tensor` to the second convolution would leave the first stage
    # disconnected from the returned tensor.
    for _ in range(2):
        x = Conv2D(filters = n_filters, kernel_size = (kernel_size, kernel_size),
                   kernel_initializer = 'he_normal', padding = 'same')(x)
        if batchnorm:
            x = BatchNormalization()(x)
        x = Activation('relu')(x)
    return x
def get_unet(input_img, n_classes, n_filters = 16, dropout = 0.1, batchnorm = True):
    """Build a four-level U-Net ending in a per-pixel softmax.

    Args:
        input_img: Keras input tensor the network is built on.
        n_classes: Number of output channels of the final 1x1 convolution.
        n_filters: Filter count of the first level; deeper levels use
            2x, 4x, 8x and (at the bottleneck) 16x this value.
        dropout: Rate passed to every Dropout layer.
        batchnorm: Forwarded to ``conv2d_block``.

    Returns:
        A ``Model`` whose output is reshaped to
        ``(image_height*image_width, 1, n_classes)`` and passed through
        ``soft1``.

    Note:
        ``image_height`` and ``image_width`` are not parameters; they are
        read from the enclosing scope.
    """
    # Contracting Path
    skip_tensors = []
    current = input_img
    for multiplier in (1, 2, 4, 8):
        encoded = conv2d_block(current, n_filters * multiplier, kernel_size = 3, batchnorm = batchnorm)
        skip_tensors.append(encoded)
        current = MaxPooling2D((2, 2))(encoded)
        current = Dropout(dropout)(current)

    # Bottleneck
    current = conv2d_block(current, n_filters = n_filters * 16, kernel_size = 3, batchnorm = batchnorm)

    # Expansive Path: upsample, merge with the matching encoder output, convolve.
    for multiplier, skip in zip((8, 4, 2, 1), reversed(skip_tensors)):
        current = Conv2DTranspose(n_filters * multiplier, (3, 3), strides = (2, 2), padding = 'same')(current)
        current = concatenate([current, skip])
        current = Dropout(dropout)(current)
        current = conv2d_block(current, n_filters * multiplier, kernel_size = 3, batchnorm = batchnorm)

    # Classification head: 1x1 convolution, flatten the spatial grid, softmax.
    logits = Conv2D(n_classes, (1, 1))(current)
    flattened = Reshape((image_height*image_width, 1, n_classes), input_shape = (image_height, image_width, n_classes))(logits)
    probabilities = Activation(soft1)(flattened)
    return Model(inputs=[input_img], outputs=[probabilities])
My Model Summary is:
Model: "model_2"
Layer (type) Output Shape Param # Connected to
input_12 (InputLayer) (None, 256, 256, 3) 0
conv2d_211 (Conv2D) (None, 256, 256, 16) 448 input_12[0][0]
batch_normalization_200 (BatchN (None, 256, 256, 16) 64 conv2d_211[0][0]
activation_204 (Activation) (None, 256, 256, 16) 0 batch_normalization_200[0][0]
max_pooling2d_45 (MaxPooling2D) (None, 128, 128, 16) 0 activation_204[0][0]
dropout_89 (Dropout) (None, 128, 128, 16) 0 max_pooling2d_45[0][0]
conv2d_213 (Conv2D) (None, 128, 128, 32) 4640 dropout_89[0][0]
batch_normalization_202 (BatchN (None, 128, 128, 32) 128 conv2d_213[0][0]
activation_206 (Activation) (None, 128, 128, 32) 0 batch_normalization_202[0][0]
max_pooling2d_46 (MaxPooling2D) (None, 64, 64, 32) 0 activation_206[0][0]
dropout_90 (Dropout) (None, 64, 64, 32) 0 max_pooling2d_46[0][0]
conv2d_215 (Conv2D) (None, 64, 64, 64) 18496 dropout_90[0][0]
batch_normalization_204 (BatchN (None, 64, 64, 64) 256 conv2d_215[0][0]
activation_208 (Activation) (None, 64, 64, 64) 0 batch_normalization_204[0][0]
max_pooling2d_47 (MaxPooling2D) (None, 32, 32, 64) 0 activation_208[0][0]
dropout_91 (Dropout) (None, 32, 32, 64) 0 max_pooling2d_47[0][0]
conv2d_217 (Conv2D) (None, 32, 32, 128) 73856 dropout_91[0][0]
batch_normalization_206 (BatchN (None, 32, 32, 128) 512 conv2d_217[0][0]
activation_210 (Activation) (None, 32, 32, 128) 0 batch_normalization_206[0][0]
max_pooling2d_48 (MaxPooling2D) (None, 16, 16, 128) 0 activation_210[0][0]
dropout_92 (Dropout) (None, 16, 16, 128) 0 max_pooling2d_48[0][0]
conv2d_219 (Conv2D) (None, 16, 16, 256) 295168 dropout_92[0][0]
batch_normalization_208 (BatchN (None, 16, 16, 256) 1024 conv2d_219[0][0]
activation_212 (Activation) (None, 16, 16, 256) 0 batch_normalization_208[0][0]
conv2d_transpose_45 (Conv2DTran (None, 32, 32, 128) 295040 activation_212[0][0]
concatenate_45 (Concatenate) (None, 32, 32, 256) 0 conv2d_transpose_45[0][0]
dropout_93 (Dropout) (None, 32, 32, 256) 0 concatenate_45[0][0]
conv2d_221 (Conv2D) (None, 32, 32, 128) 295040 dropout_93[0][0]
batch_normalization_210 (BatchN (None, 32, 32, 128) 512 conv2d_221[0][0]
activation_214 (Activation) (None, 32, 32, 128) 0 batch_normalization_210[0][0]
conv2d_transpose_46 (Conv2DTran (None, 64, 64, 64) 73792 activation_214[0][0]
concatenate_46 (Concatenate) (None, 64, 64, 128) 0 conv2d_transpose_46[0][0]
dropout_94 (Dropout) (None, 64, 64, 128) 0 concatenate_46[0][0]
conv2d_223 (Conv2D) (None, 64, 64, 64) 73792 dropout_94[0][0]
batch_normalization_212 (BatchN (None, 64, 64, 64) 256 conv2d_223[0][0]
activation_216 (Activation) (None, 64, 64, 64) 0 batch_normalization_212[0][0]
conv2d_transpose_47 (Conv2DTran (None, 128, 128, 32) 18464 activation_216[0][0]
concatenate_47 (Concatenate) (None, 128, 128, 64) 0 conv2d_transpose_47[0][0]
dropout_95 (Dropout) (None, 128, 128, 64) 0 concatenate_47[0][0]
conv2d_225 (Conv2D) (None, 128, 128, 32) 18464 dropout_95[0][0]
batch_normalization_214 (BatchN (None, 128, 128, 32) 128 conv2d_225[0][0]
activation_218 (Activation) (None, 128, 128, 32) 0 batch_normalization_214[0][0]
conv2d_transpose_48 (Conv2DTran (None, 256, 256, 16) 4624 activation_218[0][0]
concatenate_48 (Concatenate) (None, 256, 256, 32) 0 conv2d_transpose_48[0][0]
dropout_96 (Dropout) (None, 256, 256, 32) 0 concatenate_48[0][0]
conv2d_227 (Conv2D) (None, 256, 256, 16) 4624 dropout_96[0][0]
batch_normalization_216 (BatchN (None, 256, 256, 16) 64 conv2d_227[0][0]
activation_220 (Activation) (None, 256, 256, 16) 0 batch_normalization_216[0][0]
conv2d_228 (Conv2D) (None, 256, 256, 7) 119 activation_220[0][0]
reshape_12 (Reshape) (None, 65536, 1, 7) 0 conv2d_228[0][0]
activation_221 (Activation) (None, 65536, 1, 7) 0 reshape_12[0][0]
Total params: 1,179,511
Trainable params: 1,178,039
Non-trainable params: 1,472
Is my model right? Shouldn't the final output be (65536, 1, 1) as I am using softmax?
The code is compiling but dice coefficient is very low.

Your model should end in (256,256,7).
That is 7 classes per pixel, and the shape should agree with your output images that are (256,256,1). This will work only for 'sparse_categorical_crossentropy' or a custom loss.
So, up to conv_228 the model seems fine (didn't look in detail, though).
There is no need for anything that comes after this convolution.
You can place the softmax directly in the conv_228 or directly after.
y_train should be (256,256,1) for this.

Your output in fact represents each pixel of your image. For each pixel, you have an output of shape 1x7. Since the final activation is a softmax, the values in this representation lie between 0 and 1 (and sum to 1 across the 7 classes). Therefore the output fires for the desired class, which gives you the segmentation. If it were (65536, 1, 1), you would have a dense (single-value) representation rather than a categorical one.


Number of feature maps do not increase after each block of 3D Resnet

I am trying to build 3D Resnet for small 3D patches of size [32,32,44] with one channel. In 2D Resnet, after each residual block, the size of images should reduce to half and the number of feature maps doubles as shown below
# function for creating an identity or projection residual module
def residual_module(layer_in, n_filters):
    """Build a 2D residual module with an identity or projection shortcut.

    Args:
        layer_in: Input tensor (channels-last layout is assumed).
        n_filters: Number of filters for both 3x3 convolutions.

    Returns:
        The ReLU-activated sum of the convolution branch and the shortcut.
    """
    # Shortcut: pass the input through unchanged when its channel count
    # already matches, otherwise project it with a 1x1 convolution.
    if layer_in.shape[-1] == n_filters:
        shortcut = layer_in
    else:
        shortcut = Conv2D(n_filters, (1,1), padding='same', activation='relu', kernel_initializer='he_normal')(layer_in)
    # Main branch: 3x3 conv with ReLU followed by a linear 3x3 conv.
    hidden = Conv2D(n_filters, (3,3), padding='same', activation='relu', kernel_initializer='he_normal')(layer_in)
    residual = Conv2D(n_filters, (3,3), padding='same', activation='linear', kernel_initializer='he_normal')(hidden)
    # Element-wise sum of both branches, then the final activation.
    merged = add([residual, shortcut])
    return Activation('relu')(merged)
# Define the model input: a 256x256 tensor with a single channel.
visible = Input(shape=(256, 256, 1))
# Stack two residual modules, requesting 64 and then 128 filters.
layer = residual_module(visible,64)
layer_1 = residual_module(layer,128)
# Create the model from the input tensor to the last residual output.
model = Model(inputs=visible, outputs=layer_1)
# Summarize the model (the printed summary follows).
Model: "model_44"
Layer (type) Output Shape Param # Connected to
input_68 (InputLayer) [(None, 256, 256, 1) 0
conv2d_40 (Conv2D) (None, 256, 256, 64) 640 input_68[0][0]
conv2d_41 (Conv2D) (None, 256, 256, 64) 36928 conv2d_40[0][0]
conv2d_39 (Conv2D) (None, 256, 256, 64) 128 input_68[0][0]
add_207 (Add) (None, 256, 256, 64) 0 conv2d_41[0][0]
activation_52 (Activation) (None, 256, 256, 64) 0 add_207[0][0]
conv2d_43 (Conv2D) (None, 256, 256, 128 73856 activation_52[0][0]
conv2d_44 (Conv2D) (None, 256, 256, 128 147584 conv2d_43[0][0]
conv2d_42 (Conv2D) (None, 256, 256, 128 8320 activation_52[0][0]
add_208 (Add) (None, 256, 256, 128 0 conv2d_44[0][0]
activation_53 (Activation) (None, 256, 256, 128 0 add_208[0][0]
Total params: 267,456
Trainable params: 267,456
Non-trainable params: 0
However, adapting this code for 3D Resnet does not double the number of feature maps. As it can be seen in the below example that after first residual block channel dimension is still 1 and changes to 3 in the second block
def residual_module(layer_in, n_filters):
    """Build a 3D residual module with an identity or projection shortcut.

    Args:
        layer_in: Input tensor (channels-last layout is assumed).
        n_filters: Number of filters for both 3x3x3 convolutions.

    Returns:
        The ReLU-activated sum of the convolution branch and the shortcut.
    """
    # Shortcut: pass the input through unchanged when its channel count
    # already matches, otherwise project it with a 1x1x1 convolution.
    if layer_in.shape[-1] == n_filters:
        shortcut = layer_in
    else:
        shortcut = Conv3D(n_filters, (1,1,1), padding='same', activation='relu', kernel_initializer='he_normal')(layer_in)
    # Main branch: 3x3x3 conv with ReLU followed by a linear 3x3x3 conv.
    hidden = Conv3D(n_filters, (3,3,3), padding='same', activation='relu', kernel_initializer='he_normal')(layer_in)
    residual = Conv3D(n_filters, (3,3,3), padding='same', activation='linear', kernel_initializer='he_normal')(hidden)
    # Element-wise sum of both branches, then the final activation.
    merged = add([residual, shortcut])
    return Activation('relu')(merged)
# Define the model input: a 32x32x32 volume with a single channel.
visible = Input(shape=(32,32,32,1))
# Stack two residual modules, requesting 16 and then 32 filters.
layer = residual_module(visible,16)
layer_1 = residual_module(layer,32)
# Create the model from the input tensor to the last residual output.
model = Model(inputs=visible, outputs=layer_1)
# Summarize the model (the printed summary follows).
Model: "model_45"
Layer (type) Output Shape Param # Connected to
input_69 (InputLayer) [(None, 32, 32, 32, 0
conv3d_519 (Conv3D) (None, 32, 32, 32, 1 448 input_69[0][0]
conv3d_520 (Conv3D) (None, 32, 32, 32, 1 6928 conv3d_519[0][0]
conv3d_518 (Conv3D) (None, 32, 32, 32, 1 32 input_69[0][0]
add_209 (Add) (None, 32, 32, 32, 1 0 conv3d_520[0][0]
activation_54 (Activation) (None, 32, 32, 32, 1 0 add_209[0][0]
conv3d_522 (Conv3D) (None, 32, 32, 32, 3 13856 activation_54[0][0]
conv3d_523 (Conv3D) (None, 32, 32, 32, 3 27680 conv3d_522[0][0]
conv3d_521 (Conv3D) (None, 32, 32, 32, 3 544 activation_54[0][0]
add_210 (Add) (None, 32, 32, 32, 3 0 conv3d_523[0][0]
activation_55 (Activation) (None, 32, 32, 32, 3 0 add_210[0][0]
Total params: 49,488
Trainable params: 49,488
Non-trainable params: 0
What am I missing here?
Well, I figured out that the code is okay, except that the lines in model.summary() were truncated, so in fact the number of feature maps is 16 in the first residual block and 32 in the second. Increasing the `line_length` argument (e.g. `model.summary(line_length=120)`) was the catch.
Model: "model_8"
Layer (type) Output Shape Param # Connected to
input_9 (InputLayer) [(None, 32, 32, 32, 1)] 0
conv3d_136 (Conv3D) (None, 32, 32, 32, 16) 448 input_9[0][0]
conv3d_137 (Conv3D) (None, 32, 32, 32, 16) 6928 conv3d_136[0][0]
conv3d_135 (Conv3D) (None, 32, 32, 32, 16) 32 input_9[0][0]
add_57 (Add) (None, 32, 32, 32, 16) 0 conv3d_137[0][0]
activation_113 (Activation) (None, 32, 32, 32, 16) 0 add_57[0][0]
conv3d_139 (Conv3D) (None, 32, 32, 32, 32) 13856 activation_113[0][0]
conv3d_140 (Conv3D) (None, 32, 32, 32, 32) 27680 conv3d_139[0][0]
conv3d_138 (Conv3D) (None, 32, 32, 32, 32) 544 activation_113[0][0]
add_58 (Add) (None, 32, 32, 32, 32) 0 conv3d_140[0][0]
activation_114 (Activation) (None, 32, 32, 32, 32) 0 add_58[0][0]
Total params: 49,488
Trainable params: 49,488
Non-trainable params: 0

Keras Pretrained ResNet 101 V2: How to get filter size used?

I am using keras' pretrained resnet 101 v2 CNN model. I wanted to know what the size of the filter was. I tried checking my model's summary but it doesn't really tell me the size directly. is it a 2x2x2 matrix or a 3x3x3 or something else?
The snippet of the model summary is:
Layer (type) Output Shape Param # Connected to
input_3 (InputLayer) [(None, 255, 255, 3) 0
conv1_pad (ZeroPadding2D) (None, 261, 261, 3) 0 input_3[0][0]
conv1_conv (Conv2D) (None, 128, 128, 64) 9472 conv1_pad[0][0]
pool1_pad (ZeroPadding2D) (None, 130, 130, 64) 0 conv1_conv[0][0]
pool1_pool (MaxPooling2D) (None, 64, 64, 64) 0 pool1_pad[0][0]
conv2_block1_preact_bn (BatchNo (None, 64, 64, 64) 256 pool1_pool[0][0]
conv2_block1_preact_relu (Activ (None, 64, 64, 64) 0 conv2_block1_preact_bn[0][0]
conv2_block1_1_conv (Conv2D) (None, 64, 64, 64) 4096 conv2_block1_preact_relu[0][0]
conv2_block1_1_bn (BatchNormali (None, 64, 64, 64) 256 conv2_block1_1_conv[0][0]
conv2_block1_1_relu (Activation (None, 64, 64, 64) 0 conv2_block1_1_bn[0][0]
conv2_block1_2_pad (ZeroPadding (None, 66, 66, 64) 0 conv2_block1_1_relu[0][0]
conv2_block1_2_conv (Conv2D) (None, 64, 64, 64) 36864 conv2_block1_2_pad[0][0]
conv2_block1_2_bn (BatchNormali (None, 64, 64, 64) 256 conv2_block1_2_conv[0][0]
conv2_block1_2_relu (Activation (None, 64, 64, 64) 0 conv2_block1_2_bn[0][0]
conv2_block1_0_conv (Conv2D) (None, 64, 64, 256) 16640 conv2_block1_preact_relu[0][0]
conv2_block1_3_conv (Conv2D) (None, 64, 64, 256) 16640 conv2_block1_2_relu[0][0]
I am not sure if there is a predefined method to get this. It should be possible to get the filter shape and filter count for a particular layer from its weights, e.g. `print(model.layers[2].get_weights()[0].shape)`.
This gives output,
(7, 7, 3, 64)
Printing print(model.layers[2].weights) gives something like,
[<tf.Variable 'conv1_conv/kernel:0' shape=(7, 7, 3, 64) dtype=float32, numpy=
array([[[[ 2.04881709e-02, 1.74432080e-02, -1.19661177e-02, ...,
To get details for all the layers,
# Walk every layer and report the shape of its first weight tensor.
for layer in model.layers:
    # Layers without parameters have an empty weights list and are skipped.
    if layer.weights:
        print('-' * 30)
        # For a Conv2D layer the first weight is the kernel, e.g. (7, 7, 3, 64).
        print(layer.weights[0].shape)
Partial output,
(7, 7, 3, 64)
(1, 1, 64, 64)
(3, 3, 64, 64)

Keras multiple output expected shape and got shape

I am trying to train a model that predicts a 128-d vector used to recognize faces. The input of the model is an image and the output is a 128-d vector (regression), which I get from the "face_recognition" library.
When I put 128 output to train I got this error:
ValueError: Error when checking target: expected dense_24 to have shape (1,) but got array with shape (128,)
But when I try only one output, fit function works.
The strange part is that the prediction shape is (1, 128), yet I can't give a 128-d target to train on.
Here is my model:
from keras.applications.vgg16 import VGG16
from keras.layers import Flatten, Dense
import keras
def build_facereg_disc():
    """Build a VGG16-based network that maps a 64x64x3 image to a 128-d vector.

    Returns:
        A Keras ``Model`` taking the VGG16 inputs and producing the output of
        the final 128-unit Dense layer.
    """
    # Load VGG16 without its classifier head (include_top=False).
    base = VGG16(include_top=False, input_shape=(64, 64, 3))
    # `base.outputs` is a list of tensors; Flatten needs the single tensor in
    # it, not the list itself.
    flat1 = Flatten()(base.outputs[0])
    class1 = Dense(2048, activation='relu')(flat1)
    # NOTE(review): ReLU keeps every output component non-negative. If the
    # target vectors can contain negative values, a linear activation is needed.
    output = Dense(128, activation='relu')(class1)
    # `models` is not imported as a bare name here, so reach it through the
    # imported `keras` package.
    return keras.models.Model(inputs=base.inputs, outputs=output)
# Build the network and configure it for training with the Adam optimizer.
facereg_disc = build_facereg_disc()
# NOTE(review): this compile(...) call is cut off after the optimizer argument;
# the loss, the metrics and the closing parenthesis are missing from this snippet.
facereg_disc.compile(optimizer=keras.optimizers.Adam(), # Optimizer
# Loss function to minimize
# List of metrics to monitor
And summary:
Model: "model_27"
Layer (type) Output Shape Param #
input_20 (InputLayer) (None, 64, 64, 3) 0
block1_conv1 (Conv2D) (None, 64, 64, 64) 1792
block1_conv2 (Conv2D) (None, 64, 64, 64) 36928
block1_pool (MaxPooling2D) (None, 32, 32, 64) 0
block2_conv1 (Conv2D) (None, 32, 32, 128) 73856
block2_conv2 (Conv2D) (None, 32, 32, 128) 147584
block2_pool (MaxPooling2D) (None, 16, 16, 128) 0
block3_conv1 (Conv2D) (None, 16, 16, 256) 295168
block3_conv2 (Conv2D) (None, 16, 16, 256) 590080
block3_conv3 (Conv2D) (None, 16, 16, 256) 590080
block3_pool (MaxPooling2D) (None, 8, 8, 256) 0
block4_conv1 (Conv2D) (None, 8, 8, 512) 1180160
block4_conv2 (Conv2D) (None, 8, 8, 512) 2359808
block4_conv3 (Conv2D) (None, 8, 8, 512) 2359808
block4_pool (MaxPooling2D) (None, 4, 4, 512) 0
block5_conv1 (Conv2D) (None, 4, 4, 512) 2359808
block5_conv2 (Conv2D) (None, 4, 4, 512) 2359808
block5_conv3 (Conv2D) (None, 4, 4, 512) 2359808
block5_pool (MaxPooling2D) (None, 2, 2, 512) 0
flatten_10 (Flatten) (None, 2048) 0
dense_23 (Dense) (None, 2048) 4196352
dense_24 (Dense) (None, 128) 262272
Total params: 19,173,312
Trainable params: 19,173,312
Non-trainable params: 0
Here is preprocessing:
# Directory the image files are read from.
dir_data = "data_faces/img_align_celeba/"
# Number of files used for the training and the testing split.
Ntrain = 2000
Ntest = 100
# Sorted file names, so the train/test split below is reproducible.
nm_imgs = np.sort(os.listdir(dir_data))
## names of the files for the training set (first Ntrain entries)
nm_imgs_train = nm_imgs[:Ntrain]
## names of the files for the testing data (next Ntest entries)
nm_imgs_test = nm_imgs[Ntrain:Ntrain + Ntest]
# Shape each image is expected to have: (height, width, channels).
img_shape = (64, 64, 3)
def get_npdata(nm_imgs_train):
    """Load the named images from ``dir_data`` into a single array.

    Args:
        nm_imgs_train: Iterable of image file names located in ``dir_data``.

    Returns:
        A NumPy array stacking one scaled image per file name, in input order.
    """
    X_train = []
    for myid in nm_imgs_train:
        # Resize on load to the spatial size given by the module-level img_shape.
        image = load_img(os.path.join(dir_data, myid),
                         target_size=img_shape[:2])
        # Scale pixel values by 1/255.
        image = img_to_array(image)/255.0
        X_train.append(image)
    X_train = np.array(X_train)
    return X_train
# Load the training images into a single array.
X_train = get_npdata(nm_imgs_train)
# NOTE(review): the two lines below look like a record of the observed array
# shapes rather than intended assignments. As statements they would assign to
# `.shape`; confirm, and turn them into comments or assertions.
X_train.shape = (2000, 64, 64, 3)
y_train.shape = (2000, 128)
I use batch size like:
# Draw `half_batch` random sample indices in [0, number of training samples).
# `half_batch` is defined outside this snippet.
idx = np.random.randint(0, X_train.shape[0], half_batch)
# Select the matching images and target vectors.
imgs = X_train[idx]
labels = y_train[idx]
# Run a single gradient update on this batch and keep the returned loss.
reg_d_loss_real = facereg_disc.train_on_batch(imgs, labels)
Your issue comes from your loss function. As explained in the doc, SparseCategoricalCrossentropy expects each sample in y_true to be an integer encoding the class, whereas CategoricalCrossentropy expects a one-hot encoded representation (which is your case).
So, switch to CategoricalCrossentropy and you should be fine.
However, to reproduce, I had to change:
flat1 = Flatten()(model.outputs)
to:
flat1 = Flatten()(model.outputs[0])

Training a unet model , but model is not learning

I am trying to train a segmentation model, but the loss saturates at 0.3370. I am really not sure what to do; can someone please help?
This is the model
# Build and compile a 3D U-Net (channels-first) with four pooling levels,
# a 512-filter bottleneck and four transposed-convolution upsampling levels.
# NOTE(review): the signature below is cut off after `initial_learning_rate`;
# `loss_function` and `kinit` used in the body are not defined in this snippet.
def unet(input_shape=(128, 128, 128), optimizer=Adam, initial_learning_rate=5e-4,
inputs = Input(shape=input_shape)
# Encoder: two-convolution blocks, each followed by 2x2x2 max pooling.
conv1 = UnetConv3D(inputs, 32, is_batchnorm=False, name='conv1')
pool1 = MaxPooling3D(pool_size=(2, 2,2 ))(conv1)
conv2 = UnetConv3D(pool1, 64, is_batchnorm=False, name='conv2')
pool2 = MaxPooling3D(pool_size=(2, 2,2 ))(conv2)
conv3 = UnetConv3D(pool2, 128, is_batchnorm=False, name='conv3')
pool3 = MaxPooling3D(pool_size=(2, 2,2 ))(conv3)
conv4 = UnetConv3D(pool3, 256, is_batchnorm=False, name='conv4')
pool4 = MaxPooling3D(pool_size=(2, 2,2 ))(conv4)
# Bottleneck: two 512-filter convolutions.
conv5 = Conv3D(512, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(pool4)
conv5 = Conv3D(512, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(conv5)
# Decoder: upsample, concatenate with the matching encoder output along the
# channel axis (axis=1 for channels_first), then convolve.
up6 = concatenate([Conv3DTranspose(256, (2, 2,2 ), strides=(2, 2,2 ), kernel_initializer=kinit, padding='same', data_format = 'channels_first')(conv5), conv4], axis=1)
# NOTE(review): conv6 and the up7 transposed convolution do not pass
# kernel_initializer=kinit, unlike the other layers.
conv6 = Conv3D(256, (3, 3, 3), activation='relu', padding='same', data_format = 'channels_first')(up6)
conv6 = Conv3D(256, (3, 3, 3), activation='relu', padding='same', data_format = 'channels_first')(conv6)
up7 = concatenate([Conv3DTranspose(128, (2, 2,2 ), strides=(2, 2,2 ), padding='same', data_format = 'channels_first')(conv6), conv3], axis=1)
conv7 = Conv3D(128, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(up7)
conv7 = Conv3D(128, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(conv7)
up8 = concatenate([Conv3DTranspose(64, (2, 2,2 ), strides=(2,2,2 ), kernel_initializer=kinit, padding='same', data_format = 'channels_first')(conv7), conv2], axis=1)
# NOTE(review): this level has a single convolution; the other decoder levels have two.
conv8 = Conv3D(64, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(up8)
up9 = concatenate([Conv3DTranspose(32, (2, 2,2 ), strides=(2, 2,2 ), kernel_initializer=kinit, padding='same', data_format = 'channels_first')(conv8), conv1], axis=1)
conv9 = Conv3D(32, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(up9)
conv9 = Conv3D(32, (3, 3, 3), activation='relu', kernel_initializer=kinit, padding='same', data_format = 'channels_first')(conv9)
# Output head: 1x1x1 convolution to 3 channels, then a sigmoid.
# NOTE(review): the ReLU here feeds the sigmoid only non-negative values, so
# the final output can never drop below 0.5. A linear activation on this layer
# is probably intended; this may explain the loss plateau.
conv10 = Conv3D(3, (1, 1, 1), activation='relu', kernel_initializer=kinit,padding = 'same', name='final', data_format = 'channels_first')(conv9)
activation_name = 'sigmoid'
activation_block = Activation(activation_name)(conv10)
model = Model(inputs=[inputs], outputs=[activation_block])
# NOTE(review): optimizer() is called with no arguments, so
# `initial_learning_rate` is not used anywhere in the visible code.
model.compile(optimizer=optimizer(), loss=loss_function)
return model
This is the helper function
def UnetConv3D(input, outdim, is_batchnorm, name):
    """Apply two chained channels-first Conv3D -> (BatchNormalization) -> ReLU stages.

    Args:
        input: Tensor fed to the first convolution.
        outdim: Number of filters used by both convolutions.
        is_batchnorm: When true, insert BatchNormalization after each convolution.
        name: Prefix for layer names; layers are named ``<name>_1``,
            ``<name>_1_bn``, ``<name>_1_act`` and likewise with ``_2``.

    Returns:
        The output tensor of the second ReLU activation.
    """
    x = input
    for stage in ('1', '2'):
        prefix = name + '_' + stage
        x = Conv3D(outdim, (3, 3, 3), strides=(1, 1, 1), kernel_initializer=kinit, padding="same", name=prefix, data_format = 'channels_first')(x)
        if is_batchnorm:
            # With channels_first the feature axis is 1. The default axis=-1
            # would normalise over the last spatial axis instead.
            x = BatchNormalization(axis=1, name=prefix + '_bn')(x)
        x = Activation('relu', name=prefix + '_act')(x)
    return x
And this is the loss function --
def weighted_dice_coefficient(y_true, y_pred, axis=(-3, -2, -1), smooth=0.00001):
    """
    Weighted dice coefficient. Default axis assumes a "channels first" data structure

    The coefficient is computed separately over ``axis`` for every remaining
    index and the results are averaged.

    :param smooth: small constant added to the numerator (as ``smooth/2``
        before doubling) and to the denominator, so empty inputs do not
        divide by zero
    :param y_true: ground-truth tensor
    :param y_pred: predicted tensor, same shape as ``y_true``
    :param axis: axes summed over when computing overlap and totals
    :return: mean of the per-index dice coefficients
    """
    overlap = K.sum(y_true * y_pred, axis=axis)
    totals = K.sum(y_true, axis=axis) + K.sum(y_pred, axis=axis)
    return K.mean(2. * (overlap + smooth/2)/(totals + smooth))
My input is (128,128,128). Am I making an obvious mistake? Please let me know if more info is needed.
Model summary
Layer (type) Output Shape Param # Connected to
input_1 (InputLayer) (None, 1, 128, 128, 0
conv1_1 (Conv3D) (None, 32, 128, 128, 896 input_1[0][0]
conv1_1_act (Activation) (None, 32, 128, 128, 0 conv1_1[0][0]
conv1_2 (Conv3D) (None, 32, 128, 128, 27680 conv1_1_act[0][0]
conv1_2_act (Activation) (None, 32, 128, 128, 0 conv1_2[0][0]
max_pooling3d_1 (MaxPooling3D) (None, 32, 64, 64, 6 0 conv1_2_act[0][0]
conv2_1 (Conv3D) (None, 64, 64, 64, 6 55360 max_pooling3d_1[0][0]
conv2_1_act (Activation) (None, 64, 64, 64, 6 0 conv2_1[0][0]
conv2_2 (Conv3D) (None, 64, 64, 64, 6 110656 conv2_1_act[0][0]
conv2_2_act (Activation) (None, 64, 64, 64, 6 0 conv2_2[0][0]
max_pooling3d_2 (MaxPooling3D) (None, 64, 32, 32, 3 0 conv2_2_act[0][0]
conv3_1 (Conv3D) (None, 128, 32, 32, 221312 max_pooling3d_2[0][0]
conv3_1_act (Activation) (None, 128, 32, 32, 0 conv3_1[0][0]
conv3_2 (Conv3D) (None, 128, 32, 32, 442496 conv3_1_act[0][0]
conv3_2_act (Activation) (None, 128, 32, 32, 0 conv3_2[0][0]
max_pooling3d_3 (MaxPooling3D) (None, 128, 16, 16, 0 conv3_2_act[0][0]
conv4_1 (Conv3D) (None, 256, 16, 16, 884992 max_pooling3d_3[0][0]
conv4_1_act (Activation) (None, 256, 16, 16, 0 conv4_1[0][0]
conv4_2 (Conv3D) (None, 256, 16, 16, 1769728 conv4_1_act[0][0]
conv4_2_act (Activation) (None, 256, 16, 16, 0 conv4_2[0][0]
max_pooling3d_4 (MaxPooling3D) (None, 256, 8, 8, 8) 0 conv4_2_act[0][0]
conv3d_1 (Conv3D) (None, 512, 8, 8, 8) 3539456 max_pooling3d_4[0][0]
conv3d_2 (Conv3D) (None, 512, 8, 8, 8) 7078400 conv3d_1[0][0]
conv3d_transpose_1 (Conv3DTrans (None, 256, 16, 16, 1048832 conv3d_2[0][0]
concatenate_1 (Concatenate) (None, 512, 16, 16, 0 conv3d_transpose_1[0][0]
conv3d_3 (Conv3D) (None, 256, 16, 16, 3539200 concatenate_1[0][0]
conv3d_4 (Conv3D) (None, 256, 16, 16, 1769728 conv3d_3[0][0]
conv3d_transpose_2 (Conv3DTrans (None, 128, 32, 32, 262272 conv3d_4[0][0]
concatenate_2 (Concatenate) (None, 256, 32, 32, 0 conv3d_transpose_2[0][0]
conv3d_5 (Conv3D) (None, 128, 32, 32, 884864 concatenate_2[0][0]
conv3d_6 (Conv3D) (None, 128, 32, 32, 442496 conv3d_5[0][0]
conv3d_transpose_3 (Conv3DTrans (None, 64, 64, 64, 6 65600 conv3d_6[0][0]
concatenate_3 (Concatenate) (None, 128, 64, 64, 0 conv3d_transpose_3[0][0]
conv3d_7 (Conv3D) (None, 64, 64, 64, 6 221248 concatenate_3[0][0]
conv3d_transpose_4 (Conv3DTrans (None, 32, 128, 128, 16416 conv3d_7[0][0]
concatenate_4 (Concatenate) (None, 64, 128, 128, 0 conv3d_transpose_4[0][0]
conv3d_8 (Conv3D) (None, 32, 128, 128, 55328 concatenate_4[0][0]
conv3d_9 (Conv3D) (None, 32, 128, 128, 27680 conv3d_8[0][0]
final (Conv3D) (None, 3, 128, 128, 99 conv3d_9[0][0]
activation_1 (Activation) (None, 3, 128, 128, 0 final[0][0]
Thanks in advance

building a u-net model for multi-class semantic segmenation

I'm trying to build u-net in keras for multi-class semantic segmentation. The model I have below does not learn anything. It always just predicts the background (first) class.
Is my use of the final 'softmax' layer correct? The documentation shows a axis parameter, but I'm not sure how to set that or what it should be.
def unet(input_shape=(572, 572, 1), classes=2):
    """Assemble a U-Net from ``blocks.contracting`` / ``blocks.expanding``.

    Args:
        input_shape: Shape of one input image, without the batch dimension.
        classes: Number of filters of the final 1x1 softmax convolution.

    Returns:
        A Keras model mapping the input image to the ``class_output`` layer.
    """
    input_image = KL.Input(shape=input_shape)

    # Contracting path (block1..block4): keep the pre-pool tensor of every
    # level for the skip connections and continue with the pooled tensor.
    skips = []
    current = input_image
    for level, width in enumerate((64, 128, 256, 512), start=1):
        skip, current = blocks.contracting(current, filters=width, block_name="block{}".format(level))
        skips.append(skip)

    # Bottom of the U (block5); its pooled output is not used.
    bottom, _ = blocks.contracting(current, filters=1024, block_name="block5")
    current = KL.Dropout(rate=0.5)(bottom)

    # Expanding path (block6..block9): merge with the skips, deepest first.
    decoder_widths = (512, 256, 128, 64)
    for level, (width, skip) in enumerate(zip(decoder_widths, reversed(skips)), start=6):
        current = blocks.expanding(current, merge_layer=skip, filters=width, block_name="block{}".format(level))

    class_output = KL.Conv2D(classes, kernel_size=(1, 1), activation='softmax', name='class_output')(current)
    return KM.Model(inputs=[input_image], outputs=[class_output])
def contracting(input_layer, filters, kernel_size=(3, 3), padding='same',
                block_name=""):
    """Build one contracting level: two ReLU convolutions, max pooling, batch norm.

    Args:
        input_layer: Tensor fed to the first convolution.
        filters: Number of filters of both convolutions.
        kernel_size: Kernel size of both convolutions.
        padding: Padding mode passed to the convolutions and the pooling layer.
        block_name: Prefix for layer names; layers are named
            ``<block_name>_contracting_conv_a``, ``..._conv_b`` and ``..._pool``.

    Returns:
        A tuple ``(conv_b, batch_normalization)``: the pre-pool feature map
        (used for the skip connection) and the pooled, normalised tensor that
        feeds the next level.
    """
    # NOTE(review): the `name=` arguments, the call inputs and the
    # `block_name` default were missing from this snippet. They are restored
    # to follow the '<block>_<path>_<layer>' naming that `expanding` uses;
    # confirm against the original.
    conv_a = KL.Conv2D(filters, kernel_size, activation='relu', padding=padding,
                       name='{}_contracting_conv_a'.format(block_name))(input_layer)
    conv_b = KL.Conv2D(filters, kernel_size, activation='relu', padding=padding,
                       name='{}_contracting_conv_b'.format(block_name))(conv_a)
    pool = KL.MaxPooling2D(pool_size=(2, 2), padding=padding,
                           name='{}_contracting_pool'.format(block_name))(conv_b)
    batch_normalization = KL.BatchNormalization()(pool)
    return conv_b, batch_normalization
def expanding(input_layer, merge_layer, filters, kernel_size=(3, 3), padding='same',
              block_name=""):
    """Build one expanding level: upsample, merge with a skip tensor, convolve.

    Args:
        input_layer: Tensor coming from the level below.
        merge_layer: Skip tensor from the contracting path; it is centre-cropped
            to the upsampled size before concatenation.
        filters: Number of filters of every convolution in this level.
        kernel_size: Kernel size of the two convolutions after the merge.
        padding: Padding mode of the two convolutions after the merge.
        block_name: Prefix for layer names; layers are named
            ``<block_name>_expanding_conv_up``, ``..._conv_a`` and ``..._conv_b``.

    Returns:
        The batch-normalised output of the second post-merge convolution.
    """
    # NOTE(review): the `block_name` default and the tails of the conv_a /
    # conv_b calls were missing from this snippet. They are restored to match
    # the naming used for `conv_up`; confirm against the original.
    input_layer = KL.UpSampling2D(size=(2, 2))(input_layer)
    conv_up = KL.Conv2D(filters, kernel_size=(2, 2), activation='relu',
                        padding='same', name='{}_expanding_conv_up'.format(block_name))(input_layer)

    # Centre-crop the skip tensor so its height/width match the upsampled
    # tensor; any odd remainder goes to the bottom/right side.
    conv_up_height, conv_up_width = int(conv_up.shape[1]), int(conv_up.shape[2])
    merge_height, merge_width = int(merge_layer.shape[1]), int(merge_layer.shape[2])
    crop_top = (merge_height - conv_up_height) // 2
    crop_bottom = (merge_height - conv_up_height) - crop_top
    crop_left = (merge_width - conv_up_width) // 2
    crop_right = (merge_width - conv_up_width) - crop_left
    cropping = ((crop_top, crop_bottom), (crop_left, crop_right))
    merge_layer = KL.Cropping2D(cropping)(merge_layer)

    merged = KL.concatenate([merge_layer, conv_up])
    conv_a = KL.Conv2D(filters, kernel_size, activation='relu', padding=padding,
                       name='{}_expanding_conv_a'.format(block_name))(merged)
    conv_b = KL.Conv2D(filters, kernel_size, activation='relu', padding=padding,
                       name='{}_expanding_conv_b'.format(block_name))(conv_a)
    batch_normalization = KL.BatchNormalization()(conv_b)
    return batch_normalization
# SGD with momentum 0.9 and a learning rate of 1e-4.
optimizer = keras.optimizers.SGD(lr=0.0001, momentum=0.9)
# Categorical cross-entropy loss with categorical accuracy as the metric.
loss = keras.losses.categorical_crossentropy
metrics = [keras.metrics.categorical_accuracy]
# Arguments are passed positionally in the order optimizer, loss, metrics.
# `model` is created outside this snippet.
model.compile(optimizer, loss, metrics)
Model Summary:
Layer (type) Output Shape Param # Connected to
input_2 (InputLayer) (None, 96, 96, 3) 0
block1_contracting_conv_a (Conv (None, 96, 96, 64) 1792 input_2[0][0]
block1_contracting_conv_b (Conv (None, 96, 96, 64) 36928 block1_contracting_conv_a[0][0]
block1_contracting_pool (MaxPoo (None, 48, 48, 64) 0 block1_contracting_conv_b[0][0]
batch_normalization_10 (BatchNo (None, 48, 48, 64) 256 block1_contracting_pool[0][0]
block2_contracting_conv_a (Conv (None, 48, 48, 128) 73856 batch_normalization_10[0][0]
block2_contracting_conv_b (Conv (None, 48, 48, 128) 147584 block2_contracting_conv_a[0][0]
block2_contracting_pool (MaxPoo (None, 24, 24, 128) 0 block2_contracting_conv_b[0][0]
batch_normalization_11 (BatchNo (None, 24, 24, 128) 512 block2_contracting_pool[0][0]
block3_contracting_conv_a (Conv (None, 24, 24, 256) 295168 batch_normalization_11[0][0]
block3_contracting_conv_b (Conv (None, 24, 24, 256) 590080 block3_contracting_conv_a[0][0]
block3_contracting_pool (MaxPoo (None, 12, 12, 256) 0 block3_contracting_conv_b[0][0]
batch_normalization_12 (BatchNo (None, 12, 12, 256) 1024 block3_contracting_pool[0][0]
block4_contracting_conv_a (Conv (None, 12, 12, 512) 1180160 batch_normalization_12[0][0]
block4_contracting_conv_b (Conv (None, 12, 12, 512) 2359808 block4_contracting_conv_a[0][0]
block4_contracting_pool (MaxPoo (None, 6, 6, 512) 0 block4_contracting_conv_b[0][0]
batch_normalization_13 (BatchNo (None, 6, 6, 512) 2048 block4_contracting_pool[0][0]
block5_contracting_conv_a (Conv (None, 6, 6, 1024) 4719616 batch_normalization_13[0][0]
block5_contracting_conv_b (Conv (None, 6, 6, 1024) 9438208 block5_contracting_conv_a[0][0]
dropout_2 (Dropout) (None, 6, 6, 1024) 0 block5_contracting_conv_b[0][0]
up_sampling2d_5 (UpSampling2D) (None, 12, 12, 1024) 0 dropout_2[0][0]
cropping2d_5 (Cropping2D) (None, 12, 12, 512) 0 block4_contracting_conv_b[0][0]
block6_expanding_conv_up (Conv2 (None, 12, 12, 512) 2097664 up_sampling2d_5[0][0]
concatenate_5 (Concatenate) (None, 12, 12, 1024) 0 cropping2d_5[0][0]
block6_expanding_conv_a (Conv2D (None, 12, 12, 512) 4719104 concatenate_5[0][0]
block6_expanding_conv_b (Conv2D (None, 12, 12, 512) 2359808 block6_expanding_conv_a[0][0]
batch_normalization_15 (BatchNo (None, 12, 12, 512) 2048 block6_expanding_conv_b[0][0]
up_sampling2d_6 (UpSampling2D) (None, 24, 24, 512) 0 batch_normalization_15[0][0]
cropping2d_6 (Cropping2D) (None, 24, 24, 256) 0 block3_contracting_conv_b[0][0]
block7_expanding_conv_up (Conv2 (None, 24, 24, 256) 524544 up_sampling2d_6[0][0]
concatenate_6 (Concatenate) (None, 24, 24, 512) 0 cropping2d_6[0][0]
block7_expanding_conv_a (Conv2D (None, 24, 24, 256) 1179904 concatenate_6[0][0]
block7_expanding_conv_b (Conv2D (None, 24, 24, 256) 590080 block7_expanding_conv_a[0][0]
batch_normalization_16 (BatchNo (None, 24, 24, 256) 1024 block7_expanding_conv_b[0][0]
up_sampling2d_7 (UpSampling2D) (None, 48, 48, 256) 0 batch_normalization_16[0][0]
cropping2d_7 (Cropping2D) (None, 48, 48, 128) 0 block2_contracting_conv_b[0][0]
block8_expanding_conv_up (Conv2 (None, 48, 48, 128) 131200 up_sampling2d_7[0][0]
concatenate_7 (Concatenate) (None, 48, 48, 256) 0 cropping2d_7[0][0]
block8_expanding_conv_a (Conv2D (None, 48, 48, 128) 295040 concatenate_7[0][0]
block8_expanding_conv_b (Conv2D (None, 48, 48, 128) 147584 block8_expanding_conv_a[0][0]
batch_normalization_17 (BatchNo (None, 48, 48, 128) 512 block8_expanding_conv_b[0][0]
up_sampling2d_8 (UpSampling2D) (None, 96, 96, 128) 0 batch_normalization_17[0][0]
cropping2d_8 (Cropping2D) (None, 96, 96, 64) 0 block1_contracting_conv_b[0][0]
block9_expanding_conv_up (Conv2 (None, 96, 96, 64) 32832 up_sampling2d_8[0][0]
concatenate_8 (Concatenate) (None, 96, 96, 128) 0 cropping2d_8[0][0]
block9_expanding_conv_a (Conv2D (None, 96, 96, 64) 73792 concatenate_8[0][0]
block9_expanding_conv_b (Conv2D (None, 96, 96, 64) 36928 block9_expanding_conv_a[0][0]
batch_normalization_18 (BatchNo (None, 96, 96, 64) 256 block9_expanding_conv_b[0][0]
class_output (Conv2D) (None, 96, 96, 4) 260 batch_normalization_18[0][0]
Total params: 31,039,620
Trainable params: 31,035,780
Non-trainable params: 3,840
Total params: 31,031,940
Trainable params: 31,031,940
Non-trainable params: 0
class percentages in dataset:
{0: 0.6245757457188198,
1: 0.16082110268729075,
2: 0.1188858904157366,
3: 0.09571726117815291}
class 0 is the background
shape of image from generator (rgb): (1, 96, 96, 3)
shape of labels from generator: (1, 96, 96, 4)
There doesn't seem to be anything that wrong in your model.
Softmax is ok, as it defaults to the last axis, and you're clearly using 'channels_last' as config. So it's ok.
Suggestions are:
Add a few BatchNormalization() layers and decrease your learning rate (this prevents relu from going too fast to "all zeroes").
Check that your output data range is correct, with np.unique(y_train) containing only 0 and 1
Check that every pixel is classified with only one class: (np.sum(y_train, axis=-1) == 1).all() == True.
Check that your images aren't too biased towards the first class: np.sum(y_train[:,:,:,0]) should not be much bigger than np.sum(y_train[:,:,:,1:]).
If it is, consider fitting with the class_weight parameter, passing weights to balance the loss for each class (check keras documentation on fit for how to use it)
This model works just fine for me with most of the segmentation projects, i use crossentropy for multiclass segmentation and smooth dice for binary classes
def conv_block(tensor, nfilters, size=3, padding='same', initializer="he_normal"):
    """Apply two chained Conv2D -> BatchNormalization -> ReLU stages.

    Args:
        tensor: Input tensor.
        nfilters: Number of filters of both convolutions.
        size: Side length of the square kernel.
        padding: Padding mode of both convolutions.
        initializer: Kernel initializer of both convolutions.

    Returns:
        The output tensor of the second ReLU activation.
    """
    kernel = (size, size)
    out = tensor
    for _ in range(2):
        out = Conv2D(filters=nfilters, kernel_size=kernel, padding=padding, kernel_initializer=initializer)(out)
        out = BatchNormalization()(out)
        out = Activation("relu")(out)
    return out
def deconv_block(tensor, residual, nfilters, size=3, padding='same', strides=(2, 2)):
    """Upsample with a transposed convolution, merge a skip tensor, then convolve.

    Args:
        tensor: Input tensor from the level below.
        residual: Skip tensor concatenated along axis 3.
        nfilters: Number of filters of the transposed convolution and of the
            following ``conv_block``.
        size: Side length of the square transposed-convolution kernel.
        padding: Padding mode of the transposed convolution.
        strides: Strides of the transposed convolution.

    Returns:
        The output of ``conv_block`` applied to the merged tensor.
    """
    upsampled = Conv2DTranspose(nfilters, kernel_size=(size, size), strides=strides, padding=padding)(tensor)
    merged = concatenate([upsampled, residual], axis=3)
    return conv_block(merged, nfilters)
def Unet(img_height, img_width, nclasses=3, filters=64):
    """Build a four-level U-Net for 3-channel images with a softmax output.

    Args:
        img_height: Input image height.
        img_width: Input image width.
        nclasses: Number of channels of the final 1x1 convolution.
        filters: Filter count of the first level; deeper levels use
            2x, 4x, 8x and (at the bottom) 16x this value.

    Returns:
        A Keras model named ``'Unet'``.
    """
    input_layer = Input(shape=(img_height, img_width, 3), name='image_input')

    # down: keep each level's pre-pool tensor for the skip connections
    skips = []
    current = input_layer
    for scale in (1, 2, 4, 8):
        level_out = conv_block(current, nfilters=filters*scale)
        skips.append(level_out)
        current = MaxPooling2D(pool_size=(2, 2))(level_out)
    current = Dropout(0.5)(current)
    current = conv_block(current, nfilters=filters*16)
    current = Dropout(0.5)(current)

    # up: only the two deepest decoder levels are followed by dropout
    for scale, with_dropout in ((8, True), (4, True), (2, False), (1, False)):
        current = deconv_block(current, residual=skips.pop(), nfilters=filters*scale)
        if with_dropout:
            current = Dropout(0.5)(current)

    # output: 1x1 convolution, batch normalisation, softmax
    logits = Conv2D(filters=nclasses, kernel_size=(1, 1))(current)
    normalised = BatchNormalization()(logits)
    probabilities = Activation('softmax')(normalised)
    return Model(inputs=input_layer, outputs=probabilities, name='Unet')
Sometimes the problem is related to the model architecture. When you are dealing with a complicated segmentation dataset, you may need to enhance the architecture. I encountered the same problem with a new dataset, although the model worked well on another one. So I used Res-Unet instead of Unet as the model architecture, and the problem was solved.
hope this will help
