Jointly optimizing autoencoder and fully connected network for classification - keras

I have a large set of unlabeled data and a smaller set of labeled data. Thus, I would like to first train a variational autoencoder on the unlabeled data and then use the encoder for classification of three classes (with a fully connected layer attached) on the labeled data. For optimization of the hyperparameters I would like to use Optuna.
One possibility would be to first optimize the autoencoder and then optimize the fully connected network (classification) but then the autoencoder might learn an encoding which is meaningless for the classification.
Is there a possibility to jointly optimize the autoencoder and the fully connected network?
My autoencoder looks as follows (params is just a dictionary holding the params):
inputs = Input(shape=image_size, name='encoder_input')
x = inputs
for i in range(len(params["conv_filter_encoder"])):
x, _ = convolutional_unit(x, params["conv_filter_encoder"][i], params["conv_kernel_size_encoder"][i], params["strides_encoder"][i],
batchnorm=params["batchnorm"][i], dropout=params["dropout"][i], maxpool=params["maxpool"][i], deconv=False)
shape = K.int_shape(x)
x = Flatten()(x)
x = Dense(params["inner_dim"], activation='relu')(x)
z_mean = Dense(params["latent_dim"], name='z_mean')(x)
z_log_var = Dense(params["latent_dim"], name='z_log_var')(x)
# use reparameterization trick to push the sampling out as input
# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(params["latent_dim"],), name='z')([z_mean, z_log_var])
# instantiate encoder model
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
# build decoder model
latent_inputs = Input(shape=(params["latent_dim"],), name='z_sampling')
x = Dense(params["inner_dim"], activation='relu')(latent_inputs)
x = Dense(shape[1] * shape[2] * shape[3], activation='relu')(x)
x = Reshape((shape[1], shape[2], shape[3]))(x)
len_batchnorm = len(params["batchnorm"])
len_dropout = len(params["dropout"])
for i in range(len(params["conv_filter_decoder"])):
x, _ = convolutional_unit(x, params["conv_filter_decoder"][i], params["conv_kernel_size_decoder"][i], params["strides_decoder"][i],
batchnorm=params["batchnorm"][len_batchnorm-i-1], dropout=params["dropout"][len_dropout-i-1], maxpool=None, deconv=True, activity_regularizer=params["activity_regularizer"])
outputs = Conv2DTranspose(filters=1,
kernel_size=params["conv_kernel_size_decoder"][len(params["conv_kernel_size_decoder"])-1],
activation='sigmoid',
padding='same')(x)
# instantiate decoder model
decoder = Model(latent_inputs, outputs, name='decoder')
# instantiate VAE model
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae')
vae.higgins_beta = K.variable(value=params["beta"])
loss = config["loss"].value
def vae_loss(x, x_decoded_mean):
"""VAE loss function"""
# VAE loss = mse_loss or xent_loss + kl_loss
if loss == Loss.mse.value:
reconstruction_loss = mse(K.flatten(x), K.flatten(x_decoded_mean))
elif loss == Loss.bce.value:
reconstruction_loss = binary_crossentropy(K.flatten(x),
K.flatten(x_decoded_mean))
else:
raise ValueError("Loss unknown")
reconstruction_loss *= image_size[0] * image_size[1]
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
# kl_loss *= -0.5
kl_loss *= -vae.higgins_beta
vae_loss = K.mean(reconstruction_loss + kl_loss)
return vae_loss
batch_size = params["batch_size"]
optimizer = keras.optimizers.Adam(lr=params["learning_rate"], beta_1=0.9, beta_2=0.999,
epsilon=1e-08, decay=params["learning_rate_decay"])
vae.compile(loss=vae_loss, optimizer=optimizer)
vae.fit(train_X, train_X,
epochs=config.CONFIG["n_epochs"],
batch_size=batch_size,
verbose=0,
callbacks=get_callbacks(config.CONFIG, autoencoder_path, encoder, decoder, vae),
shuffle=shuffle,
validation_data=(valid_X, valid_X))
My fully connected network attached to the encoder looks as follows:
latent = vae.predict(images)[0]
inputs = Input(shape=(input_shape,), name='fc_input')
den = inputs
for i in range(len(self.params["units"])):
den = Dense(self.params["units"][i])(den)
den = Activation('relu')(den)
out = Dense(self.num_classes, activation='softmax')(den)
model = Model(inputs, out, name='fcnn')
optimizer = keras.optimizers.Adam(lr=self.mc.CONFIG["fcnn"]["learning_rate"], beta_1=0.9, beta_2=0.999,
epsilon=1e-08, decay=self.mc.CONFIG["fcnn"]["learning_rate_decay"])
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(latent, y,
epochs=self.params["n_epochs"],
batch_size=self.params["batch_size"],
verbose=0,
shuffle=True)
y_prob = model.predict(latent)

Related

Inferece model creation using model keras

Decoder model inference is giving error graph disconnected error during inference Decoder model inference is giving error graph disconnected error during inference Decoder model inference is giving error graph disconnected error during inference
# TRAINING WITH TEACHER FORCING
# Define an input sequence and process it.
encoder_inputs= Input(shape=(n_timesteps_in, n_features))
encoder_lstm = LSTM(LSTMoutputDimension, return_sequences=True, return_state=True, name='encoder_lstm')
LSTM_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
# We discard `LSTM_outputs` and only keep the other states.
encoder_states = [state_h, state_c]
decoder_inputs = Input(shape=(None, n_features), name='decoder_inputs')
attention= BahdanauAttention(LSTMoutputDimension, verbose = 1)
decoder_lstm = LSTM(LSTMoutputDimension, return_sequences=True, name='decoder_lstm')
decoder_lstm1 = LSTM(LSTMoutputDimension, return_sequences=True, return_state=True, name='decoder_lstm1')
# Set up the decoder, using `context vector` as initial state.
decoder_outputs= decoder_lstm(decoder_inputs,initial_state=encoder_states)
context_vector, weights = attention(decoder_outputs, LSTM_outputs)
#context_vector = tf.expand_dims(context_vector, 1)
decoder_outputs2 = tf.concat([context_vector, decoder_outputs], axis=-1)
decoder_outputs, s, h = decoder_lstm1(decoder_outputs2)
#complete the decoder model by adding a Dense layer with Softmax activation function
#for prediction of the next output
#Dense layer will output one-hot encoded representation as we did for input
#Therefore, we will use n_features number of neurons
decoder_dense = Dense(n_features, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)
Inference given under is not working
decoder_state_input_h = Input(shape=(LSTMoutputDimension,))
decoder_state_input_c = Input(shape=(LSTMoutputDimension,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
context_vector, weights = attention(decoder_outputs, LSTM_outputs)
decoder_outputs2 = tf.concat([context_vector, decoder_outputs], axis=-1)
decoder_outputs3, dh, dc = decoder_lstm1(decoder_outputs2)
deStates = [dh, dc]
decoder_model = Model( [decoder_inputs] + decoder_states_inputs, [decoder_outputs3] +deStates)

How to get logit matrix from a customized CNN?

This is my model
engine1 = tf.keras.applications.Xception(
# Freezing the weights of the top layer in the InceptionResNetV2 pre-traiined model
include_top = False,
# Use Imagenet weights
weights = 'imagenet',
# Define input shape to 224x224x3
input_shape = (256, 256 , 3)
)
x1 = tf.keras.layers.GlobalAveragePooling2D(name = 'avg_pool')(engine1.output)
x1 =tf.keras.layers.Dropout(0.75)(x1)
x1 = tf.keras.layers.BatchNormalization(
axis=-1,
momentum=0.99,
epsilon=0.01,
center=True,
scale=True,
beta_initializer="zeros",
gamma_initializer="ones",
moving_mean_initializer="zeros",
moving_variance_initializer="ones",
)(x1)
out1 = tf.keras.layers.Dense(3, activation = 'softmax', name = 'dense_output')(x1)
# Build the Keras model
model1 = tf.keras.models.Model(inputs = engine1.input, outputs = out1)
# Compile the model
model1.compile(
# Set optimizer to Adam(0.0001)
optimizer = tf.keras.optimizers.Adam(learning_rate= 3e-4),
#optimizer= SGD(lr=0.001, decay=1e-6, momentum=0.99, nesterov=True),
# Set loss to binary crossentropy
#loss = tf.keras.losses.SparseCategoricalCrossentropy(),
loss = 'categorical_crossentropy',
# Set metrics to accuracy
metrics = ['accuracy']
)
I want logits so I wrote this
logits = model1(X_test)
probs = tf.nn.softmax(logits)
Getting error as
ResourceExhaustedError: OOM when allocating tensor with shape[1288,64,125,125] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2D]
How to fix this and get the logits? I want to apply the distillation method after getting the logits. My test set consists of 3 classes and 60 samples.
so logit matrix should be a matrix of 60 * 3.
Edit
To get the logits(1288 * 3) I made a change in the output layer of my model
out1 = tf.keras.layers.Dense(3, activation = 'linear', name = 'dense_output')(x1)
Now I am getting logits,
y_pred_logits = model1.predict(X_test)
I want to apply softmax on this, My softmax function looks like this,
def softmax(x):
"""Compute softmax values for each sets of scores in x."""
e_x = np.exp(x)
return e_x / e_x.sum(axis=1)
But when I am doing this
y_pred_logits_activated = softmax(y_pred_logits)
Getting errors as
How to fix this and is this method correct? Further, I want to apply this on logits

TensorFlow 2.0 'build' function

I was reading about creating neural networks using TensorFlow 2.0 in conjunction with 'GradientTape' API and came across the following code:
model = tf.keras.Sequential((
tf.keras.layers.Reshape(target_shape=(28 * 28,), input_shape=(28, 28)),
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(10)))
model.build()
optimizer = tf.keras.optimizers.Adam()
In this code, what's the use/function of 'model.build()'? Is it compiling the designed neural network?
The rest of the code is:
compute_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
compute_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
def train_one_step(model, optimizer, x, y):
with tf.GradientTape() as tape:
logits = model(x)
loss = compute_loss(y, logits)
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
compute_accuracy(y, logits)
return loss
#tf.function
def train(model, optimizer):
train_ds = mnist_dataset()
step = 0
loss = 0.0
accuracy = 0.0
for x, y in train_ds:
step += 1
loss = train_one_step(model, optimizer, x, y)
if step % 10 == 0:
tf.print('Step', step, ': loss', loss, '; accuracy', compute_accuracy.result())
return step, loss, accuracy
step, loss, accuracy = train(model, optimizer)
print('Final step', step, ': loss', loss, '; accuracy', compute_accuracy.result())
They refer to this as the "delayed-build pattern", where you can actually create a model without defining what its input shape is.
For example
model = Sequential()
model.add(Dense(32))
model.add(Dense(32))
model.build((None, 500))
is equivalent to
model = Sequential()
model.add(Dense(32, input_shape=(500,)))
model.add(Dense(32))
In the second case you need to know the input shape before defining the model's architecture. model.build() allows you to actually define a model (i.e. its architecture) and actually build it (i.e. initialize parameters, etc.) later.
Example taken from here.

How to correctly train VGG16 Keras

I'm trying to retrain VGG16 to classify Lego images. However, my model has a low accuracy (between 20%). What am I doing wrong? Maybe the number of FC is wrong, or my ImageDataGenerator. I have approx. 2k images per class and a total of 6 classes.
How I create the model:
def vgg16Model(self,image_shape,num_classes):
model_VGG16 = VGG16(include_top = False, weights = None)
model_input = Input(shape = image_shape, name = 'input_layer')
output_VGG16_conv = model_VGG16(model_input)
#Init of FC layers
x = Flatten(name='flatten')(output_VGG16_conv)
x = Dense(256, activation = 'relu', name = 'fc1')(x)
output_layer = Dense(num_classes,activation='softmax',name='output_layer')(x)
vgg16 = Model(inputs = model_input, outputs = output_layer)
vgg16.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
vgg16.summary()
return vgg16
I'm creating ImageDataGenerator and training:
path = "real_Legos_images/trainable_classes"
evaluate_path = "real_Legos_images/evaluation"
NN = NeuralNetwork()
gen = ImageDataGenerator(rotation_range=40, width_shift_range=0.02, shear_range=0.02,height_shift_range=0.02, horizontal_flip=True, fill_mode='nearest')
train_generator = gen.flow_from_directory(os.path.abspath(os.path.join(path)),
target_size = (224,224), color_mode = "rgb", batch_size = 16, class_mode='categorical')
validation_generator = gen.flow_from_directory(os.path.abspath(os.path.join(evaluate_path)),
target_size = (224,224), color_mode = "rgb", batch_size = 16, class_mode='categorical')
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
num_classes = len(os.listdir(os.path.abspath(os.path.join(path))))
VGG16 = NN.vgg16Model((224, 224, 3), num_classes)
VGG16.save_weights('weights.h5')
VGG16.fit_generator(train_generator, validation_data = validation_generator, validation_steps = validation_generator.n//validation_generator.batch_size,
steps_per_epoch = STEP_SIZE_TRAIN, epochs = 50)
The VGG16 model with the parameter include_top = False will return 512 dimensions feature maps. Usually, we should add a GlobalAveragePooling2D or GlobalMaxPooling2D layer after it first, then flat it to an one-dimensional array. Otherwise, you will get an array which is too long to fit.
You have set the weight property to 'None' for VGG which means your networks is initialized with random weights. This means you are not using the pre-trained weights. So, I would suggest to try setting the weights to 'imagenet' such that you can use the VGG networks that its weights are pretrained on imagenet dataset:
model_VGG16 = VGG16(include_top=False, weights='imagenet')

Provide POS tags as input to RNN alongwith word embedding

I have to feed input to the RNN as word embedding + POS tags. But word embedding is generated by code only. So I cannot concat embedding and POS one hot vector. What is the best way to do this task?
def BidLstm(maxlen, N_TAGS, EMBEDDING_DIM, embedding_matrix):
inp = Input(shape=(maxlen,), dtype='int32')
x = Embedding(len(word2int) + 1,
EMBEDDING_DIM,
weights=[embedding_matrix],
input_length=maxlen,
trainable=False)(inp)
x = Bidirectional(LSTM(300, return_sequences=True, dropout=0.25,
recurrent_dropout=0.25))(x)
xa = Attention(maxlen)(x)
xd = Dense(256, activation="relu")(xa)
xdp = Dropout(0.25)(xd)
xd1 = Dense(5, activation="softmax")(xdp)
#x = TimeDistributed(Dense(N_TAGS + 1, activation='softmax'))(x)
model = Model(inputs=inp, outputs=xd1)
return model
model = BidLstm(max(sentence_length_list),5, EMBEDDING_DIM, embedding_matrix)
model.compile(loss='binary_crossentropy', optimizer='adam',
metrics=['accuracy'])
file_path = ".model.hdf5"
ckpt = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=1)
model.fit(X_train, Y_train_onehot, batch_size=32, epochs=15, validation_split=0.1, callbacks=[ckpt, early])

Resources