Keras documentation about fine-tuning states that it is important to "keep the BatchNormalization layers in inference mode by passing training=False when calling the base model.". (What is interesting, that every non-official example that I've found about the topic ignores this setting.)
Documentation follows up with example:
from tensorflow import keras
from keras.applications.xception import Xception
base_model = keras.applications.Xception(
weights='imagenet', # Load weights pre-trained on ImageNet.
input_shape=(150, 150, 3),
include_top=False) # Do not include the ImageNet classifier at the top.
base_model.trainable = False
inputs = keras.Input(shape=(150, 150, 3))
scale_layer = keras.layers.Rescaling(scale=1 / 127.5, offset=-1)
x = scale_layer(x)
# We make sure that the base_model is running in inference mode here,
# by passing `training=False`. This is important for fine-tuning, as you will
# learn in a few paragraphs.
x = base_model(x, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs , outputs)
The thing is that the example is adding preprocessing to the base model and my model(EfficientNetB3) has already preprocessing included and I don't know how to set my base_model with `training=False`` without prepending it with additional layer:
base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape=input_shape)
base_model.trainable=False
model = Sequential()
model.add(base_model) # How to set base_model training=False?
model.add(GlobalAveragePooling2D())
model.add(Dropout(0.2))
model.add(Dense(10, activation="softmax", name="classifier"))
How to prove that training=False or training=True has an effect:
#Frightera explained to me how to "lock" the model's state and I wanted to prove to myself that the lock happens by checking BatchNormalization non-trainable variables. My understating is that if I call model with training=True then it should update the variables. However, this is not the case, or am I missing something?
import tensorflow as tf
from tensorflow import keras
from keras.applications.efficientnet import EfficientNetB3
import numpy as np
class WrappedEffNet(keras.layers.Layer):
def __init__(self, **kwargs):
super(WrappedEffNet, self).__init__(**kwargs)
self.model = EfficientNetB3(weights='imagenet',
include_top=False,
input_shape=(224, 224, 3))
self.model.trainable=False
def call(self, x, training=False):
return self.model(x, training=training) # Modified to pass also True.
base_model_wrapped = WrappedEffNet()
random_vector = tf.random.uniform((1, 224, 224, 3))
o1 = base_model_wrapped(random_vector)
o2 = base_model_wrapped(random_vector, training = False)
# Getting all non-trainable variable values from all BatchNormalization layers.
array_a = np.array([])
for layer in base_model_wrapped.model.layers:
if hasattr(layer, 'moving_mean'):
v = layer.moving_mean.numpy()
np.concatenate([array_a, v])
v = layer.moving_variance.numpy()
np.concatenate([array_a, v])
o3 = base_model_wrapped(random_vector, training = True) # Changing to True, shouldn't this update BatchNormalization non-trainable variables?
array_b = np.array([])
for layer in base_model_wrapped.model.layers:
if hasattr(layer, 'moving_mean'):
v = layer.moving_mean.numpy()
np.concatenate([array_b, v])
v = layer.moving_variance.numpy()
np.concatenate([array_b, v])
print(np.allclose(array_a, array_b)) # Shouldn't this be False?
It is not possible to invoke the call method of the base model in Sequential model as in Functional. However, you can think the model as if it is a custom layer:
class WrappedEffNet(tf.keras.layers.Layer):
def __init__(self, **kwargs):
super(WrappedEffNet, self).__init__(**kwargs)
self.model = keras.applications.EfficientNetB3(weights='imagenet',
include_top=False,
input_shape=(224, 224, 3))
self.model.trainable=False
def call(self, x, training):
return self.model(x, training=False)
Sanity check:
base_model_wrapped = WrappedEffNet()
random_vector = tf.random.uniform((1, 224, 224, 3))
o1 = base_model_wrapped(random_vector)
o2 = base_model_wrapped(random_vector, training = False)
o3 = base_model_wrapped(random_vector, training = True)
np.allclose(o1, o2), np.allclose(o1, o3), np.allclose(o2, o3)
# (True, True, True)
It is inference mode regardless of the value of training.
Model summary is the same as Sequential:
Layer (type) Output Shape Param #
=================================================================
wrapped_eff_net (WrappedEff (1, 7, 7, 1536) 10783535
Net)
global_average_pooling2d (G (1, 1536) 0
lobalAveragePooling2D)
dropout (Dropout) (1, 1536) 0
classifier (Dense) (1, 10) 15370
=================================================================
Total params: 10,798,905
Trainable params: 15,370
Non-trainable params: 10,783,535
_________________________________________________________________
Edit: In order to see difference of BatchNormalization:
import tensorflow as tf
import numpy as np
x = np.random.randn(1, 2) * 20 + 0.1
bn = tf.keras.layers.BatchNormalization()
input_layer = tf.keras.layers.Input((x.shape[-1], ))
output = bn(input_layer )
model = tf.keras.Model(inputs=input_layer , outputs=output)
model.trainable = False:
model.trainable = False
for i in range(2):
print('Input:', x)
print('Moving mean:', model.layers[1].moving_mean.numpy())
print('training = True -->', model(x, training = True).numpy())
print('training = False -->', model(x, training = False).numpy())
print()
Input: [[ 2.50317905 12.44406219]]
Moving mean: [0. 0.]
training = True --> [[ 2.5019286 12.437845 ]]
training = False --> [[ 2.5019286 12.437845 ]]
Input: [[ 2.50317905 12.44406219]]
Moving mean: [0. 0.]
training = True --> [[ 2.5019286 12.437845 ]]
training = False --> [[ 2.5019286 12.437845 ]]
model.trainable = True, training = True:
model.trainable = True
for i in range(2):
print('Input:', x)
print('Moving mean:', model.layers[1].moving_mean.numpy())
print('training = True -->', model(x, training = True).numpy())
print()
Input: [[ 2.50317905 12.44406219]]
Moving mean: [0. 0.]
training = True --> [[0. 0.]]
Input: [[ 2.50317905 12.44406219]]
Moving mean: [0.02503179 0.12444062]
training = True --> [[0. 0.]]
model.trainable = True, training = False:
model.trainable = True
for i in range(2):
print('Input:', x)
print('Moving mean:', model.layers[1].moving_mean.numpy())
print('training = False -->', model(x, training = False).numpy())
print()
Input: [[ 2.50317905 12.44406219]]
Moving mean: [0.04981326 0.24763682]
training = False --> [[ 2.476884 12.313342]]
Input: [[ 2.50317905 12.44406219]]
Moving mean: [0.04981326 0.24763682]
training = False --> [[ 2.476884 12.313342]]
Related
I am trying to do categorical classification of data. The data consists of 3 text variables, and one real value. I split the data into three sets - training, validation and testing. I am using tensorflow and python. For the test data I get the following stats: test data statistics
A categorical accuracy of 0.9919, however when I perform a prediction on the same test data, and evaluate the accuracy with the sci-kit classification_report function, I get an accuracy of 0.60, as seen here: classification report.
df.info() looks like this ('category' being the Y value): info. I don't think that the sci-kit learn accuracy statistic is misrepresenting the prediction, since the confusion matrix looks like this.
I have re-built the model multiple times, and tried balancing classes with class weights just in case, however this still wouldn't explain the discrepancy between tensorflows accuracy, and the sci-kit one (gained by prediction).
The code:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
df = pd.read_csv("procData_nosub.csv")
df = df.sample(frac=1).reset_index(drop=True) # Shuffling
df = df[(df.category != 'Pictures')] # Removing small categories
df = df[(df.category != 'Software')]
df = df.drop("fileAmount", axis=1)
df = df.drop("more100Files", axis=1)
train, val, test = np.split(df.sample(frac=1), [int(0.8 * len(df)), int(0.9 * len(df))])
# Function to convert dataframe to dataset (from https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers)
def df_to_dataset(dataframe, shuffle=True, batch_size=64):
df = dataframe.copy()
labels = df.pop('category')
df = {key: value[:,tf.newaxis] for key, value in df.items()}
ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
if shuffle:
ds = ds.shuffle(buffer_size=len(df))
ds = ds.batch(batch_size)
ds = ds.prefetch(batch_size)
return ds
train_data = df_to_dataset(train)
test_data = df_to_dataset(test)
validation_data = df_to_dataset(val)
# Function to convert text (for the Y) to one-hot-encoding
catVals = np.unique(df['category'])
table = tf.lookup.StaticHashTable(
initializer = tf.lookup.KeyValueTensorInitializer(
keys = tf.constant(catVals),
values = list(range(len(catVals)))
),
default_value = -1,
name = "target_encoding"
)
#tf.function
def target(x):
return table.lookup(x)
def fetch(features, labels):
return features, tf.one_hot(target(labels), len(catVals))
# Applying the text (Y) -> one-hot-encoding
train_data_f = train_data.map(fetch)
test_data_f = test_data.map(fetch)
validation_data_f = validation_data.map(fetch)
# Using an encoder
embedding = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
hub_layer = hub.KerasLayer(embedding, output_shape = 512, input_shape = [], dtype=tf.string, trainable=True)
# Normalizing real values (from https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers)
def get_normalization_layer(name, dataset):
normalizer = tf.keras.layers.Normalization(axis=None)
feature_ds = dataset.map(lambda x, y: x[name])
normalizer.adapt(feature_ds)
return normalizer
all_inputs = []
encoded_features = []
# Adding filesize to features
for header in ['fileSize']: # fileAmount
numeric_col = tf.keras.Input(shape=(1,), name=header)
normalization_layer = get_normalization_layer(header, train_data_f)
encoded_numeric_col = normalization_layer(numeric_col)
all_inputs.append(numeric_col)
encoded_features.append(encoded_numeric_col)
# Adding title, description, files to features
for header in ['title', 'description', 'files']:
text_col = tf.keras.Input(shape=(), name=header, dtype='string')
encoded_text_col = hub_layer(text_col)
all_inputs.append(text_col)
encoded_features.append(encoded_text_col)
# Describing the model
all_features = tf.keras.layers.concatenate(encoded_features)
x = all_features
for i in range(3):
x = tf.keras.layers.Dense(16, activation='relu')(x)
# x = tf.keras.layers.Dropout(0.2)(x)
output = tf.keras.layers.Dense(len(catVals), activation='softmax')(x)
model = tf.keras.Model(all_inputs, output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
loss=tf.keras.losses.CategoricalCrossentropy(), # deleted from_logits=True
metrics=["categorical_accuracy",
tf.keras.metrics.Precision(),
tf.keras.metrics.Recall(),
tfa.metrics.F1Score(num_classes=len(catVals),
average='macro',
threshold=0.5)])
history = model.fit(train_data_f, epochs=5, validation_data=validation_data_f) # removed class weights
model.evaluate(test_data_f)
model.evaluate(validation_data_f)
# Taking x and y from the test data
test_x = test_data_f.unbatch().map(lambda x, y: x)
test_y = test_data_f.unbatch().map(lambda x, y: y)
test_predicted = model.predict(test_x)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# converting test_y to a numpy array (from a tensorflow Dataset)
test_y = np.array([x for x in test_y])
print(classification_report(test_y.argmax(1), test_predicted.argmax(1)))
print(tf.math.confusion_matrix(test_y.argmax(1), test_predicted.argmax(1)))
As a TLDR: I am using universal-sentence-encoder-multilingual/3 from tensorflow hub, and the model looks like this: model build, fit history. Model summary:
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
fileSize (InputLayer) [(None, 1)] 0 []
title (InputLayer) [(None,)] 0 []
description (InputLayer) [(None,)] 0 []
files (InputLayer) [(None,)] 0 []
normalization (Normalization) (None, 1) 3 ['fileSize[0][0]']
keras_layer (KerasLayer) (None, 512) 68927232 ['title[0][0]',
'description[0][0]',
'files[0][0]']
concatenate (Concatenate) (None, 1537) 0 ['normalization[0][0]',
'keras_layer[0][0]',
'keras_layer[1][0]',
'keras_layer[2][0]']
dense (Dense) (None, 16) 24608 ['concatenate[0][0]']
dense_1 (Dense) (None, 16) 272 ['dense[0][0]']
dense_2 (Dense) (None, 16) 272 ['dense_1[0][0]']
dense_3 (Dense) (None, 4) 68 ['dense_2[0][0]']
==================================================================================================
Total params: 68,952,455
Trainable params: 68,952,452
Non-trainable params: 3
__________________________________________________________________________________________________
I have built a multi-input (100 features) multi-ouput (100 predictions) ANN model using keras and tensorflow. I have been able to train my model and reach a quite satisfying accuracy on the test set using the following code :
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
def my_loss_fn(y_true, y_pred) :
d = K.sum(K.abs(y_true), axis = -1)
n = K.sum((K.tanh(100000*y_true*y_pred)/2 + 0.5)*K.abs(y_true), axis = -1)
return 1 - n/d
def my_metric_fn(y_true, y_pred) :
d = K.sum(K.abs(y_true))
n = K.sum((K.tanh(100000*y_true*y_pred)/2 + 0.5)*K.abs(y_true))
return n/d
def accuracy(y_true, y_pred) :
#print(y_true.shape, y_true)
#print(y_pred.shape, y_true)
acc = np.zeros([1, len(y_true)])
for day in range(len(y_pred)) :
d = 0
n = 0
for i in range(len(y_pred[0])) :
d = d + abs(y_true[day, i])
if np.sign(y_pred[day, i])*np.sign(y_true[day, i]) > 0 :
n = n + abs(y_true[day, i])
else :
n = n + 0
acc[0, day] = n/d
return np.mean(acc, axis = -1)[0]
#Model
classifier = Sequential()
classifier.add(Dense(units = 50, input_shape = (100, ), activation = "tanh"))
classifier.add(Dropout(0.2))
classifier.add(Dense(units=100, activation = 'tanh'))
classifier.compile(optimizer = 'rmsprop', loss = my_loss_fn, metrics = ['accuracy', my_metric_fn])
#Training
callback = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 0.0001, patience = 20, verbose = 0, mode = 'min')
nb_epochs = 250
history = classifier.fit(X_train, y_train, epochs = nb_epochs, batch_size = 31, callbacks = [callback], verbose = True, validation_split = 0., validation_data = (X_test, y_test), use_multiprocessing = True)
#Prediction
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_test)
acc_test = accuracy(y_test, y_pred_test)
acc_train = accuracy(y_train, y_pred_train)
I am trying to improve the performance of my model by tuning the hyperparameters so I used KerasClassifier() and GridSearchCV(). The following code illustrates my approach for the gridsearch.
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from tensorflow import autograph
#Building a function to create the classifier
def build_classifier(nb_layers, nb_nodes, optimizer, dropout, activation_fn):
classifier=Sequential()
classifier.add(Dense(units = nb_nodes, input_shape = (100, ), activation = activation_fn))
for i in range(nb_layers-1) :
classifier.add(Dense(units = nb_nodes, activation = activation_fn, kernel_initializer = "uniform"))
classifier.add(Dropout(dropout))
classifier.add(Dense(units = 100, activation = 'tanh'))
classifier.compile(optimizer=optimizer, loss = tf.autograph.experimental.do_not_convert(my_loss_fn), metrics= ['accuracy', tf.autograph.experimental.do_not_convert(my_metric_fn)])
return classifier
#Creating a scorer to feed to the GridSearchCV()
my_scorer = make_scorer(accuracy, greater_is_better = True)
classifier=KerasClassifier(build_fn=build_classifier)
parameters={'batch_size':[13, 31],'epochs':[100, 150], 'optimizer':['adam', 'rmsprop'], 'dropout' : [0.2, 0.1], 'nb_layers' : [2, 3], 'nb_nodes' : [45, 50, 110, 115], 'activation_fn' : ['relu', 'tanh']}
grid_search=GridSearchCV(estimator=classifier, scoring = my_scorer, param_grid=parameters, cv=5, verbose = 1)
grid_search=grid_search.fit(X_train_, y_train_raw)
When I fit my GridSearchCV() object I get the following error at the end of the first combination of hyperparameters (when the scoring is computed) :
TypeError: object of type 'numpy.int32' has no len()
I investigated by adding print commandes inside my accuracy() function
#print(y_true.shape, y_true)
#print(y_pred.shape, y_pred)
to print both the shape and the array y_true and y_pred given as inputs for my accuracy() function used as the scoring in the GridSearchCV() object.
I found out that y_true.shape == (555, 100) but y_pred.shape == (555,). The value 555 corresponds to the number of lines of the fifth validation set because cv = 5.
However, I do not understand why the prediction of the gridsearch is not a multi-output prediction even though the number of nodes of the last layer of the classifier is (100,).
This was a regression problem so I used KerasRegressor() instead and it solved the issue. I guess that for a multi-output classification problem, KerasClassifier() expect the output to be a 2D hot encoded array.
I am having a hard time translating a quite simple LSTM model from Keras to Pytorch. X (get it here) corresponds to 1152 samples of 90 timesteps, each timestep has only 1 dimension. y (here) is a single prediction at t = 91 for all 1152 samples.
In Keras:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
import numpy as np
import pandas as pd
X = pd.read_csv('X.csv', header = None).values
X.shape
y = pd.read_csv('y.csv', header = None).values
y.shape
# From Keras documentation [https://keras.io/layers/recurrent/]:
# Input shape 3D tensor with shape (batch_size, timesteps, input_dim).
X = np.reshape(X, (1152, 90, 1))
regressor = Sequential()
regressor.add(LSTM(units = 100, return_sequences = True, input_shape = (90, 1)))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.3))
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.3))
regressor.add(Dense(units = 1, activation = 'linear'))
regressor.compile(optimizer = 'rmsprop', loss = 'mean_squared_error', metrics = ['mean_absolute_error'])
regressor.fit(X, y, epochs = 10, batch_size = 32)
... leads me to:
# Epoch 10/10
# 1152/1152 [==============================] - 33s 29ms/sample - loss: 0.0068 - mean_absolute_error: 0.0628
Then in Pytorch:
import torch
from torch import nn, optim
from sklearn.metrics import mean_absolute_error
X = pd.read_csv('X.csv', header = None).values
y = pd.read_csv('y.csv', header = None).values
X = torch.tensor(X, dtype = torch.float32)
y = torch.tensor(y, dtype = torch.float32)
dataset = torch.utils.data.TensorDataset(X, y)
loader = torch.utils.data.DataLoader(dataset, batch_size = 32, shuffle = True)
class regressor_LSTM(nn.Module):
def __init__(self):
super().__init__()
self.lstm1 = nn.LSTM(input_size = 1, hidden_size = 100)
self.lstm2 = nn.LSTM(100, 50)
self.lstm3 = nn.LSTM(50, 50, dropout = 0.3, num_layers = 2)
self.dropout = nn.Dropout(p = 0.3)
self.linear = nn.Linear(in_features = 50, out_features = 1)
def forward(self, X):
# From the Pytorch documentation [https://pytorch.org/docs/stable/_modules/torch/nn/modules/rnn.html]:
# **input** of shape `(seq_len, batch, input_size)`
X = X.view(90, 32, 1)
# I am discarding hidden/cell states since in Keras I am using a stateless approach
# [https://keras.io/examples/lstm_stateful/]
X, _ = self.lstm1(X)
X = self.dropout(X)
X, _ = self.lstm2(X)
X = self.dropout(X)
X, _ = self.lstm3(X)
X = self.dropout(X)
X = self.linear(X)
return X
regressor = regressor_LSTM()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(regressor.parameters())
for epoch in range(10):
running_loss = 0.
running_mae = 0.
for i, data in enumerate(loader):
inputs, labels = data
optimizer.zero_grad()
outputs = regressor(inputs)
outputs = outputs[-1].view(*labels.shape)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
mae = mean_absolute_error(labels.detach().cpu().numpy().flatten(), outputs.detach().cpu().numpy().flatten())
running_mae += mae
print('EPOCH %3d: loss %.5f - MAE %.5f' % (epoch+1, running_loss/len(loader), running_mae/len(loader)))
... leads me to:
# EPOCH 10: loss 0.04220 - MAE 0.16762
You can notice that both loss and MAE are quite different (Pytorch's are much higher). If I use Pytorch's model to predict the values, they all return as a constant.
What am I doing wrong?
Oh I believe I made considerable progress. It seems that the way to represent y is different between Keras and Pytorch. In Keras, we should pass it as a single value representing one timestep in the future (or, at least, for the problem I am trying to solve). But in Pytorch, y must be X shifted one timestep to the future. It is like this:
time_series = [0, 1, 2, 3, 4, 5]
X = [0, 1, 2, 3, 4]
# Keras:
y = [5]
# Pytorch:
y = [1, 2, 3, 4, 5]
This way, Pytorch compares all values in the time slice when calculating loss. I believe Keras rearranges the data under the hood to conform to this approach, as the code works when fed the variables just like that. But in Pytorch, I was estimating loss based only on one value (the one I was trying to predict), not the whole series, therefore I believe it could not correctly capture the time dependency.
When taking this in consideration, I got to:
EPOCH 100: loss 0.00551 - MAE 0.058435
And, most importantly, comparing true and predicted values in a separate dataset got me to
The patterns were clearly captured by the model.
Hooray!
I tried to make a class using batchnormalization layer from tf 2.0, however it gave me an error that Gradients does not exist for variables. I tried to use batchnormalization directly but it gave me the same error as well. it seems like it is not traing the variable related to the batchnormalization step.
I tried to use model.trainable_variables instead of model.variables but it didn't work either.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import ndimage
learning_rate = 0.001
training_epochs = 15
batch_size = 100
tf.random.set_seed(777)
cur_dir = os.getcwd()
ckpt_dir_name = 'checkpoints'
model_dir_name = 'minst_cnn_best'
checkpoint_dir = os.path.join(cur_dir, ckpt_dir_name, model_dir_name)
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_prefix = os.path.join(checkpoint_dir, model_dir_name)
mnist = tf.keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images = train_images.astype(np.float32) /255.
test_images = test_images.astype(np.float32) /255.
print(train_images.shape, test_images.shape)
train_images = np.expand_dims(train_images, axis = -1)
test_images = np.expand_dims(test_images, axis = -1)
print(train_images.shape, test_images.shape)
train_labels = to_categorical(train_labels, 10)
test_labels = to_categorical(test_labels, 10)
train_dataset = tf.data.Dataset.from_tensor_slices((train_images,
train_labels)).shuffle(buffer_size = 100000).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((test_images,
test_labels)).batch(batch_size)
class ConvBNRelu(tf.keras.Model):
def __init__(self, filters, kernel_size=3, strides=1, padding='SAME'):
super(ConvBNRelu, self).__init__()
self.conv = keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides,
padding=padding, kernel_initializer='glorot_normal')
self.batchnorm = tf.keras.layers.BatchNormalization()
def call(self, inputs, training=False):
layer = self.conv(inputs)
layer = self.batchnorm(layer)
layer = tf.nn.relu(layer)
return layer
class DenseBNRelu(tf.keras.Model):
def __init__(self, units):
super(DenseBNRelu, self).__init__()
self.dense = keras.layers.Dense(units=units, kernel_initializer='glorot_normal')
self.batchnorm = tf.keras.layers.BatchNormalization()
def call(self, inputs, training=False):
layer = self.dense(inputs)
layer = self.batchnorm(layer)
layer = tf.nn.relu(layer)
return layer
class MNISTModel(tf.keras.Model):
def __init__(self):
super(MNISTModel, self).__init__()
self.conv1 = ConvBNRelu(filters=32, kernel_size=[3, 3], padding='SAME')
self.pool1 = keras.layers.MaxPool2D(padding='SAME')
self.conv2 = ConvBNRelu(filters=64, kernel_size=[3, 3], padding='SAME')
self.pool2 = keras.layers.MaxPool2D(padding='SAME')
self.conv3 = ConvBNRelu(filters=128, kernel_size=[3, 3], padding='SAME')
self.pool3 = keras.layers.MaxPool2D(padding='SAME')
self.pool3_flat = keras.layers.Flatten()
self.dense4 = DenseBNRelu(units=256)
self.drop4 = keras.layers.Dropout(rate=0.4)
self.dense5 = keras.layers.Dense(units=10, kernel_initializer='glorot_normal')
def call(self, inputs, training=False):
net = self.conv1(inputs)
net = self.pool1(net)
net = self.conv2(net)
net = self.pool2(net)
net = self.conv3(net)
net = self.pool3(net)
net = self.pool3_flat(net)
net = self.dense4(net)
net = self.drop4(net)
net = self.dense5(net)
return net
models = []
num_models = 5
for m in range(num_models):
models.append(MNISTModel())
def loss_fn(model, images, labels):
logits = model(images, training=True)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,
labels=labels))
return loss
def grad(model, images, labels):
with tf.GradientTape() as tape:
loss = loss_fn(model, images, labels)
return tape.gradient(loss, model.variables)
def evaluate(models, images, labels):
predictions = np.zeros_like(labels)
for model in models:
logits = model(images, training=False)
predictions += logits
correct_prediction = tf.equal(tf.argmax(predictions, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
return accuracy
optimizer = keras.optimizers.Adam(learning_rate = learning_rate)
checkpoints = []
for m in range(num_models):
checkpoints.append(tf.train.Checkpoint(cnn=models[m]))
for epoch in range(training_epochs):
avg_loss = 0.
avg_train_acc = 0.
avg_test_acc = 0.
train_step = 0
test_step = 0
for images, labels in train_dataset:
for model in models:
grads = grad(model, images, labels)
optimizer.apply_gradients(zip(grads, model.variables))
loss = loss_fn(model, images, labels)
avg_loss += loss / num_models
acc = evaluate(models, images, labels)
avg_train_acc += acc
train_step += 1
avg_loss = avg_loss / train_step
avg_train_acc = avg_train_acc / train_step
for images, labels in test_dataset:
acc = evaluate(models, images, labels)
avg_test_acc += acc
test_step += 1
avg_test_acc = avg_test_acc / test_step
print('Epoch:', '{}'.format(epoch + 1), 'loss =', '{:.8f}'.format(avg_loss),
'train accuracy = ', '{:.4f}'.format(avg_train_acc),
'test accuracy = ', '{:.4f}'.format(avg_test_acc))
for idx, checkpoint in enumerate(checkpoints):
checkpoint.save(file_prefix=checkpoint_prefix+'-{}'.format(idx))
print('Learning Finished!')
W0727 20:27:05.344142 140332288718656 optimizer_v2.py:982] Gradients does not exist for variables ['mnist_model/conv_bn_relu/batch_normalization/moving_mean:0', 'mnist_model/conv_bn_relu/batch_normalization/moving_variance:0', 'mnist_model/conv_bn_relu_1/batch_normalization_1/moving_mean:0', 'mnist_model/conv_bn_relu_1/batch_normalization_1/moving_variance:0', 'mnist_model/conv_bn_relu_2/batch_normalization_2/moving_mean:0', 'mnist_model/conv_bn_relu_2/batch_normalization_2/moving_variance:0', 'mnist_model/dense_bn_relu/batch_normalization_3/moving_mean:0', 'mnist_model/dense_bn_relu/batch_normalization_3/moving_variance:0'] when minimizing the loss.
W0727 20:27:05.407717 140332288718656 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py:460: BaseResourceVariable.constraint (from tensorflow.python.ops.resource_variable_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Apply a constraint manually following the optimizer update step.
W0727 20:27:05.499249 140332288718656 optimizer_v2.py:982] Gradients does not exist for variables ['mnist_model_1/conv_bn_relu_3/batch_normalization_4/moving_mean:0', 'mnist_model_1/conv_bn_relu_3/batch_normalization_4/moving_variance:0', 'mnist_model_1/conv_bn_relu_4/batch_normalization_5/moving_mean:0', 'mnist_model_1/conv_bn_relu_4/batch_normalization_5/moving_variance:0', 'mnist_model_1/conv_bn_relu_5/batch_normalization_6/moving_mean:0', 'mnist_model_1/conv_bn_relu_5/batch_normalization_6/moving_variance:0', 'mnist_model_1/dense_bn_relu_1/batch_normalization_7/moving_mean:0', 'mnist_model_1/dense_bn_relu_1/batch_normalization_7/moving_variance:0'] when minimizing the loss.
...
You're computing the gradient of the loss with respect to the model.variables: this collection contains not only the trainable variables (the model weights) but also the non-trainable variables like the moving mean and variance computed by the batch normalization layer.
You have to compute the gradient with respect to the trainable_variables. In short change the lines
return tape.gradient(loss, model.variables)
and
optimizer.apply_gradients(zip(grads, model.variables))
to
return tape.gradient(loss, model.trainable_variables)
and
optimizer.apply_gradients(zip(grads, model.trainable_variables))
Here is small snippet of my code describing my custom regularizer that I want to implement.
# Code adapted from https://github.com/keras-team/keras/issues/5563
class CustomRegularization(Layer):
def __init__(self, **kwargs):
super(CustomRegularization, self).__init__(**kwargs)
def call(self ,x ,mask=None):
ld=x[0]
rd=x[1]
reg = K.dot(K.transpose(ld), rd)
reg_norm = K.sqrt(K.sum(K.square(reg)))
self.add_loss(reg_norm, x)
return ld
def compute_output_shape(self, input_shape):
return (input_shape[0][0],input_shape[0][1])
def model():
input1 = Input(shape=(224, 224, 3))
input2 = Input(shape=(224, 224, 3))
inp1 = Flatten()(input1)
inp2 = Flatten()(input2)
layer1 = Dense(1024, activation="sigmoid")
x1_1 = layer1(inp1)
x2_1 = layer1(inp2)
layer2 = Dense(1024, activation="sigmoid")
x1_2 = layer2(inp1)
x2_2 = layer2(inp2)
# get weights of layer1 and layer2
layer1_wt = layer1.trainable_weights[0]
layer2_wt = layer2.trainable_weights[0]
# This is a regularization term on the weights of layer1 and layer2.
regularization = CustomRegularization()([layer1_wt, layer2_wt])
model = Model([input1, input2], [x1_2, x2_2, regularization])
if __name__ == "__main__":
m = model()
This returns the error AttributeError: 'Variable' object has no attribute '_keras_history' and is not able to create the model.
I know that this error would be because of incompatible outputs (since inputs are keras Input layer). [For more details refer to #fchollet's comment on issue #7362 ].
The main problem here are the layer1.trainable_weights[0] and layer2.trainable_weights[0]. These are tf.Variable (tensorflow variables) and not Keras Tensors. I would require them to convert to keras tensors. How do I do that?