I am trying to save and restore the weights for a given model in Keras.
I am successfully saving the weights with model.save_weights(filepath, ...), and the weights are actually loaded afterwards. I can confirm this by dumping model.get_weights() to a file after saving and again after restoring, and diffing the two files.
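In code, the check looks roughly like this (a sketch, with model and filepath standing in for my actual objects):

import numpy as np

before = model.get_weights()
model.save_weights(filepath)
model.load_weights(filepath)
after = model.get_weights()

# Every weight array survives the round trip unchanged.
assert all(np.array_equal(a, b) for a, b in zip(before, after))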
However, my model performs just as badly as it did at the start. Is there anything I am missing?
def __init__(self, **args):
    # Next, we build our model. We use the same model that was described by Mnih et al. (2015).
    self.model.add(Convolution2D(32, (3, 3), strides=(1, 1)))
    self.model.add(Activation('relu'))
    self.model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    self.model.add(Activation('relu'))
    self.model.add(Convolution2D(64, (3, 3), strides=(1, 1)))
    self.model.add(Activation('relu'))
    self.model.add(Flatten())
    self.model.add(Dense(512))
    self.model.add(Activation('relu'))
    self.model.add(Dense(self.nb_actions))
    self.model.add(Activation('linear'))
    print(self.model.summary())

    if os.path.isfile("/home/abcd/model.weights"):
        self.model.load_weights("/home/abcd/model.weights")

    self.compile(Adam(lr=.00025), metrics=['mae'])
...
def compile(self, optimizer, metrics=[]):
    metrics += [mean_q]  # register default metrics

    # We never train the target model, hence we can set the optimizer and loss arbitrarily.
    self.target_model = clone_model(self.model, self.custom_model_objects)

    if os.path.isfile("/home/abcd/target_model.weights"):
        self.target_model.load_weights("/home/abcd/target_model.weights")

    self.target_model.compile(optimizer='sgd', loss='mse')
    self.model.compile(optimizer='sgd', loss='mse')

    # Compile model.
    if self.target_model_update < 1.:
        # We use the `AdditionalUpdatesOptimizer` to efficiently soft-update the target model.
        updates = get_soft_target_model_updates(self.target_model, self.model, self.target_model_update)
        optimizer = AdditionalUpdatesOptimizer(optimizer, updates)

    def clipped_masked_error(args):
        y_true, y_pred, mask = args
        loss = huber_loss(y_true, y_pred, self.delta_clip)
        loss *= mask  # apply element-wise mask
        return K.sum(loss, axis=-1)

    # Create trainable model. The problem is that we need to mask the output since we only
    # ever want to update the Q values for a certain action. The way we achieve this is by
    # using a custom Lambda layer that computes the loss. This gives us the necessary flexibility
    # to mask out certain parameters by passing in multiple inputs to the Lambda layer.
    y_pred = self.model.output
    y_true = Input(name='y_true', shape=(self.nb_actions,))
    mask = Input(name='mask', shape=(self.nb_actions,))
    loss_out = Lambda(clipped_masked_error, output_shape=(1,), name='loss')([y_true, y_pred, mask])
    ins = [self.model.input] if type(self.model.input) is not list else self.model.input
    trainable_model = Model(inputs=ins + [y_true, mask], outputs=[loss_out, y_pred])
    assert len(trainable_model.output_names) == 2
    combined_metrics = {trainable_model.output_names[1]: metrics}
    losses = [
        lambda y_true, y_pred: y_pred,  # loss is computed in Lambda layer
        lambda y_true, y_pred: K.zeros_like(y_pred),  # we only include this for the metrics
    ]

    if os.path.isfile("/home/abcd/trainable_model.weights"):
        trainable_model.load_weights("/home/abcd/trainable_model.weights")

    trainable_model.compile(optimizer=optimizer, loss=losses, metrics=combined_metrics)
    self.trainable_model = trainable_model

    self.compiled = True
...
def final(self, state):
    """Called at the end of each game."""
    # call the super-class final method
    PacmanQAgent.final(self, state)

    # did we finish training?
    if self.episodesSoFar == self.numTraining:
        # you might want to print your weights here for debugging
        "*** YOUR CODE HERE ***"
        self.training = False

        # Save the model
        self.model.save_weights("/home/abcd/model.weights", True)
        self.trainable_model.save_weights("/home/abcd/trainable_model.weights", True)
        self.target_model.save_weights("/home/abcd/target_model.weights", True)
I found the problem. The saving and loading actually worked fine. I was using an annealed epsilon-greedy policy, so every time I started training it would begin with essentially random steps anyway.
In addition, my testing code was wrong, so testing did not do what it was supposed to do. These two issues combined made it look as though the model learned something (training went well) but the weights were not being saved (testing went wrong, and the next training run started out 'random' again).
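For anyone hitting the same symptom: one option is to start the annealing schedule at a lower epsilon when resuming from saved weights, so a restored model is not forced back into near-random exploration. A sketch using keras-rl's policy classes (my agent is custom, so the names and values here are illustrative only):

from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

resuming = os.path.isfile("/home/abcd/model.weights")

# Start mostly greedy when resuming; explore fully on a fresh run.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps',
                              value_max=0.1 if resuming else 1.0,
                              value_min=0.05, value_test=0.0,
                              nb_steps=100000)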
I want to develop a lifelong learning system, so I need to prevent important parameters from changing. I read the related paper 'Memory Aware Synapses: Learning what (not) to forget', which mentions a method that requires calculating the gradient of each parameter corresponding to each input image. How should I write my code in PyTorch?
You can do this using the standard optimization procedure and the .backward() method on your loss function.
First, scaling as defined in your link:
class Scaler:
    def __init__(self, parameters, delta):
        # Materialize the generator so step() can be called more than once.
        self.parameters = list(parameters)
        self.delta = delta

    def step(self):
        """Multiplies gradients in place."""
        for param in self.parameters:
            if param.grad is None:
                raise ValueError("backward() has to be called before running scaler")
            param.grad *= self.delta
One can use it just like optimizer.step(); see the comments in the code below:
model = torch.nn.Sequential(
    torch.nn.Linear(10, 100), torch.nn.ReLU(), torch.nn.Linear(100, 1)
)

scaler = Scaler(model.parameters(), delta=0.001)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.MSELoss()

X, y = torch.randn(64, 10), torch.randn(64, 1)  # targets shaped (64, 1) to match the output

# Optimization loop
EPOCHS = 10
for _ in range(EPOCHS):
    output = model(X)
    loss = criterion(output, y)
    loss.backward()  # Now model has the gradients
    optimizer.step()  # Optimize model's parameters
    print(next(model.parameters()).grad)
    scaler.step()  # Scale gradients
    optimizer.zero_grad()  # Zero gradients before next step
After scaler.step() the scaled gradients are available in param.grad for each parameter (just as they are accessed within Scaler's step method), so you can do whatever you want with them.
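If you then want the MAS importance weights themselves, the paper accumulates, for every parameter, the absolute gradient of the squared L2 norm of the model output, one input at a time. A minimal sketch of that accumulation (variable names are illustrative, reusing model and X from above):

importance = {name: torch.zeros_like(p) for name, p in model.named_parameters()}

for x in X:  # one input image at a time, as the question asks
    model.zero_grad()
    output = model(x.unsqueeze(0))
    output.pow(2).sum().backward()  # gradient of the squared L2 norm of the output
    for name, p in model.named_parameters():
        importance[name] += p.grad.abs() / len(X)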
I am a beginner in deep learning and am trying to make a discriminator that judges cats/non-cats.
But when I run the following code, a runtime error occurs.
I know that requires_grad must be set to True in order to calculate the gradients automatically, but since X_train and Y_train are variables that are only read from, they are set to False.
I would be grateful if you could modify this code.
X_train = torch.tensor(train_set_x, dtype=dtype, requires_grad=False)
Y_train = torch.tensor(train_set_y, dtype=dtype, requires_grad=False)

def train_model(X_train, Y_train, X_test, Y_test, n_h, num_iterations=10000,
                learning_rate=0.5, print_cost=False):
    """
    Arguments:
    X_train -- training set represented by a numpy array of shape (num_px * num_px * 3, m_train)
    Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train)
    X_test -- test set represented by a numpy array of shape (num_px * num_px * 3, m_test)
    Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test)
    n_h -- size of the hidden layer
    num_iterations -- number of iterations in gradient descent loop
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- if True, print the cost every 200 iterations

    Returns:
    d -- dictionary containing information about the model.
    """
    n_x = X.size(1)
    n_y = Y.size(1)

    # Create model
    model = nn.Sequential(
        nn.Linear(n_x, n_h),
        nn.ReLU(),
        nn.Linear(n_h, n_y),
        nn.ReLU()
    )

    # Initialize parameters
    for name, param in model.named_parameters():
        if name.find('weight') != -1:
            torch.nn.init.orthogonal_(param)
        elif name.find('bias') != -1:
            torch.nn.init.constant_(param, 0)

    # Cost function
    cost_fn = nn.BCELoss()

    # Loop (gradient descent)
    for i in range(0, num_iterations):
        # Forward propagation: compute predicted labels by passing input data to the model.
        Y_predicted = model(X_train)
        A2 = (Y_predicted > 0.5).float()

        # Cost function. Inputs: predicted and true values. Outputs: "cost".
        cost = cost_fn(A2, Y_train)

        # Print the cost every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost.item()))

        # Zero the gradients before running the backward pass. See hint in problem description
        model.zero_grad()

        # Backpropagation. Compute gradient of the cost function with respect to all the
        # learnable parameters of the model. Use autograd to compute the backward pass.
        cost.backward()

        # Gradient descent parameter update.
        with torch.no_grad():
            for param in model.parameters():
                # Your code here !!
                param -= learning_rate * param.grad

    d = {"model": model,
         "learning_rate": learning_rate,
         "num_iterations": num_iterations}

    return d
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I believe your problem is that you are mixing numpy arrays and torch tensors. PyTorch tensors are a bit like numpy arrays, but they are also kept in a computational graph that is responsible for the backward pass.
The description of your received variables X_train, Y_train, X_test, Y_test says they are numpy arrays. You should convert them all to torch tensors:
x = torch.tensor(x)
I also noticed that you are manually performing gradient updates. Unless that was your intention, I would recommend using one of PyTorch's optimizers:
from torch.optim import SGD

model = nn.Sequential(
    nn.Linear(n_x, n_h),
    nn.ReLU(),
    nn.Linear(n_h, n_y),
    nn.Sigmoid()  # You are using BCELoss, so you should give it an input from 0 to 1
)
optimizer = SGD(model.parameters(), lr=learning_rate)
cost_fn = nn.BCELoss()

optimizer.zero_grad()
y = model(x)
cost = cost_fn(y, target)
cost.backward()
optimizer.step()  # << updates the model's parameters
Note that it is recommended to use torch.nn.BCEWithLogitsLoss instead of BCELoss. The former implements the sigmoid and the binary cross-entropy together, with some math tricks to make it more numerically stable. Your model should then look something like:
model = nn.Sequential(
    nn.Linear(n_x, n_h),
    nn.ReLU(),
    nn.Linear(n_h, n_y)
)
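With that model, the loss and the inference-time probabilities would then be (standard PyTorch, shown for completeness):

cost_fn = nn.BCEWithLogitsLoss()          # sigmoid + BCE in one, numerically stable
probabilities = torch.sigmoid(model(x))   # apply sigmoid only when you need probabilities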
I tried to run a deep learning application and got a module 'tensorflow' has no attribute 'random_uniform' error. On CPU the code works fine, but it is really slow. In order to run the code on GPU I needed to change some definitions. Here is my code below. Any ideas?
def CapsNet(input_shape, n_class, routings):
    x = tf.keras.layers.Input(shape=input_shape)

    # Layer 1: Just a conventional Conv2D layer
    conv1 = tf.keras.layers.Convolution2D(filters=256, kernel_size=9, strides=1, padding='valid',
                                          activation='relu', name='conv1')(x)

    # Layer 2: Conv2D layer with `squash` activation, then reshape to [None, num_capsule, dim_capsule]
    primarycaps = PrimaryCap(conv1, dim_capsule=8, n_channels=32, kernel_size=9, strides=2, padding='valid')

    # Layer 3: Capsule layer. Routing algorithm works here.
    digitcaps = CapsuleLayer(num_capsule=n_class, dim_capsule=16, routings=routings,
                             name='digitcaps')(primarycaps)

    # Layer 4: This is an auxiliary layer to replace each capsule with its length. Just to match the true label's shape.
    # If using tensorflow, this will not be necessary. :)
    out_caps = Length(name='capsnet')(digitcaps)

    # Decoder network.
    y = tf.keras.layers.Input(shape=(n_class,))
    masked_by_y = Mask()([digitcaps, y])  # The true label is used to mask the output of capsule layer. For training
    masked = Mask()(digitcaps)  # Mask using the capsule with maximal length. For prediction

    # Shared Decoder model in training and prediction
    decoder = tf.keras.models.Sequential(name='decoder')
    decoder.add(tf.keras.layers.Dense(512, activation='relu', input_dim=16*n_class))
    decoder.add(tf.keras.layers.Dense(1024, activation='relu'))
    decoder.add(tf.keras.layers.Dense(np.prod(input_shape), activation='sigmoid'))
    decoder.add(tf.keras.layers.Reshape(target_shape=input_shape, name='out_recon'))

    # Models for training and evaluation (prediction)
    train_model = tf.keras.models.Model([x, y], [out_caps, decoder(masked_by_y)])
    eval_model = tf.keras.models.Model(x, [out_caps, decoder(masked)])

    # manipulate model
    noise = tf.keras.layers.Input(shape=(n_class, 16))
    noised_digitcaps = tf.keras.layers.Add()([digitcaps, noise])
    masked_noised_y = Mask()([noised_digitcaps, y])
    manipulate_model = tf.keras.models.Model([x, y, noise], decoder(masked_noised_y))

    return train_model, eval_model, manipulate_model


def margin_loss(y_true, y_pred):
    L = y_true * K.square(K.maximum(0., 0.9 - y_pred)) + \
        0.5 * (1 - y_true) * K.square(K.maximum(0., y_pred - 0.1))
    return K.mean(K.sum(L, 1))


model, eval_model, manipulate_model = CapsNet(input_shape=train_x_temp.shape[1:],
                                              n_class=len(np.unique(np.argmax(train_y, 1))),
                                              routings=3)
The problem lies with your TensorFlow installation, to be exact your Python tensorflow library. Make sure you reinstall the package correctly; with Anaconda you need to install it with administrator rights.
Or, if you have the newest version, the function has been renamed and you need to call
tf.random.uniform(...)
For more information, see the documentation: https://www.tensorflow.org/api_docs/python/tf/random/uniform
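For illustration, the same call under both APIs (the shape and bounds here are arbitrary):

import tensorflow as tf

# TF 1.x spelling (raises AttributeError in TF 2.x):
# w = tf.random_uniform([2, 3], minval=0, maxval=1)

# TF 2.x spelling:
w = tf.random.uniform([2, 3], minval=0, maxval=1)

# Or keep old code running via the compatibility module:
w_compat = tf.compat.v1.random_uniform([2, 3], minval=0, maxval=1)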
I want to build an autoencoder where each layer in the encoder has the same meaning as a corresponding layer in the decoder. So if the autoencoder is perfectly trained, the values of those layers should be roughly the same.
So let's say the autoencoder consists of e1 -> e2 -> e3 -> d2 -> d1, where e1 is the input and d1 the output. A normal autoencoder trains to have the same result in d1 as e1, but I want the additional constraint that e2 and d2 are the same. Therefore I want an additional backpropagation path which leads from d2 to e2 and trains at the same time as the normal path from d1 to e1 (d stands for decoder, e for encoder).
I tried to use the error between e2 and d2 as a regularization term with the CustomRegularization layer from the first answer of this link: https://github.com/keras-team/keras/issues/5563. I also use this for the error between e1 and d1 instead of the normal path.
The following code is written so that more than one intermediate layer can be handled, and it uses 4 layers.
The commented-out code is a normal autoencoder which only propagates from start to end.
from keras.layers import Dense
import numpy as np
from keras.datasets import mnist
from keras.models import Model
from keras.engine.topology import Layer
from keras import objectives
from keras.layers import Input
import keras
import matplotlib.pyplot as plt
# A layer which can be given as an output to force a regularization term between two layers
class CustomRegularization(Layer):
    def __init__(self, **kwargs):
        super(CustomRegularization, self).__init__(**kwargs)

    def call(self, x, mask=None):
        ld = x[0]
        rd = x[1]
        bce = objectives.binary_crossentropy(ld, rd)
        loss2 = keras.backend.sum(bce)
        self.add_loss(loss2, x)
        return bce

    def get_output_shape_for(self, input_shape):
        return (input_shape[0][0], 1)


def zero_loss(y_true, y_pred):
    return keras.backend.zeros_like(y_pred)


# Create regularization layer between two corresponding layers of encoder and decoder
def buildUpDownRegularization(layerNo, input, up_layers, down_layers):
    for i in range(0, layerNo):
        input = up_layers[i](input)
    start = input
    for i in range(layerNo, len(up_layers)):
        input = up_layers[i](input)
    for j in range(0, len(down_layers) - layerNo):
        input = down_layers[j](input)
    end = input
    cr = CustomRegularization()([start, end])
    return cr
# Define shape of the network, layers, some hyperparameters and training data
sizes = [784, 400, 200, 100, 50]

up_layers = []
down_layers = []
for i in range(1, len(sizes)):
    layer = Dense(units=sizes[i], activation='sigmoid', input_dim=sizes[i-1])
    up_layers.append(layer)
for i in range(len(sizes)-2, -1, -1):
    layer = Dense(units=sizes[i], activation='sigmoid', input_dim=sizes[i+1])
    down_layers.append(layer)

batch_size = 128
num_classes = 10
epochs = 100

(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
x_train = x_train.reshape([x_train.shape[0], 28*28])
x_test = x_test.reshape([x_test.shape[0], 28*28])
y_train = x_train
y_test = x_test

optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
"""
### Normal autoencoder like in base mnist example
model = keras.models.Sequential()
for layer in up_layers:
model.add(layer)
for layer in down_layers:
model.add(layer)
model.compile(optimizer=optimizer, loss=keras.backend.binary_crossentropy)
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
score = model.evaluate(x_test, y_test, verbose=0)
#print('Test loss:', score[0])
#print('Test accuracy:', score[1])
decoded_imgs = model.predict(x_test)
n = 10 # how many digits we will display
plt.figure(figsize=(20, 4))
for i in range(n):
# display original
ax = plt.subplot(2, n, i + 1)
plt.imshow(x_test[i].reshape(28, 28))
plt.gray()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
# display reconstruction
ax = plt.subplot(2, n, i + 1 + n)
plt.imshow(decoded_imgs[i].reshape(28, 28))
plt.gray()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()
"""
### My autoencoder where each subpart is also an autoencoder

# This part is only because the model needs a path from start to end; contentwise this should do nothing
output = input = Input(shape=(sizes[0],))
for i in range(0, len(up_layers)):
    output = up_layers[i](output)
for i in range(0, len(down_layers)):
    output = down_layers[i](output)

crs = [output]
losses = [zero_loss]

# Build the regularization layers
for i in range(len(up_layers)):
    crs.append(buildUpDownRegularization(i, input, up_layers, down_layers))
    losses.append(zero_loss)

# Create and train model with adapted training data
network = Model([input], crs)
optimizer = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
network.compile(loss=losses, optimizer=optimizer)

dummy_train = np.zeros([y_train.shape[0], 1])
dummy_test = np.zeros([y_test.shape[0], 1])

training_data = [y_train]
test_data = [y_test]
for i in range(len(network.outputs) - 1):
    training_data.append(dummy_train)
    test_data.append(dummy_test)

network.fit(x_train, training_data, batch_size=batch_size, epochs=epochs, verbose=1,
            validation_data=(x_test, test_data))
score = network.evaluate(x_test, test_data, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

decoded_imgs = network.predict(x_test)

n = 10  # how many digits we will display
plt.figure(figsize=(20, 4))
for i in range(n):
    # display original
    ax = plt.subplot(2, n, i + 1)
    plt.imshow(x_test[i].reshape(28, 28))
    plt.gray()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    # display reconstruction
    ax = plt.subplot(2, n, i + 1 + n)
    plt.imshow(decoded_imgs[0][i].reshape(28, 28))
    plt.gray()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
plt.show()
If you run the code as is, it shows that the reconstruction ability is lost in my version.
I expect behavior similar to the commented-out code, which shows a normal autoencoder.
Edit: As mentioned in the answers, this works well with MSE instead of crossentropy and a learning rate of .01. 100 epochs with that setting produce really good results.
Edit 2: I would like the backpropagation to work as in this image: https://imgur.com/OOo757x. That is, the backpropagation of the loss of a certain layer stops at the corresponding layer. I think I didn't make this clear before, and I don't know whether the code currently does that.
Edit 3: Although this code runs and returns a good-looking solution, the CustomRegularization layer is not doing what I thought it would do, and therefore does not do the same thing as in the description.
It seems like the main issue is the use of binary cross-entropy to minimize the difference between encoder and decoder. The internal representation in the network is not going to be a single class probability like the output might be if you were classifying MNIST digits. I was able to get your network to output some reasonable-looking reconstructions with these simple changes:
1. Using objectives.mean_squared_error instead of objectives.binary_crossentropy in the CustomRegularization class
2. Changing the number of epochs to 5
3. Changing the learning rate to .01
Changes 2 and 3 were simply made to speed up the testing. Change 1 is the key here. Cross-entropy is designed for problems where there is a binary "ground truth" variable and an estimate of that variable. However, you do not have a binary truth value in the middle of your network, only at the output layer. Thus a cross-entropy loss function in the middle of the network doesn't make much sense (at least to me) -- it would be trying to measure entropy for a variable that isn't binary. Mean squared error, on the other hand, is a bit more generic and should work for this case, since you are simply minimizing the difference between two real values. In essence, the middle of the network is performing regression (the difference between activations in two continuous values, i.e. layers), not classification, so it needs a loss function that is appropriate for regression.
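Concretely, change 1 is a small edit to the call method of your CustomRegularization class (a sketch of the modified method; everything else stays the same):

def call(self, x, mask=None):
    ld = x[0]
    rd = x[1]
    mse = objectives.mean_squared_error(ld, rd)  # was: objectives.binary_crossentropy
    loss2 = keras.backend.sum(mse)
    self.add_loss(loss2, x)
    return mse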
I also want to suggest that there may be a better approach to accomplish what you want. If you really want the encoder and decoder to be exactly the same, you can share weights between them. Then they will be identical, not just highly similar, and your model will have fewer parameters to train. There is a decent explanation of shared (tied) weights autoencoders with Keras here if you're curious; a rough sketch follows.
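A minimal sketch of a tied-weight layer in the old-style Keras Layer API your code already uses (DenseTied is my own name, not a built-in; the layer it is tied to must already be built so its kernel exists):

from keras import activations
from keras import backend as K

class DenseTied(Layer):
    """Dense layer whose kernel is the transpose of another Dense layer's kernel."""
    def __init__(self, tied_to, activation=None, **kwargs):
        super(DenseTied, self).__init__(**kwargs)
        self.tied_to = tied_to
        self.activation = activations.get(activation)

    def build(self, input_shape):
        # Only the bias is a new trainable weight; the kernel is shared.
        self.bias = self.add_weight(name='bias',
                                    shape=(K.int_shape(self.tied_to.kernel)[0],),
                                    initializer='zeros')
        super(DenseTied, self).build(input_shape)

    def call(self, inputs):
        output = K.dot(inputs, K.transpose(self.tied_to.kernel))
        return self.activation(K.bias_add(output, self.bias))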
Reading your code, it does seem like it is doing what you want in your illustration, but I am not really sure how to verify that.
I am trying to create a custom loss function using Keras. I want to compute the loss based on both the input and the predicted output of the neural network.
I tried using the customloss function in Keras. I think y_true is the output that we give for training and y_pred is the predicted output of the neural network. The loss function below is the same as the "mean_squared_error" loss in Keras.
def customloss(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)
I would like to also use the input to the neural network to compute the custom loss function, in addition to the mean_squared_error loss. Is there a way to pass the input of the neural network as an argument to the customloss function?
Thank you.
I have come across 2 solutions to the question you asked.
1. You can pass your input (scalar only) as an argument to the custom loss wrapper function.
def custom_loss(i):
    def loss(y_true, y_pred):
        return K.mean(K.square(y_pred - y_true), axis=-1) + something with i...
    return loss


def baseline_model():
    # create model
    i = Input(shape=(5,))
    x = Dense(5, kernel_initializer='glorot_uniform', activation='linear')(i)
    o = Dense(1, kernel_initializer='normal', activation='linear')(x)
    model = Model(i, o)
    model.compile(loss=custom_loss(i), optimizer=Adam(lr=0.0005))
    return model
This solution is also mentioned in the accepted answer here
2. You can pad your label with extra data columns from the input and write a custom loss. This is helpful if you just want one or a few feature columns from your input.
def custom_loss(data, y_pred):
    y_true = data[:, 0]
    i = data[:, 1]
    return K.mean(K.square(y_pred - y_true), axis=-1) + something with i...


def baseline_model():
    # create model
    i = Input(shape=(5,))
    x = Dense(5, kernel_initializer='glorot_uniform', activation='linear')(i)
    o = Dense(1, kernel_initializer='normal', activation='linear')(x)
    model = Model(i, o)
    model.compile(loss=custom_loss, optimizer=Adam(lr=0.0005))
    return model


model.fit(X, np.append(Y_true, X[:, 0], axis=1), batch_size=batch_size, epochs=90, shuffle=True, verbose=1)
This solution can also be found here in this thread.
I have only used the 2nd method, when I had to use input feature columns in the loss. The first method can only be used with scalar arguments, as mentioned in the comments.
You could wrap your custom loss with another function that takes the input tensor as an argument:
def customloss(x):
    def loss(y_true, y_pred):
        # Use x here as you wish
        err = K.mean(K.square(y_pred - y_true), axis=-1)
        return err
    return loss
And then compile your model as follows:
model.compile('sgd', customloss(x))
where x is your input tensor.
NOTE: Not tested.