I am getting the following error while trying to load a saved model.
KeyError: 'unexpected key "module.encoder.embedding.weight" in state_dict'
This is the function I am using to load a saved model.
def load_model_states(model, tag):
"""Load a previously saved model states."""
filename = os.path.join(args.save_path, tag)
with open(filename, 'rb') as f:
model.load_state_dict(torch.load(f))
The model is a sequence-to-sequence network whose init function (constructor) is given below.
def __init__(self, dictionary, embedding_index, max_sent_length, args):
""""Constructor of the class."""
super(Sequence2Sequence, self).__init__()
self.dictionary = dictionary
self.embedding_index = embedding_index
self.config = args
self.encoder = Encoder(len(self.dictionary), self.config)
self.decoder = AttentionDecoder(len(self.dictionary), max_sent_length, self.config)
self.criterion = nn.NLLLoss() # Negative log-likelihood loss
# Initializing the weight parameters for the embedding layer in the encoder.
self.encoder.init_embedding_weights(self.dictionary, self.embedding_index, self.config.emsize)
When I print the model (sequence-to-sequence network), I get the following.
Sequence2Sequence (
(encoder): Encoder (
(drop): Dropout (p = 0.25)
(embedding): Embedding(43723, 300)
(rnn): LSTM(300, 300, batch_first=True, dropout=0.25)
)
(decoder): AttentionDecoder (
(embedding): Embedding(43723, 300)
(attn): Linear (600 -> 12)
(attn_combine): Linear (600 -> 300)
(drop): Dropout (p = 0.25)
(out): Linear (300 -> 43723)
(rnn): LSTM(300, 300, batch_first=True, dropout=0.25)
)
(criterion): NLLLoss (
)
)
So, module.encoder.embedding is an embedding layer, and module.encoder.embedding.weight represents the associated weight matrix. So, why it says- unexpected key "module.encoder.embedding.weight" in state_dict?
I solved the problem. Actually I was saving the model using nn.DataParallel, which stores the model in module, and then I was trying to load it without DataParallel. So, either I need to add a nn.DataParallel temporarily in my network for loading purposes, or I can load the weights file, create a new ordered dict without the module prefix, and load it back.
The second workaround looks like the following.
# original saved file with DataParallel
state_dict = torch.load('myfile.pth.tar')
# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = k[7:] # remove `module.`
new_state_dict[name] = v
# load params
model.load_state_dict(new_state_dict)
Reference: https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686
Related
I am trying to create a data pipeline for U-net for Image Segmentation. I came across Keras.utils.Sequence class through which, I can create a data pipeline, But I am unable to understand how this is working.
link for the code Keras code , Source code
def __iter__(self):
"""Create a generator that iterate over the Sequence."""
for item in (self[i] for i in range(len(self))):
yield item
I will highly appreciate if anyone can tell me how this works ?
You don't need a generator. The sequence class is there to manage that. You need to define a class inherited from tensorflow.keras.utils.Sequence and define the methods:
__init__, __getitem__, __len__. In addition, you can define the method on_epoch_end, which is called at the end of each epoch and is usually used to shuffle the sample indexes.
There is an example in the link you gave Tensorflow Sequence.
Below is another example of Sequence.
Note that you can pass the data to the __init__ constructor, but you may as well read the data from files in the __getitem__ method, assuming you know where to read it, e.g. by passing the name of a directory or directories into the constructor. This is necessary if there is a lot of data.
from tensorflow import keras
import numpy as np
class SequenceExample(keras.utils.Sequence):
def __init__(self, x_in, y_in, batch_size, shuffle=True):
# Initialization
self.batch_size = batch_size
self.shuffle = shuffle
self.x = x_in
self.y = y_in
self.datalen = len(y_in)
self.indexes = np.arange(self.datalen)
if self.shuffle:
np.random.shuffle(self.indexes)
def __getitem__(self, index):
# get batch indexes from shuffled indexes
batch_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
x_batch = self.x[batch_indexes]
y_batch = self.y[batch_indexes]
return x_batch, y_batch
def __len__(self):
# Denotes the number of batches per epoch
return self.datalen // self.batch_size
def on_epoch_end(self):
# Updates indexes after each epoch
self.indexes = np.arange(self.datalen)
if self.shuffle:
np.random.shuffle(self.indexes)
I need to visualize the output of Vgg16 model which classify 14 different classes.
I load the trained model and I did replace the classifier layer with the identity() layer but it doesn't categorize the output.
Here is the snippet:
the number of samples here is 1000 images.
epoch = 800
PATH = 'vgg16_epoch{}.pth'.format(epoch)
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
model.classifier._modules['6'] = Identity()
model.eval()
logits_list = numpy.empty((0,4096))
targets = []
with torch.no_grad():
for step, (t_image, target, classess, image_path) in enumerate(test_loader):
t_image = t_image.cuda()
target = target.cuda()
target = target.data.cpu().numpy()
targets.append(target)
logits = model(t_image)
print(logits.shape)
logits = logits.data.cpu().numpy()
print(logits.shape)
logits_list = numpy.append(logits_list, logits, axis=0)
print(logits_list.shape)
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=1000)
tsne_results = tsne.fit_transform(logits_list)
target_ids = range(len(targets))
plt.scatter(tsne_results[:,0],tsne_results[:,1],c = target_ids ,cmap=plt.cm.get_cmap("jet", 14))
plt.colorbar(ticks=range(14))
plt.legend()
plt.show()
here is what this script has been produced: I am not sure why I have all colors for each cluster!
The VGG16 outputs over 25k features to the classifier. I believe it's too much to t-SNE. It's a good idea to include a new nn.Linear layer to reduce this number. So, t-SNE may work better. In addition, I'd recommend you two different ways to get the features from the model:
The best way to get it regardless of the model is by using the register_forward_hook method. You may find a notebook here with an example.
If you don't want to use the register, I'd suggest this one. After loading your model, you may use the following class to extract the features:
class FeatNet (nn.Module):
def __init__(self, vgg):
super(FeatNet, self).__init__()
self.features = nn.Sequential(*list(vgg.children())[:-1]))
def forward(self, img):
return self.features(img)
Now, you just need to call FeatNet(img) to get the features.
To include the feature reducer, as I suggested before, you need to retrain your model doing something like:
class FeatNet (nn.Module):
def __init__(self, vgg):
super(FeatNet, self).__init__()
self.features = nn.Sequential(*list(vgg.children())[:-1]))
self.feat_reducer = nn.Sequential(
nn.Linear(25088, 1024),
nn.BatchNorm1d(1024),
nn.ReLU()
)
self.classifier = nn.Linear(1024, 14)
def forward(self, img):
x = self.features(img)
x_r = self.feat_reducer(x)
return self.classifier(x_r)
Then, you can run your model returning x_r, that is, the reduced features. As I told you, 25k features are too much for t-SNE. Another method to reduce this number is by using PCA instead of nn.Linear. In this case, you send the 25k features to PCA and then train t-SNE using the PCA's output. I prefer using nn.Linear, but you need to test to check which one you get a better result.
I have a Keras model defined in the following manner (Tried to keep only the necessary parts):
temperature = 5.0
def knowledge_distillation_loss(y_true, y_pred, lambda_const):
y_true, logits = y_true[:, :10], y_true[:, 10:]
y_soft = K.softmax(logits/temperature)
y_pred, y_pred_soft = y_pred[:, :10], y_pred[:, 10:]
return lambda_const*logloss(y_true, y_pred) + logloss(y_soft, y_pred_soft)
def get_model(num_labels):
#Some layers for model
model.add(Dense(num_labels))
logits = model.layers[-1].output
probabilities = Activation('softmax')(logits)
# softed probabilities
logits_T = Lambda(lambda x: x/temperature)(logits)
probabilities_T = Activation('softmax')(logits_T)
output = concatenate([probabilities, probabilities_T])
model = Model(model.input, output)
lambda_const = 0.07
model.compile(
optimizer=optimizers.SGD(lr=1e-1, momentum=0.9, nesterov=True),
loss=lambda y_true, y_pred: knowledge_distillation_loss(y_true, y_pred, lambda_const),
metrics=[accuracy])
return model
I am following this reference.
This is implemented using fit generator() on Keras with tf backend. Obviously, I will have trouble when loading the model since temperature is hared coded.
Also,
I wish to update temperature parameter with respect to the epoch number in both loss function and model.
How do I define such a control signal?
I've turned this into a complete example of one way to do this.
You could make a class for the loss function.
class TemperatureLossFunction:
def __init__(self, temperature):
self.temperature = temperature
def loss_fun(self, y_truth, y_pred):
return self.temperature*keras.losses.mse(y_truth, y_pred)
def setTemperature(self, t, session=None):
if session:
session.run(self.temperature.assign( t )
elif tensorflow.get_default_session():
tensorflow.get_default_session().run(self.temperature.assign( t ))
class TemperatureLossCallback(keras.callbacks.Callback):
def __init__(self, temp_lf):
self.temp_lf = temp_lf
def on_epoch_end(self, epoch, params):
self.temp_lf.setTemperature(epoch)
I've created two methods for working with this, the first method creates and saves the model.
def init(session):
global temperature #global for serialization issues
temperature = tensorflow.Variable(5.0)
tlo = TemperatureLossFunction(temperature)
inp = keras.layers.Input((4,4))
l1 = keras.layers.Lambda( lambda x: temperature*x )
op = l1(inp)
m = keras.models.Model(inputs=[inp], outputs=[op])
m.compile( optimizer = keras.optimizers.SGD(0.01), loss=tlo.loss_fun)
#make sure the session is the one your using!
session.run(temperature.initializer)
The first test I run makes sure we are changing the value.
m.evaluate( numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)) )
session.run(temperature.assign(1))
m.evaluate( numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)) )
The second test I run makes sure we can change the values with a callback.
cb = TemperatureLossCallback(tlo)
def gen():
for i in range(10):
yield numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4))
m.fit_generator(
gen(), steps_per_epoch=1, epochs=10, callbacks=[cb]
)
m.save("junk.h5")
Finally, to demonstrate reloading the file.
def restart(session):
global temperature
temperature = tensorflow.Variable(5.0)
tlo = TemperatureLossFunction(temperature)
loss_fun = tlo.loss_fun
m = keras.models.load_model(
"junk.h5",
custom_objects = {"loss_fun":tlo.loss_fun}
)
session.run(temperature.initializer)
m.evaluate( numpy.ones((1, 4, 4)), numpy.zeros((1, 4, 4)) )
session.run(temperature.assign(1))
m.evaluate( numpy.ones( (1, 4, 4) ), numpy.zeros( ( 1, 4, 4) ) )
This is just the code I use to start the program for completeness
import sys
if __name__=="__main__":
sess = tensorflow.Session()
with sess.as_default():
if "restart" in sys.argv:
restart(sess)
else:
init(sess)
One downside of this method, if you run this you will see that the temperature variable does not get loaded from the model file. It takes on the value assigned in the code.
On the plus side, both the loss function and the layer are referencing the same Variable
One way I found to save the variable value is to create a new layer and use the variable as the weight for the new layer.
class VLayer(keras.layers.Layer):
def __init__(self, *args, **kwargs):
super().__init__(**kwargs)
def build(self, input_shape):
self.v1 = self.add_weight(
dtype="float32",
shape = (),
trainable=False,
initializer="zeros"
)
def call(self, x):
return x*self.v1
def setValue(self, val):
self.set_weights( numpy.array([val]) )
Now when you load the model, the weight will be loaded. Unfortunately, I could not find a way to link the weight to a Variable on load. So there will be two variables, one for the loss function and one for the layer. Both of them can be set from a callback though. So I feel this method is on a more robust path.
In Pytorch, we load the pretrained model as follows:
net.load_state_dict(torch.load(path)['model_state_dict'])
Then the network structure and the loaded model have to be exactly the same. However, is it possible to load the weights but then modify the network/add an extra parameter?
Note:
If we add an extra parameter to the model earlier before loading the weights, e.g.
self.parameter = Parameter(torch.ones(5),requires_grad=True)
we will get Missing key(s) in state_dict: error when loading the weights.
Let's create a model and save its' state.
class Model1(nn.Module):
def __init__(self):
super(Model1, self).__init__()
self.encoder = nn.LSTM(100, 50)
def forward(self):
pass
model1 = Model1()
torch.save(model1.state_dict(), 'filename.pt') # saving model
Then create a second model which has a few layers common to the first model. Load the states of the first model and load it to the common layers of the second model.
class Model2(nn.Module):
def __init__(self):
super(Model2, self).__init__()
self.encoder = nn.LSTM(100, 50)
self.linear = nn.Linear(50, 200)
def forward(self):
pass
model1_dict = torch.load('filename.pt')
model2 = Model2()
model2_dict = model2.state_dict()
# 1. filter out unnecessary keys
filtered_dict = {k: v for k, v in model1_dict.items() if k in model2_dict}
# 2. overwrite entries in the existing state dict
model2_dict.update(filtered_dict)
# 3. load the new state dict
model2.load_state_dict(model2_dict)
I have some troubles finding some example on the great www to how i implement a recurrent neural network with LSTM layer into my current Deep q-network in Pytorch so it become a DRQN.. Bear with me i am just getting started..
Futhermore, I am NOT working with images processing, thereby CNN so do not worry about this. My states are purely temperatures values.
Here is my code that i am currently train my DQN with:
# Importing the libraries
import numpy as np
import random # random samples from different batches (experience replay)
import os # For loading and saving brain
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim # for using stochastic gradient descent
import torch.autograd as autograd # Conversion from tensor (advanced arrays) to avoid all that contains a gradient
# We want to put the tensor into a varaible taht will also contain a
# gradient and to this we need:
from torch.autograd import Variable
# to convert this tensor into a variable containing the tensor and the gradient
# Creating the architecture of the Neural Network
class Network(nn.Module): #inherinting from nn.Module
#Self - refers to the object that will be created from this class
# - self here to specify that we're referring to the object
def __init__(self, input_size, nb_action): #[self,input neuroner, output neuroner]
super(Network, self).__init__() #inorder to use modules in torch.nn
# Input and output neurons
self.input_size = input_size
self.nb_action = nb_action
# Full connection between different layers of NN
# In this example its one input layer, one hidden layer and one output layer
# Using self here to specify that fc1 is a variable of my object
self.fc1 = nn.Linear(input_size, 40)
self.fc2 = nn.Linear(40, 30)
#Example of adding a hiddenlayer
# self.fcX = nn.Linear(30,30)
self.fc3 = nn.Linear(30, nb_action) # 30 neurons in hidden layer
# For function that will activate neurons and perform forward propagation
def forward(self, state):
# rectifier function
x = F.relu(self.fc1(state))
x = F.relu(self.fc2(x))
q_values = self.fc3(x)
return q_values
# Implementing Experience Replay
# We know that RL is based on MDP
# So going from one state(s_t) to the next state(s_t+1)
# We gonna put 100 transition between state into what we call the memory
# So we can use the distribution of experience to make a decision
class ReplayMemory(object):
def __init__(self, capacity):
self.capacity = capacity #100 transitions
self.memory = [] #memory to save transitions
# pushing transitions into memory with append
#event=transition
def push(self, event):
self.memory.append(event)
if len(self.memory) > self.capacity: #memory only contain 100 events
del self.memory[0] #delete first transition from memory if there is more that 100
# taking random sample
def sample(self, batch_size):
#Creating variable that will contain the samples of memory
#zip =reshape function if list = ((1,2,3),(4,5,6)) zip(*list)= (1,4),(2,5),(3,6)
# (state,action,reward),(state,action,reward)
samples = zip(*random.sample(self.memory, batch_size))
#This is to be able to differentiate with respect to a tensor
#and this will then contain the tensor and gradient
#so for state,action and reward we will store the seperately into some
#bytes which each one will get a gradient
#so that eventually we'll be able to differentiate each one of them
return map(lambda x: Variable(torch.cat(x, 0)), samples)
# Implementing Deep Q Learning
class Dqn():
def __init__(self, input_size, nb_action, gamma, lrate, T):
self.gamma = gamma #self.gamma gets assigned to input argument
self.T = T
# Sliding window of the evolving mean of the last 100 events/transitions
self.reward_window = []
#Creating network with network class
self.model = Network(input_size, nb_action)
#creating memory with memory class
#We gonna take 100000 samples into memory and then we will sample from this memory to
#to get a snakk number of random transitions
self.memory = ReplayMemory(100000)
#creating optimizer (stochastic gradient descent)
self.optimizer = optim.Adam(self.model.parameters(), lr = lrate) #learning rate
#input vector which is batch of input observations
#by unsqeeze we create a fake dimension to this is
#what the network expect for its inputs
#have to be the first dimension of the last_state
self.last_state = torch.Tensor(input_size).unsqueeze(0)
#Inilizing
self.last_action = 0
self.last_reward = 0
def select_action(self, state):
#Q value depends on state
#Temperature parameter T will be a positive number and the closer
#it is to ze the less sure the NN will when taking an action
#forexample
#softmax((1,2,3))={0.04,0.11,0.85} ==> softmax((1,2,3)*3)={0,0.02,0.98}
#to deactivate brain then set T=0, thereby it is full random
probs = F.softmax((self.model(Variable(state, volatile = True))*self.T),dim=1) # T=100
#create a random draw from the probability distribution created from softmax
action = probs.multinomial()
print(probs.multinomial())
return action.data[0,0]
# See section 5.3 in AI handbook
def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
#next input for target see page 7 in attached AI handbook
next_outputs = self.model(batch_next_state).detach().max(1)[0]
target = self.gamma*next_outputs + batch_reward
#Using hubble loss inorder to obtain loss
td_loss = F.smooth_l1_loss(outputs, target)
#using lass loss/error to perform stochastic gradient descent and update weights
self.optimizer.zero_grad() #reintialize the optimizer at each iteration of the loop
#This line of code that backward propagates the error into the NN
#td_loss.backward(retain_variables = True) #userwarning
td_loss.backward(retain_graph = True)
#And this line of code uses the optimizer to update the weights
self.optimizer.step()
def update(self, reward, new_signal):
#Updated one transition and we have dated the last element of the transition
#which is the new state
new_state = torch.Tensor(new_signal).float().unsqueeze(0)
self.memory.push((self.last_state, new_state, torch.LongTensor([int(self.last_action)]), torch.Tensor([self.last_reward])))
#After ending in a state its time to play a action
action = self.select_action(new_state)
if len(self.memory.memory) > 100:
batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(100)
self.learn(batch_state, batch_next_state, batch_reward, batch_action)
self.last_action = action
self.last_state = new_state
self.last_reward = reward
self.reward_window.append(reward)
if len(self.reward_window) > 1000:
del self.reward_window[0]
return action
def score(self):
return sum(self.reward_window)/(len(self.reward_window)+1.)
def save(self):
torch.save({'state_dict': self.model.state_dict(),
'optimizer' : self.optimizer.state_dict(),
}, 'last_brain.pth')
def load(self):
if os.path.isfile('last_brain.pth'):
print("=> loading checkpoint... ")
checkpoint = torch.load('last_brain.pth')
self.model.load_state_dict(checkpoint['state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer'])
print("done !")
else:
print("no checkpoint found...")
I hope there is someone out there that can help me and could implement a RNN and a LSTM layer into my code! I believe in you stackflow!
Best regards Søren Koch
From my point of view, I think you could add RNN, LSTM layer to the Network#__init__,Network#forward; shape of data should be reshaped into sequences...
For more detail, I think you should read these two following articles; after that implementing RNN, LSTM not hard as it seem to be.
http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sphx-glr-beginner-nlp-sequence-models-tutorial-py
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html