Adding parameters to a pretrained model - pytorch

In Pytorch, we load the pretrained model as follows:
net.load_state_dict(torch.load(path)['model_state_dict'])
Then the network structure and the loaded model have to be exactly the same. However, is it possible to load the weights but then modify the network/add an extra parameter?
Note:
If we add an extra parameter to the model earlier before loading the weights, e.g.
self.parameter = Parameter(torch.ones(5),requires_grad=True)
we will get Missing key(s) in state_dict: error when loading the weights.

Let's create a model and save its' state.
class Model1(nn.Module):
def __init__(self):
super(Model1, self).__init__()
self.encoder = nn.LSTM(100, 50)
def forward(self):
pass
model1 = Model1()
torch.save(model1.state_dict(), 'filename.pt') # saving model
Then create a second model which has a few layers common to the first model. Load the states of the first model and load it to the common layers of the second model.
class Model2(nn.Module):
def __init__(self):
super(Model2, self).__init__()
self.encoder = nn.LSTM(100, 50)
self.linear = nn.Linear(50, 200)
def forward(self):
pass
model1_dict = torch.load('filename.pt')
model2 = Model2()
model2_dict = model2.state_dict()
# 1. filter out unnecessary keys
filtered_dict = {k: v for k, v in model1_dict.items() if k in model2_dict}
# 2. overwrite entries in the existing state dict
model2_dict.update(filtered_dict)
# 3. load the new state dict
model2.load_state_dict(model2_dict)

Related

SequenceClassifierOutput has generator as loss instead of a tensor

I'm doing Distillation from a Roberta with an Adapter, I'm following this tutorial
and in the function distill_roberta_weights() I just change teacher_model.config.to_dict()
to student.load_state_dict(teacher.state_dict(), strict=False), so the student model has the adapter too.
But when I am training the distillation using the
DistillationTrainer
from here
I get the following error
Do you have any idea of what is the problem?
The student_output has a loss generator instead the tensor, the part of the cross entropy does not have any problem as it uses the logits from the outputs.
EDIT:
I am adding more information
def distill_weights(teacher, student):
"""
Recursively copies the weights of the (teacher) to the (student).
This function is meant to be first called on a RobertaFor... model, but is then called on every children of that model recursively.
The only part that's not fully copied is the encoder, of which only half is copied.
"""
# If the part is an entire RoBERTa model or a RobertaFor..., unpack and iterate
if isinstance(teacher, RobertaModel) or type(teacher).__name__.startswith('RobertaFor'):
for teacher_part, student_part in zip(teacher.children(), student.children()):
distill_weights(teacher_part, student_part)
# Else if the part is an encoder, copy one out of every layer
elif isinstance(teacher, RobertaEncoder):
teacher_encoding_layers = [layer for layer in next(teacher.children())]
student_encoding_layers = [layer for layer in next(student.children())]
for i in range(len(student_encoding_layers)):
student_encoding_layers[i].load_state_dict(teacher_encoding_layers[2*i].state_dict())
# Else the part is a head or something else, copy the state_dict
else:
student.load_state_dict(teacher.state_dict(), strict=False)
def distill_roberta_based(teacher_model):
"""
Distilates a RoBERTa (teacher_model) like would DistilBERT for a BERT model.
The student model has the same configuration, except for the number of hidden layers, which is // by 2.
The student layers are initilized by copying one out of two layers of the teacher, starting with layer 0.
The head of the teacher is also copied.
"""
# Set student configuration
configuration = teacher_model.config.to_dict()
configuration['num_hidden_layers'] //= 2
configuration = RobertaConfig.from_dict(configuration)
# create student model
student_model = type(teacher_model)(configuration)
distill_weights(teacher=teacher_model, student=student_model)
return student_model
#function for train the Distillated model
class DistillationTrainer(Trainer):
def __init__(self, *args, teacher_model=None, **kwargs):
super().__init__(*args, **kwargs)
self.teacher = teacher_model
# place teacher on same device as student
self._move_model_to_device(self.teacher,self.model.device)
self.teacher.eval()
def compute_loss(self, model, inputs, return_outputs = False) :
"""
The distillation loss for distilating a BERT-like model.
The loss takes the (teacher_logits), (student_logits) and (labels) for various losses.
The (temperature) can be given, otherwise it's set to 1 by default.
"""
outputs_student = model(**inputs)
print(outputs_student)
student_loss = outputs_student.loss
# compute teacher output
with torch.no_grad():
outputs_teacher = self.teacher(**inputs)
# assert size
assert outputs_student.logits.size() == outputs_teacher.logits.size()
# Classification loss (problem-specific loss)
loss_function = CrossEntropyLoss()
# Temperature and sotfmax
student_logits = F.softmax (outputs_student.logits / self.args.temperature, dim=-1)
teacher_logits = F.softmax (outputs_teacher.logits / self.args.temperature, dim=-1)
loss_logits = loss_function(student_logits, teacher_logits)
# Return weighted student loss
loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits
return (loss, outputs_student) if return_outputs else loss
#create the student
student_model_adapter = distill_roberta_based(teacher_model)
#activate adapter
student_model_adapter.set_active_adapters('parallel')
student_model_adapter.train_adapter('parallel')
trainer = DistillationTrainer(
student_model_adapter,
training_args,
teacher_model=teacher_model,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)
trainer.args._n_gpu = 4
So, the desired output of outputs_student should be like
SequenceClassifierOutput(loss=tensor([0.6899, 0.6902, 0.6926, 0.6913, 0.6906, 0.6904, 0.6922, 0.6917],
device='cuda:0', grad_fn=<GatherBackward>), logits=tensor([[-1.2512e-03, -9.7885e-03],
[ 6.2714e-03, -5.7755e-03],.....])
But instead the output is
SequenceClassifierOutput(loss=<generator object gather.<locals>.gather_map.<locals>.<genexpr> at 0x7f5bb4fbe9d0>, logits=tensor([[-0.0150, 0.0075],
[-0.0122, 0.0181],...

How keras.utils.Sequence works?

I am trying to create a data pipeline for U-net for Image Segmentation. I came across Keras.utils.Sequence class through which, I can create a data pipeline, But I am unable to understand how this is working.
link for the code Keras code , Source code
def __iter__(self):
"""Create a generator that iterate over the Sequence."""
for item in (self[i] for i in range(len(self))):
yield item
I will highly appreciate if anyone can tell me how this works ?
You don't need a generator. The sequence class is there to manage that. You need to define a class inherited from tensorflow.keras.utils.Sequence and define the methods:
__init__, __getitem__, __len__. In addition, you can define the method on_epoch_end, which is called at the end of each epoch and is usually used to shuffle the sample indexes.
There is an example in the link you gave Tensorflow Sequence.
Below is another example of Sequence.
Note that you can pass the data to the __init__ constructor, but you may as well read the data from files in the __getitem__ method, assuming you know where to read it, e.g. by passing the name of a directory or directories into the constructor. This is necessary if there is a lot of data.
from tensorflow import keras
import numpy as np
class SequenceExample(keras.utils.Sequence):
def __init__(self, x_in, y_in, batch_size, shuffle=True):
# Initialization
self.batch_size = batch_size
self.shuffle = shuffle
self.x = x_in
self.y = y_in
self.datalen = len(y_in)
self.indexes = np.arange(self.datalen)
if self.shuffle:
np.random.shuffle(self.indexes)
def __getitem__(self, index):
# get batch indexes from shuffled indexes
batch_indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
x_batch = self.x[batch_indexes]
y_batch = self.y[batch_indexes]
return x_batch, y_batch
def __len__(self):
# Denotes the number of batches per epoch
return self.datalen // self.batch_size
def on_epoch_end(self):
# Updates indexes after each epoch
self.indexes = np.arange(self.datalen)
if self.shuffle:
np.random.shuffle(self.indexes)

How to restore KerasClassfier?

After saved weights and json configuration of a KerasClassifier model https://github.com/keras-team/keras/blob/master/keras/wrappers/scikit_learn.py I need to restore it and verify results.
But if I restore weight and model then I have a Sequential object, how can I rebuild original KerasClassifier from that??
I'm not sure I understood you correcly, but propose following solution. KerasClassifier inherits from BaseWrapper which has the following __init__ signature:
def __init__(self, build_fn=None, **sk_params):
self.build_fn = build_fn
self.sk_params = sk_params
self.check_params(sk_params)
okay, what's the build_fn and sk_params?
The build_fn should construct, compile and return a Keras model, which
will then be used to fit/predict. One of the following
three values could be passed to build_fn:
1. A function
2. An instance of a class that implements the __call__ method
3. None. This means you implement a class that inherits from either
KerasClassifier or KerasRegressor. The __call__ method of the
present class will then be treated as the default build_fn.
...
sk_params takes both model parameters and fitting parameters. Legal model
parameters are the arguments of build_fn. Note that like all other
estimators in scikit-learn, build_fn should provide default values for
its arguments, so that you could create the estimator without passing any
values to sk_params.
...
some commints are omitted
you can read full comment at this and this links.
As the build_fn expects the function which returns compiled keras model (no matter what it is - Sequential or just Model) - you can pass as value function which returns loaded model.
Edit also you should call fit with some params to restore model using that approach.
load model as build_fn
fit method invokes a build_fn, hence each time you try to train such classifier you will load and then fit loaded clssifier.
For example:
from keras.models import load_model # or another method - but this one is simpliest
from keras.wrappers.scikit_learn import KerasClassifier
def load_model(*args, **kwargs):
"""probably this function expects sk_params, so you can use it in theory"""
path="my_model.hd5"
model = load_model(path)
return model
keras_classifier = KerasClassifier(load_model, sk_params) # use your sk_params
keras_classifier.fit(X_tr, y_tr) # I use slice (1, input_shape) to train
- it will work, as the loaded model almost trained & compiled. But it gives a small shift for your model even if you'll call it with a batch of size 1 and for 1 epoch.
load via build_fn closure
Also you can load the model first (if you wish to provide path easily and it's unnacceptable to hardcode path), then return a function which is "build_fn - acceptable":
def load_model_return_build_fn(path):
model = load_model(path)
def build_fn(*args, **kwars):
"""probably this function expects sk_params"""
return model # defined above
return build_fn
build_fn = load_model_return_build_fn("model.hd5")
keras_classifier = KerasClassifier(build_fn, sk_params) # use your sk_params
keras_classifier.fit(X_tr, y_tr) # I use slice (1, input_shape) to train
assign a model to it's attribute
If you plan just load and use pre-trained model, you can use any to load it, assign to the model attribute and don't call fit.
build_fn = load_model_return_build_fn("model.hd5")
# or the function which realy builds and fits a model
keras_classifier = KerasClassifier(build_fn, sk_params) # use your sk_params
keras_classifier.model = model # assign model here, don't call fit
- that case you set model explicitly to it's attribute. Note that build_fn should be a coorrect one build_fn - otherwise it doesn't pass the self.check_params(sk_params) test.
Inherit from KerasClassifier (not so easy as I've thought)
After all, the best solution I know is inherit from KerasClassifier and add a load and/or from_file method.
class KerasClassifierLoadable(KerasClassifier):
#classmethod
def from_file(cls, path, *args, **kwargs):
keras_classifier = cls(*args, **kwargs)
keras_classifier.model = load_model(path)
outp_shape = keras_classifier.model.layers[-1].output_shape[-1]
if outp_shape > 1:
keras_classifier.classes_ = np.arange(outp_shape, dtype='int32')
else:
raise ValueError("Inconsistent output shape: outp_shape={}".format(outp_shape))
keras_classifier.n_classes_ = len(keras_classifier.classes_)
return keras_classifier
def load(self, path):
self.model = load_model(path)
outp_shape = keras_classifier.model.layers[-1].output_shape[-1]
if outp_shape > 1:
keras_classifier.classes_ = np.arange(outp_shape, dtype='int32')
else:
raise ValueError("Inconsistent output shape: outp_shape={}".format(outp_shape))
self.n_classes_ = len(self.classes_)
here we shoul set self.classes_ to the correct class labels - but I use just an integer values from `range(0, n_classes).
Usage (the build_fn can be any appropiate build_fn):
keras_classifier = KerasClassifierLoadable.from_file("model.hd5", build_fn=build_fn)
keras_classifier = KerasClassifierLoadable(build_fn=build_fn)
keras_classifier.load("model.hd5")
If you have two files model.json and weights.h5, then you can easily load the model and use it as you want.
from keras.models import model_from_json
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("model.h5")
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

Implementing RNN and LSTM into DQN Pytorch code

I have some troubles finding some example on the great www to how i implement a recurrent neural network with LSTM layer into my current Deep q-network in Pytorch so it become a DRQN.. Bear with me i am just getting started..
Futhermore, I am NOT working with images processing, thereby CNN so do not worry about this. My states are purely temperatures values.
Here is my code that i am currently train my DQN with:
# Importing the libraries
import numpy as np
import random # random samples from different batches (experience replay)
import os # For loading and saving brain
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim # for using stochastic gradient descent
import torch.autograd as autograd # Conversion from tensor (advanced arrays) to avoid all that contains a gradient
# We want to put the tensor into a varaible taht will also contain a
# gradient and to this we need:
from torch.autograd import Variable
# to convert this tensor into a variable containing the tensor and the gradient
# Creating the architecture of the Neural Network
class Network(nn.Module): #inherinting from nn.Module
#Self - refers to the object that will be created from this class
# - self here to specify that we're referring to the object
def __init__(self, input_size, nb_action): #[self,input neuroner, output neuroner]
super(Network, self).__init__() #inorder to use modules in torch.nn
# Input and output neurons
self.input_size = input_size
self.nb_action = nb_action
# Full connection between different layers of NN
# In this example its one input layer, one hidden layer and one output layer
# Using self here to specify that fc1 is a variable of my object
self.fc1 = nn.Linear(input_size, 40)
self.fc2 = nn.Linear(40, 30)
#Example of adding a hiddenlayer
# self.fcX = nn.Linear(30,30)
self.fc3 = nn.Linear(30, nb_action) # 30 neurons in hidden layer
# For function that will activate neurons and perform forward propagation
def forward(self, state):
# rectifier function
x = F.relu(self.fc1(state))
x = F.relu(self.fc2(x))
q_values = self.fc3(x)
return q_values
# Implementing Experience Replay
# We know that RL is based on MDP
# So going from one state(s_t) to the next state(s_t+1)
# We gonna put 100 transition between state into what we call the memory
# So we can use the distribution of experience to make a decision
class ReplayMemory(object):
def __init__(self, capacity):
self.capacity = capacity #100 transitions
self.memory = [] #memory to save transitions
# pushing transitions into memory with append
#event=transition
def push(self, event):
self.memory.append(event)
if len(self.memory) > self.capacity: #memory only contain 100 events
del self.memory[0] #delete first transition from memory if there is more that 100
# taking random sample
def sample(self, batch_size):
#Creating variable that will contain the samples of memory
#zip =reshape function if list = ((1,2,3),(4,5,6)) zip(*list)= (1,4),(2,5),(3,6)
# (state,action,reward),(state,action,reward)
samples = zip(*random.sample(self.memory, batch_size))
#This is to be able to differentiate with respect to a tensor
#and this will then contain the tensor and gradient
#so for state,action and reward we will store the seperately into some
#bytes which each one will get a gradient
#so that eventually we'll be able to differentiate each one of them
return map(lambda x: Variable(torch.cat(x, 0)), samples)
# Implementing Deep Q Learning
class Dqn():
def __init__(self, input_size, nb_action, gamma, lrate, T):
self.gamma = gamma #self.gamma gets assigned to input argument
self.T = T
# Sliding window of the evolving mean of the last 100 events/transitions
self.reward_window = []
#Creating network with network class
self.model = Network(input_size, nb_action)
#creating memory with memory class
#We gonna take 100000 samples into memory and then we will sample from this memory to
#to get a snakk number of random transitions
self.memory = ReplayMemory(100000)
#creating optimizer (stochastic gradient descent)
self.optimizer = optim.Adam(self.model.parameters(), lr = lrate) #learning rate
#input vector which is batch of input observations
#by unsqeeze we create a fake dimension to this is
#what the network expect for its inputs
#have to be the first dimension of the last_state
self.last_state = torch.Tensor(input_size).unsqueeze(0)
#Inilizing
self.last_action = 0
self.last_reward = 0
def select_action(self, state):
#Q value depends on state
#Temperature parameter T will be a positive number and the closer
#it is to ze the less sure the NN will when taking an action
#forexample
#softmax((1,2,3))={0.04,0.11,0.85} ==> softmax((1,2,3)*3)={0,0.02,0.98}
#to deactivate brain then set T=0, thereby it is full random
probs = F.softmax((self.model(Variable(state, volatile = True))*self.T),dim=1) # T=100
#create a random draw from the probability distribution created from softmax
action = probs.multinomial()
print(probs.multinomial())
return action.data[0,0]
# See section 5.3 in AI handbook
def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
#next input for target see page 7 in attached AI handbook
next_outputs = self.model(batch_next_state).detach().max(1)[0]
target = self.gamma*next_outputs + batch_reward
#Using hubble loss inorder to obtain loss
td_loss = F.smooth_l1_loss(outputs, target)
#using lass loss/error to perform stochastic gradient descent and update weights
self.optimizer.zero_grad() #reintialize the optimizer at each iteration of the loop
#This line of code that backward propagates the error into the NN
#td_loss.backward(retain_variables = True) #userwarning
td_loss.backward(retain_graph = True)
#And this line of code uses the optimizer to update the weights
self.optimizer.step()
def update(self, reward, new_signal):
#Updated one transition and we have dated the last element of the transition
#which is the new state
new_state = torch.Tensor(new_signal).float().unsqueeze(0)
self.memory.push((self.last_state, new_state, torch.LongTensor([int(self.last_action)]), torch.Tensor([self.last_reward])))
#After ending in a state its time to play a action
action = self.select_action(new_state)
if len(self.memory.memory) > 100:
batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(100)
self.learn(batch_state, batch_next_state, batch_reward, batch_action)
self.last_action = action
self.last_state = new_state
self.last_reward = reward
self.reward_window.append(reward)
if len(self.reward_window) > 1000:
del self.reward_window[0]
return action
def score(self):
return sum(self.reward_window)/(len(self.reward_window)+1.)
def save(self):
torch.save({'state_dict': self.model.state_dict(),
'optimizer' : self.optimizer.state_dict(),
}, 'last_brain.pth')
def load(self):
if os.path.isfile('last_brain.pth'):
print("=> loading checkpoint... ")
checkpoint = torch.load('last_brain.pth')
self.model.load_state_dict(checkpoint['state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer'])
print("done !")
else:
print("no checkpoint found...")
I hope there is someone out there that can help me and could implement a RNN and a LSTM layer into my code! I believe in you stackflow!
Best regards Søren Koch
From my point of view, I think you could add RNN, LSTM layer to the Network#__init__,Network#forward; shape of data should be reshaped into sequences...
For more detail, I think you should read these two following articles; after that implementing RNN, LSTM not hard as it seem to be.
http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sphx-glr-beginner-nlp-sequence-models-tutorial-py
http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

KeyError: 'unexpected key "module.encoder.embedding.weight" in state_dict'

I am getting the following error while trying to load a saved model.
KeyError: 'unexpected key "module.encoder.embedding.weight" in state_dict'
This is the function I am using to load a saved model.
def load_model_states(model, tag):
"""Load a previously saved model states."""
filename = os.path.join(args.save_path, tag)
with open(filename, 'rb') as f:
model.load_state_dict(torch.load(f))
The model is a sequence-to-sequence network whose init function (constructor) is given below.
def __init__(self, dictionary, embedding_index, max_sent_length, args):
""""Constructor of the class."""
super(Sequence2Sequence, self).__init__()
self.dictionary = dictionary
self.embedding_index = embedding_index
self.config = args
self.encoder = Encoder(len(self.dictionary), self.config)
self.decoder = AttentionDecoder(len(self.dictionary), max_sent_length, self.config)
self.criterion = nn.NLLLoss() # Negative log-likelihood loss
# Initializing the weight parameters for the embedding layer in the encoder.
self.encoder.init_embedding_weights(self.dictionary, self.embedding_index, self.config.emsize)
When I print the model (sequence-to-sequence network), I get the following.
Sequence2Sequence (
(encoder): Encoder (
(drop): Dropout (p = 0.25)
(embedding): Embedding(43723, 300)
(rnn): LSTM(300, 300, batch_first=True, dropout=0.25)
)
(decoder): AttentionDecoder (
(embedding): Embedding(43723, 300)
(attn): Linear (600 -> 12)
(attn_combine): Linear (600 -> 300)
(drop): Dropout (p = 0.25)
(out): Linear (300 -> 43723)
(rnn): LSTM(300, 300, batch_first=True, dropout=0.25)
)
(criterion): NLLLoss (
)
)
So, module.encoder.embedding is an embedding layer, and module.encoder.embedding.weight represents the associated weight matrix. So, why it says- unexpected key "module.encoder.embedding.weight" in state_dict?
I solved the problem. Actually I was saving the model using nn.DataParallel, which stores the model in module, and then I was trying to load it without DataParallel. So, either I need to add a nn.DataParallel temporarily in my network for loading purposes, or I can load the weights file, create a new ordered dict without the module prefix, and load it back.
The second workaround looks like the following.
# original saved file with DataParallel
state_dict = torch.load('myfile.pth.tar')
# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = k[7:] # remove `module.`
new_state_dict[name] = v
# load params
model.load_state_dict(new_state_dict)
Reference: https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686

Resources