Some parameters are not getting saved when saving a model in pytorch - pytorch

I have built an encoder-decoder model with attention for morph inflection generation. I am able to train the model and predict on test data but I am getting wrong predicting after loading a saved model
I am not getting any error during saving or loading but
When I load a saved model its predictions are completely wrong. It looks like some parameters are not getting saved.
I have tried to load and save the model using both techniques
using state_dict() eg. torch.save(encoder.state_dict(),'path')
saving complete model eg.torch.save(encoder,'path')
I have tried to save different classes one by one and also making a superclass that initiates all those class and then saving just superclass
but nothing seems to be working
Encoder class
class Encoder(nn.Module):
def __init__(self,vocab_size,embedding_size, encoder_hid_dem,decoder_hid_dem,bidirectional,dropout):
super().__init__()
self.encoder_hid_dem = encoder_hid_dem
self.encoder_n_direction=1;
self.bias = False
self.dropout=dropout
if(bidirectional==True):
self.encoder_n_direction=2;
self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, padding_idx=0)
self.GRU_layer = nn.GRU(input_size=embedding_size, hidden_size=encoder_hid_dem, batch_first=True, bidirectional=bidirectional)
self.fc = nn.Linear(encoder_hid_dem*self.encoder_n_direction,decoder_hid_dem)
self.dropout = nn.Dropout(dropout)
def forward(self, input_word):
# print(input_word.size())
#[batch_size src_sent_lent]
embed_out = self.embedding_layer(input_word)
#[BATCH_SIZE src_sent_lent embedding_dim]
embed_out = F.relu(embed_out)
embed_out = self.dropout(embed_out)
self.batch = embed_out.size()[0]
# hidden = self.init_hidden()
GRU_out,hidden = self.GRU_layer(embed_out)
# print(GRU_out.size())
# print(hidd.size())
#[BATCH_SIZE sec_sent_len n_direction*hid_dem]
#[n_layer*n_direction batch_size hid_dem]
#where the first hid_dim elements in the third axis are the hidden states from the top layer forward RNN, and the last hid_dim elements are hidden states from the top layer backward RNN
#hidden is stacked [forward_1, backward_1, forward_2, backward_2, ...]
#hidden [-2, :, : ] is the last of the forwards RNN
#hidden [-1, :, : ] is the last of the backwards RNN
GRU_out = F.relu(GRU_out)
hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)))
# print(GRU_out.size())
# print(hidden.size())
#outputs = [batch_size src sent len, encoder_hid_dim * n_direction]
#hidden = [batch size, dec hid dim]
return GRU_out,hidden
def init_hidden(self):
return (Variable(torch.eye(1, self.encoder_hid_dem)).unsqueeze(1).repeat(2, self.batch, 1).to(self.device))
Attention class
class Attention(nn.Module):
def __init__(self,encoder_hid_dem,decoder_hid_dem,bidirectional):
super().__init__()
self.enc_hid_dim = encoder_hid_dem
self.dec_hid_dim = decoder_hid_dem
self.encoder_n_direction=1;
if(bidirectional==True):
self.encoder_n_direction=2;
self.attn = nn.Linear((encoder_hid_dem * self.encoder_n_direction) + decoder_hid_dem, decoder_hid_dem)
self.v = nn.Parameter(torch.rand(decoder_hid_dem))
def forward(self, hidden, encoder_outputs):
#hidden = [batch size, dec hid dim]
#encoder_outputs = [batch_size ,src sent len, enc hid dim * encoder_n_direction]
batch_size = encoder_outputs.shape[0]
src_len = encoder_outputs.shape[1]
hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
#hidden = [batch size, src sent len, dec hid dim]
#encoder_outputs = [batch size, src sent len, enc hid dim * encoder_n_direction]
energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
#energy = [batch size, src sent len, dec hid dim]
energy = energy.permute(0, 2, 1)
#energy = [batch size, dec hid dim, src sent len]
#v = [dec hid dim]
v = self.v.repeat(batch_size, 1).unsqueeze(1)
#v = [batch size, 1, dec hid dim]
attention = torch.bmm(v, energy).squeeze(1)
#attention= [batch size, src len]
return F.softmax(attention, dim=1)
Decoder class
class Decoder(nn.Module):
def __init__(self, decoder_hid_dem, encoder_hid_dem, vocab_size,embedding_dim,attention,decoder_input_size,linear_input_size,bidirectional,dropout):
super().__init__()
self.encoder_hid_dem=encoder_hid_dem
self.decoder_hid_dem=decoder_hid_dem
self.attention=attention
self.dropout = dropout
self.output_dim = vocab_size
self.decoder_n_direction=1;
if(bidirectional==True):
self.decoder_n_direction=2;
self.GRU_layer_out = nn.GRU(decoder_input_size,decoder_hid_dem)
self.out_layer = nn.Linear(in_features=linear_input_size, out_features=vocab_size)
self.dropout = nn.Dropout(dropout)
#self.GRU_layer_out.bias = torch.nn.Parameter(torch.zeros(decoder_input_size))
def forward(self, feature, hidden,actual_word,encoder_outputs):
feature = feature.unsqueeze(1)
# print('decoder')
# print(feature.size())
#[batch_size src_sent_lent=1 feat_size=6]
# print(hidden.size())
# [batch_size dec_hid_dim]
# print(actual_word.size())
# [batch_size src_sent_lent=1 embedding_dim]
# print(encoder_outputs.size())
# outputs = [batch_size src sent len, encoder_hid_dim * encoder_n_directional]
a = self.attention(hidden,encoder_outputs)
# print(a.size())
# [batch_size src_sent_len]
a = a.unsqueeze(1)
#a = [batch size, 1, src len]
weighted = torch.bmm(a,encoder_outputs)
# print(weighted.size())
# weighted = [batch size, 1, enc_hid_dim * encoder_n_direction]
# if len(actual_word.size()) != 0:
input_char = torch.cat((actual_word,feature,weighted),2)
# else:
# input_char = torch.cat((feature,weighted),2)
input_char=input_char.permute(1,0,2)
# print(input_char.size())
# [1 BATCH_SIZE decoder_input_size]
hidden = hidden.unsqueeze(0)
# print(hidden.size())
#[1 batch_size decoder_hid_dem]
output, hidden = self.GRU_layer_out(input_char, hidden)
# print(output.size())
# [sent_len=1 batch_size decoder_n_direction*decoder_hid_dem]
# print(hidden.size())
# [n_layer*n_direction BATCH_SIZE hid_dem]
output = F.leaky_relu(output)
output = self.dropout(output)
output = torch.cat((output.squeeze(0),weighted.squeeze(1),actual_word.squeeze(1)),dim=1)
pre_out = self.out_layer(output)
predicted_output = F.log_softmax(pre_out, dim=1)
# print(predicted_output.size())
# [ batch_size vacab_size ]
return predicted_output, hidden.squeeze(0)
def init_hidden(self, batch):
return (Variable(torch.eye(1, self.decoder_hid_dem)).unsqueeze(1).repeat(1, batch, 1).to(self.device),Variable(torch.eye(1, self.decoder_hid_dem)).unsqueeze(1).repeat(1, batch, 1).to(self.device))
seq2seq class
class Seq2Seq(nn.Module):
def __init__(self,encoder,decoder,device):
super().__init__()
self.encoder = encoder
self.decoder = decoder
self.device = device
def forward(self,input_word,output_word,features_word,teaching_forcing_ratio,limit):
#print(input_word)
#print(input_word.size())
input_word = input_word.to(self.device)
output_word = output_word.to(self.device)
features_word = features_word.to(self.device)
batch_size= input_word.size()[0]
if(limit==0):
max_len = input_word.size()[1]
else:
max_len = limit
vocabsize = self.decoder.output_dim
actual_word = self.encoder.embedding_layer(torch.tensor(char_to_index['<sos>']).view(1, -1).to(self.device)).repeat(batch_size, 1, 1)
encoder_outputs,hidden = self.encoder(input_word)
features=features_word[:,:]
predicted_word = torch.zeros(max_len,batch_size,vocabsize).to(self.device)
for t in range(1,max_len):
output,hidden=self.decoder(features, hidden,actual_word,encoder_outputs)
#print(output.size())
predicted_word[t] = output
topv, topi = output.topk(1)
bs = topi.size()[0]
temp2 = torch.zeros(0,1,300).to(self.device)
for row in range(bs):
index = topi[row][0].item()
temp = self.encoder.embedding_layer(torch.tensor(index).view(1, -1).to(self.device))
temp2 = torch.cat((temp2,temp))
teacher_force = random.random() < teaching_forcing_ratio
if teacher_force == 1:
actual_word = self.encoder.embedding_layer(output_word[:,t]).unsqueeze(1)
else:
actual_word = temp2
return predicted_word
and this code is used to save and load model
torch.save(model.state_dict(), 'model.pt')
model.load_state_dict(torch.load('model.pt'))
I want that when I run my model on pre-trained weights, it should predict correctly acc to those weights

Your provided code for saving/loading parameters is wrong. The loading and saving model parameters are pretty straight-forward. In your case, it should be:
# loading
saved_params = torch.load(
filename, map_location=lambda storage, loc: storage
)
s2s.load_state_dict(saved_params)
# saving
params = s2s.state_dict()
torch.save(params, filename)
[Update]
You need to make the Seq2Seq class a derived class of PyTorch's nn.Module just like your encoder/decoder classes. Otherwise, you can't use the state_dict() method. You can assume Seq2Seq class is like a container that contains your whole network, although it does not have any learnable weights itself.

Related

nn.CrossEntropyLoss attribute Error while trying to develop video to caption generator in pytorch

I am getting
AttributeError: 'CrossEntropyLoss' object has no attribute 'dim'
This is the code block which I run and got the error
hidden_size=256
encoder1=VideoEncoderGRU(hidden_size)
encoder1=accelerator.prepare(encoder1)
decoder1 =DecoderRNN(hidden_size, vcd.lang_object.n_words).to(device)
decoder1=accelerator.prepare(decoder1)
encoder_hidden=encoder1.initHidden()
trainIters_modified(encoder1, decoder1,encoder_hidden)
A more detailed code of each layer.
trainIters_modified - It just a function that trains for a multiple number of times.
def trainIters_modified(encoder,decoder,encoder_hidden,print_every=10,plot_every=10):
start=time.time()
plot_losses=[]
print_loss_total=0 # Reset every print
plot_loss_total=0 #plot every
encoder_optimizer = optim.Adagrad(encoder.parameters())
decoder_optimizer = optim.Adagrad(decoder.parameters())
encoder_optimizer=accelerator.prepare(encoder_optimizer)
decoder_optimizer=accelerator.prepare(decoder_optimizer)
criterion = nn.CrossEntropyLoss()
for ep in range(1):
for vid,lab in train_loader:
n_iters=vid.shape[0]
for iter in range(1, n_iters + 1):
input_tensor = vid[iter-1]
target_tensor = lab[iter-1]
loss,encoder_hidden = train(input_tensor, target_tensor, encoder,decoder, encoder_optimizer, decoder_optimizer, criterion,encoder_hidden)
print_loss_total += loss
plot_loss_total += loss
if iter % print_every == 0:
print_loss_avg = print_loss_total / print_every
print_loss_total = 0
print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),iter, iter / n_iters * 100, print_loss_avg))
if iter % plot_every == 0:
plot_loss_avg = plot_loss_total / plot_every
plot_losses.append(plot_loss_avg)
plot_loss_total = 0
#break
showPlot(plot_losses)
train function
teacher_forcing_ratio=0.5
def train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,decoder_optimizer,encoder_hidden,criterion,max_length=MAX_LENGTH):
encoder_hidden=encoder_hidden
encoder_optimizer.zero_grad() #set's encoder gradients to zero
decoder_optimizer.zero_grad() #set's decoder gradients to zero
input_length = input_tensor.size(0) #no_of_wordsin*
target_length = target_tensor.size(0)
#print(f"input.shape:{input_tensor.shape},input_length:{input_length}")
#print(f"target.shape{target_tensor.shape},target_length:{target_length}")
encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
#print(f"encoder_outputs:{encoder_outputs.shape},max_length:{max_length},encoder.hidden_size={encoder.hidden_size}")
loss = 0
for ei in range(input_length):
in_tensor=torch.permute(input_tensor[ei],(2,1,0))
#print("passing input tensor.shape",in_tensor.shape) #torch.Size([1])
#print("encoder_hidden.shape:",encoder_hidden.shape) #torch.Size([1, 1, 256])
encoder_output, encoder_hidden = encoder(in_tensor, encoder_hidden) #
#print("encoder_output:",encoder_output.shape)
#print("encoder_hidden:",encoder_hidden.shape)
#the outputs are being stored in encoder_outputs matrix
encoder_outputs[ei]=encoder_output[0,0]
#after we have trained encoder for one epoch
decoder_input = torch.tensor([[SOS_token]], device=device) #SOS_token and EOS_token where defined above.
#set the encoder hidden as decoder hidden state
decoder_hidden = encoder_output
use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
if use_teacher_forcing:
#teacher forcing , feed the target as next input
for di in range(target_length):
decoder_output,decoder_hidden = decoder(decoder_input,decoder_hidden)
loss += criterion(decoder_output, target_tensor[di])
decoder_input = target_tensor[di]
else:
# Without teacher forcing: use its own predictions as the next input
for di in range(target_length):
decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
#print("decoder_output.shape:",decoder_output.shape,"decoder_output:",decoder_output)
topv, topi = decoder_output.topk(1)
#print("tov,topi:",topv,topi)
decoder_input = topi.squeeze().detach() # detach from history as input
#print("decoder_input:",decoder_input.shape,"decoder_input:",decoder_input)
loss += criterion(decoder_output, target_tensor[di])
#print(loss)
if decoder_input.item() == EOS_token:
break
#loss.backward()
accelerator.backward(loss)
encoder_optimizer.step()
decoder_optimizer.step()
return (loss.item() / target_length) ,encoder_hidden
EncoderGRU
class VideoEncoderGRU(nn.Module):
def __init__(self,hidden_size):
super(VideoEncoderGRU,self).__init__()
self.vgg=vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
self.vgg.classifier=nn.Sequential(*list(self.vgg.classifier.children())[0:0])
self.vegru_classifier=nn.Sequential(
nn.Linear(512 * 7 * 7, 1024)
)
self.gru=nn.GRU(1024,hidden_size)
self.hidden_size=hidden_size
def initHidden(self):
return torch.rand(1, 1, self.hidden_size, device=device)
def forward(self,input,hidden):
out=self.vgg(input)
out=torch.reshape(out,(-1,))
out=self.vegru_classifier(out)
out=out.view(1,1,-1)
out,hidden=self.gru(out,hidden)
return out,hidden
DecoderGRU
class DecoderRNN(nn.Module):
def __init__(self, hidden_size, output_size):
super(DecoderRNN, self).__init__()
self.hidden_size = hidden_size
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size)
self.out = nn.Linear(hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, input, hidden):
output = self.embedding(input).view(1, 1, -1)
output = F.relu(output)
output, hidden = self.gru(output, hidden)
output = self.softmax(self.out(output[0]))
return output, hidden
def initHidden(self):
return torch.rand(1, 1, self.hidden_size, device=device)
My dataset consists of videos stored in a folder and csv file containing names of video along with thier captions. I think I have loaded my data correctly and convert it into a video of frame size =8 and each caption is changed to a tensor of max_length =8 containing indices of each wod in caption.
You can look at my whole source code in this notebook file
There might be a simple logical error I just knew the theory behind this problem and so I coded it
I tried to implement an LSTM encoder which takes frames of video as input. These frames are first directly passed into vgg16 network which encodes it and encoded representation is passed into each frame and final output of encoder is passed as initial hidden state to decoder along with SOS_token as first input to decoder. I wanted this model to work and predict output

Loading PyTorch Lightning Trained checkpoint

I am using PyTorch Lightning version 1.4.0 and have defined the following class for the dataset:
class CustomTrainDataset(Dataset):
'''
Custom PyTorch Dataset for training
Args:
data (pd.DataFrame) - DF containing product info (and maybe also ratings)
all_itemIds (list) - Python3 list containing all Item IDs
'''
def __init__(self, data, all_orderIds):
self.users, self.items, self.labels = self.get_dataset(data, all_orderIds)
def __len__(self):
return len(self.users)
def __getitem__(self, idx):
return self.users[idx], self.items[idx], self.labels[idx]
def get_dataset(self, data, all_orderIds):
users, items, labels = [], [], []
user_item_set = set(zip(train_ratings['CustomerID'], train_ratings['ItemCode']))
num_negatives = 7
for u, i in user_item_set:
users.append(u)
items.append(i)
labels.append(1)
for _ in range(num_negatives):
negative_item = np.random.choice(all_itemIds)
while (u, negative_item) in user_item_set:
negative_item = np.random.choice(all_itemIds)
users.append(u)
items.append(negative_item)
labels.append(0)
return torch.tensor(users), torch.tensor(items), torch.tensor(labels)
followed by the PL class:
class NCF(pl.LightningModule):
'''
Neural Collaborative Filtering (NCF)
Args:
num_users (int): Number of unique users
num_items (int): Number of unique items
data (pd.DataFrame): Dataframe containing the food ratings for training
all_orderIds (list): List containing all orderIds (train + test)
'''
def __init__(self, num_users, num_items, data, all_itemIds):
# def __init__(self, num_users, num_items, ratings, all_movieIds):
super().__init__()
self.user_embedding = nn.Embedding(num_embeddings = num_users, embedding_dim = 8)
# self.user_embedding = nn.Embedding(num_embeddings = num_users, embedding_dim = 10)
self.item_embedding = nn.Embedding(num_embeddings = num_items, embedding_dim = 8)
# self.item_embedding = nn.Embedding(num_embeddings = num_items, embedding_dim = 10)
self.fc1 = nn.Linear(in_features = 16, out_features = 64)
# self.fc1 = nn.Linear(in_features = 20, out_features = 64)
self.fc2 = nn.Linear(in_features = 64, out_features = 64)
self.fc3 = nn.Linear(in_features = 64, out_features = 32)
self.output = nn.Linear(in_features = 32, out_features = 1)
self.data = data
# self.ratings = ratings
# self.all_movieIds = all_movieIds
self.all_orderIds = all_orderIds
def forward(self, user_input, item_input):
# Pass through embedding layers
user_embedded = self.user_embedding(user_input)
item_embedded = self.item_embedding(item_input)
# Concat the two embedding layers
vector = torch.cat([user_embedded, item_embedded], dim = -1)
# Pass through dense layer
vector = nn.ReLU()(self.fc1(vector))
vector = nn.ReLU()(self.fc2(vector))
vector = nn.ReLU()(self.fc3(vector))
# Output layer
pred = nn.Sigmoid()(self.output(vector))
return pred
def training_step(self, batch, batch_idx):
user_input, item_input, labels = batch
predicted_labels = self(user_input, item_input)
loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
return loss
def configure_optimizers(self):
return torch.optim.Adam(self.parameters())
def train_dataloader(self):
return DataLoader(
ChupsTrainDataset(
self.data, self.all_orderIds
),
batch_size = 32, num_workers = 2
# Google Colab's suggested max number of worker in current
# system is 2 and not 4.
)
print(f"num_users = {num_users}, num_items = {num_items} & all_itemIds = {len(all_itemIds)}")
# num_users = 12958, num_items = 511238 & all_itemIds = 9114
# Initialize NCF model-
model = NCF(num_users, num_items, train_ratings, all_itemIds)
trainer = pl.Trainer(
max_epochs = 75, gpus = 1,
# max_epochs = 5,
reload_dataloaders_every_n_epochs = True,
# reload_dataloaders_every_epoch = True, # deprecated!
progress_bar_refresh_rate = 50,
logger = False, checkpoint_callback = False)
trainer.fit(model)
# Save trained model as a checkpoint-
trainer.save_checkpoint("NCF_Trained.ckpt")
To load the saved checkpoint, I have tried:
trained_model = NCF.load_from_checkpoint(
"NCF_Trained.ckpt", num_users = num_users,
num_items = train_ratings, data = train_ratings,
all_itemIds = all_itemIds)
trained_model = NCF(num_users, num_items, train_ratings, all_orderIds).load_from_checkpoint(checkpoint_path = "NCF_Trained.ckpt")
But these don't seem to work. How do I load this saved checkpoint?
Thanks!
add a line in your init method:
self.save_hyperparameters(logger=False)
Then call
trained_model = NCF.load_from_checkpoint("NCF_Trained.ckpt")
As shown in here, load_from_checkpoint is a primary way to load weights in pytorch-lightning and it automatically load hyperparameter used in training. So you do not need to pass params except for overwriting existing ones. My suggestion is to try trained_model = NCF.load_from_checkpoint("NCF_Trained.ckpt")
In my case it was crucial to set the model into the evaluation mode via model.eval(). Otherwise it would produce wrong results.

GPU memory increasing at each batch (PyTorch)

I am trying to build a convolutionnal network using ConvLSTM layer (LSTM cell but with convolutions instead of matrix multiplications), but the problem is that my GPU memory increases at each batch, even if I'm deleting variables, and getting the true value for the loss (and not the graph) for each iteration. I may be doing something wrong but that exact same script ran without issues with another model (with more parameters and also using ConvLSTM layer).
Each batch is composed of num_batch x 3 images (grayscale) and I'm trying to predict the difference |Im(t+1)-Im(t)| with the input Im(t)
def main():
config = Config()
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, num_workers=0, shuffle=True, drop_last=True)
nb_img = len(train_dataset)
util.clear_progress_dir()
step_tensorboard = 0
###################################
# Model Setup #
###################################
model = fully_convLSTM()
if torch.cuda.is_available():
model = model.float().cuda()
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
util.enumerate_params([model])
###################################
# Training Loop #
###################################
model.train() #Put model in training mode
train_loss_recon = []
train_loss_recon2 = []
for epoch in tqdm(range(config.num_epochs)):
running_loss1 = 0.0
running_loss2 = 0.0
for i, (inputs, outputs) in enumerate(train_dataloader, 0):
print(i)
torch.cuda.empty_cache()
gc.collect()
# if torch.cuda.is_available():
inputs = autograd.Variable(inputs.float()).cuda()
outputs = autograd.Variable(outputs.float()).cuda()
im1 = inputs[:,0,:,:,:]
im2 = inputs[:,1,:,:,:]
im3 = inputs[:,2,:,:,:]
diff1 = torch.abs(im2 - im1).cuda().float()
diff2 = torch.abs(im3 - im2).cuda().float()
model.initialize_hidden()
optimizer.zero_grad()
pred1 = model.forward(im1)
loss = reconstruction_loss(diff1, pred1)
loss.backward()
# optimizer.step()
model.update_hidden()
optimizer.zero_grad()
pred2 = model.forward(im2)
loss2 = reconstruction_loss(diff2, pred2)
loss2.backward()
optimizer.step()
model.update_hidden()
## print statistics
running_loss1 += loss.detach().data
running_loss2 += loss2.detach().data
if i==0:
with torch.no_grad():
img_grid_diff_true = (diff2).cpu()
img_grid_diff_pred = (pred2).cpu()
f, axes = plt.subplots(2, 4, figsize=(48,48))
for l in range(4):
axes[0, l].imshow(img_grid_diff_true[l].squeeze(0).squeeze(0), cmap='gray')
axes[1, l].imshow(img_grid_diff_pred[l].squeeze(0).squeeze(0), cmap='gray')
plt.show()
plt.close()
writer_recon_loss.add_scalar('Reconstruction loss', running_loss1, step_tensorboard)
writer_recon_loss2.add_scalar('Reconstruction loss2', running_loss2, step_tensorboard)
step_tensorboard += 1
del pred1
del pred2
del im1
del im2
del im3
del diff1
del diff2#, im1_noised, im2_noised
del inputs
del outputs
del loss
del loss2
for obj in gc.get_objects():
if torch.is_tensor(obj) :
del obj
torch.cuda.empty_cache()
gc.collect()
epoch_loss = running_loss1 / len(train_dataloader.dataset)
epoch_loss2 = running_loss2/ len(train_dataloader.dataset)
print(f"Epoch {epoch} loss reconstruction1: {epoch_loss:.6f}")
print(f"Epoch {epoch} loss reconstruction2: {epoch_loss2:.6f}")
train_loss_recon.append(epoch_loss)
train_loss_recon2.append(epoch_loss2)
del running_loss1, running_loss2, epoch_loss, epoch_loss2
Here is the model used :
class ConvLSTMCell(nn.Module):
def __init__(self, input_channels, hidden_channels, kernel_size):
super(ConvLSTMCell, self).__init__()
# assert hidden_channels % 2 == 0
self.input_channels = input_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
# self.num_features = 4
self.padding = 1
self.Wxi = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whi = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxf = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whf = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxc = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Whc = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wxo = nn.Conv2d(self.input_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=True)
self.Who = nn.Conv2d(self.hidden_channels, self.hidden_channels, self.kernel_size, 1, self.padding, bias=False)
self.Wci = None
self.Wcf = None
self.Wco = None
def forward(self, x, h, c): ## Equation (3) dans Convolutional LSTM Network: A Machine Learning Approach for Precipitation Nowcasting
ci = torch.sigmoid(self.Wxi(x) + self.Whi(h) + c * self.Wci)
cf = torch.sigmoid(self.Wxf(x) + self.Whf(h) + c * self.Wcf)
cc = cf * c + ci * torch.tanh(self.Wxc(x) + self.Whc(h)) ###gt= tanh(cc)
co = torch.sigmoid(self.Wxo(x) + self.Who(h) + cc * self.Wco) ##channel out = hidden channel
ch = co * torch.tanh(cc)
return ch, cc #short memory, long memory
def init_hidden(self, batch_size, hidden, shape):
if self.Wci is None:
self.Wci = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
self.Wcf = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
self.Wco = nn.Parameter(torch.zeros(1, hidden, shape[0], shape[1])).cuda()
else:
assert shape[0] == self.Wci.size()[2], 'Input Height Mismatched!'
assert shape[1] == self.Wci.size()[3], 'Input Width Mismatched!'
return (autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda(),
autograd.Variable(torch.zeros(batch_size, hidden, shape[0], shape[1])).cuda())
class fully_convLSTM(nn.Module):
def __init__(self):
super(fully_convLSTM, self).__init__()
layers = []
self.hidden_list = [1,32,32,1]#,32,64,32,
for k in range(len(self.hidden_list)-1): # Define blocks of [ConvLSTM,BatchNorm,Relu]
name_conv = "self.convLSTM" +str(k)
cell_conv = ConvLSTMCell(self.hidden_list[k],self.hidden_list[k+1],3)
setattr(self, name_conv, cell_conv)
name_batchnorm = "self.batchnorm"+str(k)
batchnorm=nn.BatchNorm2d(self.hidden_list[k+1])
setattr(self, name_batchnorm, batchnorm)
name_relu =" self.relu"+str(k)
relu=nn.ReLU()
setattr(self, name_relu, relu)
self.sigmoid = nn.Sigmoid()
self.internal_state=[]
def initialize_hidden(self):
for k in range(len(self.hidden_list)-1):
name_conv = "self.convLSTM" +str(k)
(h,c) = getattr(self,name_conv).init_hidden(config.batch_size, self.hidden_list[k+1],(256,256))
self.internal_state.append((h,c))
self.internal_state_new=[]
def update_hidden(self):
for i, hidden in enumerate(self.internal_state_new):
self.internal_state[i] = (hidden[0].detach(), hidden[1].detach())
self.internal_state_new = []
def forward(self, input):
x = input
for k in range(len(self.hidden_list)-1):
name_conv = "self.convLSTM" +str(k)
name_batchnorm = "self.batchnorm"+str(k)
name_relu =" self.relu"+str(k)
x, c = getattr(self,name_conv)(x, self.internal_state[k][1], self.internal_state[k][0])
self.internal_state_new.append((x.detach(),c.detach()))
x = getattr(self,name_batchnorm)(x)
if k!= len(self.hidden_list)-2:
x = getattr(self,name_relu)(x)
else :
x = self.sigmoid(x)
return x
So my question is, what in my code is causing memory to accumulate during the training phase?
A few quick notes about training code:
torch.Variable is deprecated since at least 8 minor versions (see here), don't use it
gc.collect() has no point, PyTorch does the garbage collector on it's own
Don't use torch.cuda.empty_cache() for each batch, as PyTorch reserves some GPU memory (doesn't give it back to OS) so it doesn't have to allocate it for each batch once again. It will make your code slow, don't use this function at all tbh, PyTorch handles this.
Don't spam random memory cleaning, that's most probably not where the error is
Model
Yes, this is probably the case (although it's hard to read this model's code).
Take notice of self.internal_state list and self.internal_state_new list also.
Each time you call model.initialize_hidden() a new set of tensor is added to this list (and never cleaned as far as I can tell)
self.internal_state_new seems to be cleaned in update_hidden, maybe self.internal_state should be also?
In essence, check out this self.internal_state property of your model, the list grows indefinitely from what I see. Initializing with zeros everywhere is quite strange, there is probably no need to do that (e.g. PyTorch's RNN is initialized with zeros by default, this is probably similar).

Pytorch transformer forward function masks implementation for decoder forward function

I am trying to use and learn PyTorch Transformer with DeepMind math dataset. I have tokenized (char not word) sequence that is fed into model. Models forward function is doing once forward for encoder and multiple forwards for decoder (till all batch outputs reach token, this is still TODO).
I am struggling with Transformer masks and decoder forward as it throws the error:
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
RuntimeError: shape '[-1, 24, 64]' is invalid for input of size 819200.
Source is N = 32, S = 50, E = 512. Target is N = 32, S = 3, E = 512.
It is possible that I have wrong implementation of masks or that source and target lengths are different, not realy sure.
class PositionalEncoding(nn.Module):
# function to positionally encode src and target sequencies
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:x.size(0), :]
return self.dropout(x)
class MyTransformerModel(nn.Module):
# should implement init and forward function
# define separate functions for masks
# define forward function with
# implement:
# embedding layer
# positional encoding
# encoder layer
# decoder layer
# final classification layer
# encoder -> forward once
# decoder -> forward multiple times (for one encoder forward)
# decoder output => concatenate to input e.g. decoder_input = torch.cat([decoder_input], [decoder_output])
# early stopping => all in batch reach <eos> token
def __init__(self, vocab_length = 30, sequence_length = 512, num_encoder_layers = 3, num_decoder_layers = 2, num_hidden_dimension = 256, feed_forward_dimensions = 1024, attention_heads = 8, dropout = 0.1, pad_idx = 3, device = "CPU", batch_size = 32):
super(MyTransformerModel, self).__init__()
self.src_embedding = nn.Embedding(vocab_length, sequence_length)
self.pos_encoder = PositionalEncoding(sequence_length, dropout)
self.src_mask = None # attention mask
self.memory_mask = None # attention mask
self.pad_idx = pad_idx
self.device = device
self.batch_size = batch_size
self.transformer = nn.Transformer(
sequence_length,
attention_heads,
num_encoder_layers,
num_decoder_layers,
feed_forward_dimensions,
dropout,
)
def src_att_mask(self, src_len):
mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def no_peak_att_mask(self, batch_size, src_len, time_step):
mask = np.zeros((batch_size, src_len), dtype=bool)
mask[:, time_step: ] = 1 # np.NINF
mask = torch.from_numpy(mask)
return mask
def make_src_key_padding_mask(self, src):
# mask "<pad>"
src_mask = src.transpose(0, 1) == self.pad_idx
return src_mask.to(self.device)
def make_trg_key_padding_mask(self, trg):
tgt_mask = trg.transpose(0, 1) == self.pad_idx
return tgt_mask.to(self.device)
def forward(self, src, trg):
src_seq_length, N = src.shape
trg_seq_length, N = trg.shape
embed_src = self.src_embedding(src)
position_embed_src = self.pos_encoder(embed_src)
embed_trg = self.src_embedding(trg)
position_embed_trg = self.pos_encoder(embed_trg)
src_padding_mask = self.make_src_key_padding_mask(src)
trg_padding_mask = self.make_trg_key_padding_mask(trg)
trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
time_step = 1
att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask = src_padding_mask)
# TODO : implement loop for transformer decoder forward fn, implement early stopping
# where to feed decoder_output?
decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
return decoder_output
Can anyone pin point where I have made a mistake?
It looks like I have messed dimensions order (as Transformer does not have batch first option). Corrected code is below:
class MyTransformerModel(nn.Module):
def __init__(self, d_model = 512, vocab_length = 30, sequence_length = 512, num_encoder_layers = 3, num_decoder_layers = 2, num_hidden_dimension = 256, feed_forward_dimensions = 1024, attention_heads = 8, dropout = 0.1, pad_idx = 3, device = "CPU", batch_size = 32):
#, ninp, device, nhead=8, nhid=2048, nlayers=2, dropout=0.1, src_pad_idx = 1, max_len=5000, forward_expansion= 4):
super(MyTransformerModel, self).__init__()
self.src_embedding = nn.Embedding(vocab_length, d_model)
self.pos_encoder = PositionalEncoding(d_model, dropout)
self.vocab_length = vocab_length
self.d_model = d_model
self.src_mask = None # attention mask
self.memory_mask = None # attention mask
self.pad_idx = pad_idx
self.device = device
self.batch_size = batch_size
self.transformer = nn.Transformer(
d_model,
attention_heads,
num_encoder_layers,
num_decoder_layers,
feed_forward_dimensions,
dropout,
)
self.fc = nn.Linear(d_model, vocab_length)
# self.init_weights() <= used in tutorial
def src_att_mask(self, src_len):
mask = (torch.triu(torch.ones(src_len, src_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def no_peak_att_mask(self, batch_size, src_len, time_step):
mask = np.zeros((batch_size, src_len), dtype=bool)
mask[:, time_step: ] = 1 # np.NINF
mask = torch.from_numpy(mask)
# mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def make_src_key_padding_mask(self, src):
# mask "<pad>"
src_mask = src.transpose(0, 1) == self.pad_idx
# src_mask = src == self.pad_idx
# (N, src_len)
return src_mask.to(self.device)
def make_trg_key_padding_mask(self, trg):
# same as above -> expected tgt_key_padding_mask: (N, T)
tgt_mask = trg.transpose(0, 1) == self.pad_idx
# tgt_mask = trg == self.pad_idx
# (N, src_len)
return tgt_mask.to(self.device)
def init_weights(self):
initrange = 0.1
nn.init.uniform_(self.encoder.weight, -initrange, initrange)
nn.init.zeros_(self.decoder.weight)
nn.init.uniform_(self.decoder.weight, -initrange, initrange)
def forward(self, src, trg):
N, src_seq_length = src.shape
N, trg_seq_length = trg.shape
# S - source sequence length
# T - target sequence length
# N - batch size
# E - feature number
# src: (S, N, E) (sourceLen, batch, features)
# tgt: (T, N, E)
# src_mask: (S, S)
# tgt_mask: (T, T)
# memory_mask: (T, S)
# src_key_padding_mask: (N, S)
# tgt_key_padding_mask: (N, T)
# memory_key_padding_mask: (N, S)
src = rearrange(src, 'n s -> s n')
trg = rearrange(trg, 'n t -> t n')
print("src shape {}".format(src.shape))
print(src)
print("trg shape {}".format(trg.shape))
print(trg)
embed_src = self.src_embedding(src)
print("embed_src shape {}".format(embed_src.shape))
print(embed_src)
position_embed_src = self.pos_encoder(embed_src)
print("position_embed_src shape {}".format(position_embed_src.shape))
print(position_embed_src)
embed_trg = self.src_embedding(trg)
print("embed_trg shape {}".format(embed_trg.shape))
print(embed_trg)
position_embed_trg = self.pos_encoder(embed_trg)
# position_embed_trg = position_embed_trg.transpose(0, 1)
print("position_embed_trg shape {}".format(position_embed_trg.shape))
print(position_embed_trg)
src_padding_mask = self.make_src_key_padding_mask(src)
print("KEY - src_padding_mask shape {}".format(src_padding_mask.shape))
print("should be of shape: src_key_padding_mask: (N, S)")
print(src_padding_mask)
trg_padding_mask = self.make_trg_key_padding_mask(trg)
print("KEY - trg_padding_mask shape {}".format(trg_padding_mask.shape))
print("should be of shape: trg_key_padding_mask: (N, T)")
print(trg_padding_mask)
trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)
print("trg_mask shape {}".format(trg_mask.shape))
print("trg_mask should be of shape tgt_mask: (T, T)")
print(trg_mask)
# att_mask = self.src_att_mask(trg_seq_length).to(self.device)
time_step = 1
# error => memory_mask: expected shape! (T, S) !!! this is not a key_padding_mask!
# att_mask = self.no_peak_att_mask(self.batch_size, src_seq_length, time_step).to(self.device)
# print("att_mask shape {}".format(att_mask.shape))
# print("att_mask should be of shape memory_mask: (T, S)")
# print(att_mask)
att_mask = None
# get encoder output
# forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None)
# forward encoder just once for a batch
# attention forward of encoder expects => src, src_mask, src_key_padding_mask +++ possible positional encoding error !!!
encoder_output = self.transformer.encoder.forward(position_embed_src, src_key_padding_mask = src_padding_mask)
print("encoder_output")
print("encoder_output shape {}".format(encoder_output.shape))
print(encoder_output)
# forward decoder till all in batch did not reach <eos>?
# def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None,
# memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None,
# memory_key_padding_mask: Optional[Tensor] = None)
# first forward
decoder_output = self.transformer.decoder.forward(position_embed_trg, encoder_output, trg_mask, att_mask, trg_padding_mask, src_padding_mask)
# TODO: target in => target out shifted by one, loop till all in batch meet stopping criteria || max len is reached
#
print("decoder_output")
print("decoder_output shape {}".format(decoder_output.shape))
print(decoder_output)
output = rearrange(decoder_output, 't n e -> n t e')
output = self.fc(output)
print("output")
print("output shape {}".format(output.shape))
print(output)
predicted = F.log_softmax(output, dim=-1)
print("predicted")
print("predicted shape {}".format(predicted.shape))
print(predicted)
# top k
top_value, top_index = torch.topk(predicted, k=1)
top_index = torch.squeeze(top_index)
print("top_index")
print("top_index shape {}".format(top_index.shape))
print(top_index)
print("top_value")
print("top_value shape {}".format(top_value.shape))
print(top_value)
return top_index

Gradient is equal to 'None'

I have two networks. The output of the first network is the input to the other. In order to calculate the loss for the second network, I use vanilla policy gradient. I want to backpropagate this loss into the first network. After checking if the gradeints has changed, I see that they are all none.
I first load the first network (a pre-trained autoencoer in my network this way):
def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
model.load_state_dict(checkpoint['state_dict'])
for parameter in model.parameters():
parameter.requires_grad = True
model.train()
return model
Then I define the optimizers for both networks this way:
class MultipleOptimizer(object):
def __init__(self, *op):
self.optimizers = op
def zero_grad(self):
for op in self.optimizers:
op.zero_grad()
def step(self):
for op in self.optimizers:
op.step()
opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr))
the reward function is:
#Reward function
def reward(x, act):
#print('action', act)
#print('x type', type(x))
km = KMeans(act, n_init=20, n_jobs=4)
y_pred = km.fit_predict(x.detach().cpu().numpy())# seems we can only get a centre from batch
#print('k-means output type', type(y_pred))
sil_score = sil(x.detach().cpu().numpy(), y_pred)
#print('sil score', sil_score)
return sil_score
The architecture of the second neural net and an alternative to avoid (logits=logits.mean(0)):
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network. outputs are the logits
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
class mlp2(torch.nn.Module):
def __init__(self):
super(mlp2, self).__init__()
self.linear1 = nn.Linear(10,100)
self.relu1 = nn.ReLU(inplace=True)
self.linear2 = torch.nn.Linear(100,100)
self.linear3 = torch.nn.Linear(100,20)
self.linear4 = torch.nn.Linear(2000,100)
self.ident = nn.Identity()
def forward(self, x):
a = self.linear1(x)
a = self.relu1(a)
a = self.linear2(a)
a = self.relu1(a)
a = self.linear3(a)
a = torch.flatten(a)
a = self.linear4(a)
a = self.relu1(a)
a = self.linear3(a)
out = self.ident(a)
return out
Loss is calculated as in the following order:
def get_policy(obs):
logits = logits_net(obs)
return Categorical(logits=logits.mean(0))
def get_action(obs):
return get_policy(obs).sample().item()
def Logp(obs, act):
logp = get_policy(obs).log_prob(act.cuda())
return logp
def compute_loss(logp, weights):
return -(logp * weights).mean()
def train_one_epoch():
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for R(tau) weighting in policy gradient
batch_logp = []
# reset episode-specific variables
j = 1 # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep
for i, data in enumerate(train_loader):
#Create the mean image out of those 100 images
x, label = data
x = model(x.cuda())#torch.Size([100, 10])
obs = x.data.cpu().numpy()#[100, 10] - a trajectory with only one state
# Save obs
batch_obs.append(obs.copy())
#act in the environment
#act = get_action(torch.as_tensor(obs, dtype=torch.float32))
act = get_action(x)
print('action type', type(act))
#log probability
#logp = Logp(torch.as_tensor(obs, dtype=torch.float32),act = torch.as_tensor(act, dtype=torch.int32))
logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32))
#rew = reward(obs, act+2)
rew = reward(x, act+2)
# save action, reward
batch_acts.append(act)
batch_weights.append(rew)#episode rewards
batch_logp.append(logp)
opt.zero_grad()
batch_logp = torch.stack(batch_logp, dim=0)
batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32),
weights = torch.as_tensor(batch_weights, dtype=torch.float32))
batch_loss.backward() #does it return anything? gradients? print them!
opt.step()
for name, param in logits_net.named_parameters():
print(name, param.grad)
I applied some changes with the assumption that maybe recreating some of the tensors maybe the issue:
I have the output of the first network, obs, converted like obs = x.data.cpu().numpy() this and then sent to get_action function: act = get_action(torch.as_tensor(obs, dtype=torch.float32)). I changes this to act = get_action(x) so, x is sent directly to this function. Also, change arguments of logp to logp = Logp(x, act = torch.as_tensor(act, dtype=torch.int32)).
After these changes, I still get the none value for the gradient. Is there anyway possible to backpropagate the gradient when loss is calculated this way? any changes that I can apply?
any help is appreciated.

Resources